# stack_scraper.py
from utils.database_handler import Json_Handler, Database_Handler
import requests
from bs4 import BeautifulSoup
from utils.pprints import Pprints
import re
class StackOverFlow(object):
    """Scrape python-tagged Stack Overflow questions together with one answer each.

    The scraper starts on a listing page derived from the stored progress
    index, keeps the questions that pass the vote/answer filter, stores the
    extracted question and answer text through ``database_handler``, and then
    follows the pager's ``rel="prev"`` link until no such link is found.

    Collaborators (only the members used in this class are listed):

    * ``database_handler`` -- ``insert_into_database(title=, code=, source=)``
    * ``json_handler`` -- ``load_scraped_pages_index()``,
      ``save_scraped_pages_index(n)``, ``is_exist(question_id=)``,
      ``add_question_id(question_id=)``
    * ``pretty_prints`` -- ``pretty_prints(status=, log=)``
    """

    # Listing page used only to read the total number of questions.
    base_url = "https://stackoverflow.com/questions/tagged/python?tab=newest&page=1&pagesize=50"

    def __init__(self,
                 database_handler: "Database_Handler | None" = None,
                 json_handler: "Json_Handler | None" = None,
                 pretty_prints: "Pprints | None" = None,
                 page_size: int = 50
                 ):
        """Store the collaborators, creating default ones where none is given.

        Args:
            database_handler: Sink for scraped question/answer pairs.
            json_handler: Store for scraped question ids and page progress.
            pretty_prints: Status printer.
            page_size: Number of questions per listing page.

        Raises:
            ValueError: If ``page_size`` is not positive.
        """
        if page_size <= 0:
            raise ValueError("page_size must be a positive integer")
        # Defaults are built here rather than in the signature: a call in the
        # signature runs once at definition time and is shared by all instances.
        self.json_handler: "Json_Handler" = (
            json_handler if json_handler is not None else Json_Handler()
        )
        self.database_handler: "Database_Handler" = (
            database_handler if database_handler is not None else Database_Handler()
        )
        self.pretty_prints: "Pprints" = (
            pretty_prints if pretty_prints is not None else Pprints()
        )
        self.page_size: int = page_size

    def soup_maker(self, url, timeout: float = 30.0) -> "BeautifulSoup":
        """Download ``url`` and return it parsed with the ``lxml`` parser.

        Args:
            url: Page to download.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-success HTTP status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail here with the HTTP status instead of later on a missing element.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'lxml')

    @staticmethod
    def _page_count(total_questions: int, page_size: int) -> int:
        """Return how many pages are needed for ``total_questions`` (at least 1)."""
        # Ceiling division: a partially filled final page still counts.
        return max(1, -(-total_questions // page_size))

    def _listing_url(self, page) -> str:
        """Return the newest-first listing URL for ``page`` at the configured page size."""
        return (
            "https://stackoverflow.com/questions/tagged/python"
            f"?tab=newest&page={page}&pagesize={self.page_size}"
        )

    def last_index_finder(self):
        """Return the number of the last listing page.

        The total question count is read from the page at ``base_url`` and
        divided by ``page_size``, rounding up.

        Raises:
            RuntimeError: If the question counter cannot be located or parsed.
        """
        soup = self.soup_maker(self.base_url)
        container = soup.find('div', {'data-controller': 'se-uql'})
        counter = None
        if container is not None:
            counter = container.find('div', {
                'class': 'fs-body3 flex--item fl1 mr12 sm:mr0 sm:mb12'})
        if counter is None:
            raise RuntimeError("Could not locate the total question counter")
        number = re.search(r'[\d,]+', counter.text.strip())
        if number is None:
            raise RuntimeError("Could not parse the total question counter")
        # Thousands separators are removed before converting to int.
        total_questions = int(number.group().replace(",", ""))
        return self._page_count(total_questions, self.page_size)

    def __pretty_prints_override(self, status: str, log: bool = False):
        """Forward ``status`` and ``log`` to the configured printer."""
        self.pretty_prints.pretty_prints(status=status, log=log)

    def __url_to_search(self, current_last_index):
        """Return the listing URL where scraping should start or resume.

        With no stored progress (``0`` or ``None``) the last page is used;
        otherwise the stored number of scraped pages is subtracted from
        ``current_last_index``. The result is never lower than page 1.
        """
        pages_scraped: "int | None" = self.json_handler.load_scraped_pages_index()
        if pages_scraped:
            page = round(current_last_index - pages_scraped)
        else:
            page = current_last_index
        return self._listing_url(max(1, page))

    @staticmethod
    def _stat_value(stat_tags, index) -> int:
        """Return the integer text of ``stat_tags[index]``, or 0 if missing or unparsable."""
        try:
            return int(stat_tags[index].text)
        except (IndexError, ValueError):
            return 0

    def __valid_questions_links(self, soup: "BeautifulSoup") -> list[str]:
        """Return links of not-yet-recorded questions worth scraping on a listing page.

        A question qualifies when its first two stat numbers (treated as votes
        and answers) are both positive, or when its summary contains the
        checkmark icon. Each returned question id is recorded through
        ``json_handler`` so it is not returned again.
        """
        links: list[str] = []
        for summary in soup.find_all('div', {'class': 's-post-summary'}):
            stats = summary.find_all('span', {'class': 's-post-summary--stats-item-number'})
            votes = self._stat_value(stats, 0)
            answers = self._stat_value(stats, 1)
            has_checkmark = summary.find('svg', {'class': 'svg-icon iconCheckmarkSm'}) is not None
            if not ((votes > 0 and answers > 0) or has_checkmark):
                continue
            try:
                question_id = int(summary.get('data-post-id'))
            except (TypeError, ValueError):
                continue
            if self.json_handler.is_exist(question_id=question_id):
                continue
            anchor = summary.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            self.json_handler.add_question_id(question_id=question_id)
            # href is site-relative; strip its leading slash to avoid "//".
            links.append(f"https://stackoverflow.com/{href.lstrip('/')}")
        return links

    @staticmethod
    def _post_body_to_text(body) -> str:
        """Flatten the direct children of a post body into plain text.

        ``<pre><code>`` blocks are wrapped in ``<code>...</code>`` markers;
        ``<p>`` and ``<ul>`` children contribute their text. Every kept piece
        ends with a newline; other children are ignored.
        """
        parts: list[str] = []
        for element in body.children:
            if element.name == 'pre':
                code_tag = element.find('code')
                if code_tag is not None:
                    parts.append(f"<code>\n{code_tag.get_text()}</code>\n")
            elif element.name in ('p', 'ul'):
                parts.append(element.get_text() + '\n')
        return "".join(parts)

    @staticmethod
    def _select_answer(answer_divs):
        """Return the accepted answer if one is reached, else the highest-scored one.

        Answers with a negative score are never selected; ``None`` is returned
        when nothing is eligible.
        """
        best = None
        best_score = -1
        for div in answer_divs:
            if 'js-accepted-answer' in div['class']:
                return div
            score = int(div['data-score'])
            if score > best_score:
                best_score = score
                best = div
        return best

    def __collect_question_and_answer(self, link):
        """Scrape one question page and store its question and chosen answer.

        Raises:
            ValueError: If the page has no question body, no eligible answer,
                or the chosen answer has no body.
        """
        self.__pretty_prints_override(status=f"Searched Link: {link}")
        soup = self.soup_maker(url=link)

        question_body = soup.find('div', {'class': 's-prose js-post-body'})
        if question_body is None:
            raise ValueError(f"no question body found at {link}")
        question = self._post_body_to_text(question_body)

        chosen = self._select_answer(soup.find_all('div', {'class': 'answer'}))
        if chosen is None:
            raise ValueError(f"no eligible answer found at {link}")
        answer_body = chosen.find('div', {'class': 's-prose js-post-body'})
        if answer_body is None:
            raise ValueError(f"selected answer has no body at {link}")
        answer = self._post_body_to_text(answer_body)

        self.__pretty_prints_override(status="Saving data into Database")
        self.database_handler.insert_into_database(title=question, code=answer, source=link)

    def __switch_to_next_page(self, url, current_last_index, soup=None):
        """Record progress and return the URL behind the pager's ``rel="prev"`` link.

        Args:
            url: Listing page that was just processed.
            current_last_index: Number of the last listing page.
            soup: Already parsed copy of ``url``; when omitted the page is
                downloaded again.

        Returns:
            The next URL to scrape, or ``None`` when the pager or its previous
            link is missing.
        """
        if soup is None:
            soup = self.soup_maker(url=url)
        pagination_bar = soup.find('div', {'class': 's-pagination site1 themed pager float-left'})
        if pagination_bar is None:
            self.__pretty_prints_override(status=f"No pagination bar found on {url}; stopping")
            return None
        prev_page_tag = pagination_bar.find('a', {'rel': 'prev'})
        if prev_page_tag is None:
            return None
        selected = pagination_bar.find('div', {'class': 's-pagination--item is-selected'})
        try:
            current_page = int(selected.text)
        except (AttributeError, ValueError):
            current_page = None
        if current_page is not None:
            # Progress is stored as the distance from the last page.
            self.json_handler.save_scraped_pages_index(current_last_index - current_page)
        return f"https://stackoverflow.com{prev_page_tag.get('href')}&pagesize={self.page_size}"

    def scraper(self):
        """Run the scrape loop until no further listing page is available.

        A question page that fails to download or parse is reported through
        the printer and skipped; failures on listing pages propagate.
        """
        current_last_index = self.last_index_finder()
        url = self.__url_to_search(current_last_index)
        while url is not None:
            soup: "BeautifulSoup" = self.soup_maker(url)
            questions_link: list[str] = self.__valid_questions_links(soup)
            for link in questions_link:
                try:
                    self.__collect_question_and_answer(link)
                except Exception as error:
                    # Best effort per question: report and move on.
                    self.__pretty_prints_override(status=f"Skipping {link}: {error}")
            # Reuse the parsed page instead of downloading it a second time.
            url = self.__switch_to_next_page(url, current_last_index, soup=soup)
# Script entry point: build a scraper with its default collaborators and run it.
if __name__ == "__main__":
    StackOverFlow().scraper()