-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest.py
More file actions
94 lines (68 loc) · 3.03 KB
/
test.py
File metadata and controls
94 lines (68 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/python
"""Test script for the web crawler: finds products, and their ratings, that are developed for and targeted at seniors.
"""
import unicodecsv as csv
from web_crawler import WebCrawler
__author__ = "Disaiah Bennett"
__version__ = "0.1"
def _collect_categories(crawler, soup):
    """Record sidebar category links and names on the crawler.

    Appends the ``href`` of every ``a.SideBarMenu-toggle`` found under each
    ``li.SideBarMenuModuleItem`` to ``crawler.catlinks``, and appends the text
    before the first "-" of each child of that ``li`` to
    ``crawler.categories`` when that text is at most 30 characters long.

    Args:
        crawler: Object exposing list attributes ``catlinks`` and ``categories``.
        soup: Parsed landing page, as returned by ``crawler.data_extract()``.
    """
    for entry in soup.find_all("li", {"class": "SideBarMenuModuleItem"}):
        for anchor in entry.find_all("a", {"class": "SideBarMenu-toggle"}):
            crawler.catlinks.append(anchor.get('href'))

        # NOTE(review): the original also looked up span.SideBarMenu-title
        # here but never used the result (the name was rebound by this loop),
        # so the dead lookup was dropped. The loop walks the direct children
        # of the <li>, as before - confirm that this, rather than the title
        # spans, is what was intended.
        for child in entry:
            head, _, _ = child.text.partition("-")
            # Longer strings are skipped; presumably they are not category
            # names - TODO confirm the reason for the 30-character cut-off.
            if len(head) <= 30:
                crawler.categories.append(head)


def _write_category_csv(crawler, category_name):
    """Write one CSV file listing the products of the crawler's current URL.

    The file is named ``<category_name>.csv``. Each row holds a 1-based
    counter, the product title, price, rating ("x.y/5.0"), absolute product
    link and a "blank" placeholder for the top review.

    Args:
        crawler: Crawler whose ``url`` already points at the category page.
        category_name: Name used for the file name and the header row.
    """
    # Binary mode is kept from the original; presumably unicodecsv encodes
    # the rows itself. The ``with`` block guarantees the handle is closed,
    # which the original never did.
    with open(category_name + ".csv", "wb") as handle:
        writer = csv.writer(handle)
        writer.writerow(["#", category_name, "Price", "Rating", "Link", "Top Review"])

        soup = crawler.data_extract()
        prods = soup.find_all("a", {"class": "product-title-link line-clamp line-clamp-2"})
        ratings = soup.find_all("span", {"class": "seo-avg-rating"})
        prices = soup.find_all("div", {"class": "price-main-block"})
        image_blocks = soup.find_all(
            "div",
            {"class": ["search-result-productimage gridview",
                       "search-result-productimage listview"]},
        )
        product_links = [
            anchor.get("href")
            for block in image_blocks
            for anchor in block.find_all("a", {"class": "display-block"})
        ]

        # zip() stops at the shortest sequence, so products lacking a rating,
        # price or link are left out - the same rows the original produced by
        # swallowing IndexError on every lookup.
        rows = zip(prods, ratings, prices, product_links)
        for number, (prod, rating, price, href) in enumerate(rows, start=1):
            score = round(float(rating.text), 1)
            writer.writerow([
                number,
                prod.text,
                price.text,
                str(score) + "/5.0",
                "https://www.walmart.com" + href,
                "blank",
            ])


def main():
    """Crawl Walmart's home-health-care department into per-category CSV files.

    Reads the category sidebar of the department landing page, then visits
    each category link and writes ``<category>.csv`` in the current working
    directory. ``crawler.cleanup()`` is always called, even when a step fails.
    """
    crawler = WebCrawler()
    try:
        # Set the url of the crawler
        crawler.url = "https://www.walmart.com/cp/home-health-care/1005860"
        _collect_categories(crawler, crawler.data_extract())

        for i, category_name in enumerate(crawler.categories):
            # Categories and links are paired by position.
            crawler.url = crawler.catlinks[i]
            print("Current Category", category_name, "URL: ", crawler.catlinks[i])

            _write_category_csv(crawler, category_name)

            # crawler.count is kept in step with the loop exactly as before;
            # presumably it starts at 0 - verify against WebCrawler.
            print(crawler.categories[crawler.count] + ".csv file created.\n")
            crawler.count += 1
    finally:
        crawler.cleanup()
# Run the crawl only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()