-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrap.py
More file actions
138 lines (121 loc) · 5.61 KB
/
scrap.py
File metadata and controls
138 lines (121 loc) · 5.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
@RAPHABIZ
"""
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.action_chains import ActionChains
import requests
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException,ElementNotInteractableException
# Module-level Chrome session. The Alchemy methods below reference this
# global `driver` directly instead of holding their own instance.
driver = webdriver.Chrome()
# Open the dapp listing page that the scraper walks through.
driver.get("https://www.alchemy.com/dapps")
# Fixed 1280x680 window.
# NOTE(review): presumably pinned so the page layout matches the hard-coded
# XPaths used by the scraper - confirm.
driver.set_window_size(1280, 680)
class Alchemy:
    """Scrape dapp detail pages with Selenium and store them in a CSV file.

    All browser interaction goes through the module-level ``driver`` object
    (a Selenium WebDriver); the class itself keeps no state.  The CSV helpers
    (``create_csv_if_not_exists``, ``verify_duplicate_in_csv``,
    ``save_to_csv``) only touch the file system and can be used without a
    browser.
    """

    # Compound class names passed to By.CLASS_NAME.
    NEXT_BUTTON_CLASS = 'w-pagination-next.cms-load_next-button'
    DAPP_ITEM_CLASS = 'cms-filter_item.is--dapp.w-dyn-item'
    PARENT_TAG_CLASS = 'item-header_tag.is--parent.w-inline-block'
    CHILD_TAG_CLASS = 'item-header_tag.is--child.w-inline-block'

    # Absolute XPath of the listing; ``{i}`` is the 1-based card position.
    CARD_XPATH = (
        '/html/body/div[1]/main/section[1]/div/div/div/div/div/div[2]'
        '/header/div/div[2]/div[1]/div[{i}]'
    )

    # XPaths of the fields read from a dapp detail page.
    TITLE_XPATH = '//*[@id="w-node-_8e8f92b4-41cd-c97d-c0c2-f11963c19fa0-551a0e8a"]/div[1]/div[2]/h1[2]'
    IMAGE_XPATH = '//*[@id="w-node-_0b28749b-1df0-9158-e12d-ade7d09ab962-551a0e8a"]/img'
    TWITTER_XPATH = '//*[@id="w-node-_0b28749b-1df0-9158-e12d-ade7d09ab977-551a0e8a"]/div[1]/a[2]'
    WEBSITE_XPATH = '//*[@id="w-node-a6e2b1fc-baea-8113-f670-112998a3fa78-551a0e8a"]/a[2]'
    DESCRIPTION_XPATH = '//*[@id="w-node-_0b28749b-1df0-9158-e12d-ade7d09ab990-551a0e8a"]/p'
    CHAINS_XPATH = '//*[@id="w-node-_0b28749b-1df0-9158-e12d-ade7d09ab966-551a0e8a"]/div[3]/div[2]'
    TAGS_CONTAINER_XPATH = '//*[@id="w-node-_0b28749b-1df0-9158-e12d-ade7d09ab968-551a0e8a"]'

    # Column order of the CSV file written by process_page().
    CSV_COLUMNS = ["title", "image", "twitter", "website", "tags", "chains", "description"]

    # Seconds to pause while the listing re-renders (after navigating back
    # from a detail page or after clicking the "next" button).
    LISTING_PAUSE_SECONDS = 4

    def __init__(self):
        """Create the scraper; the browser session is the module-level ``driver``."""

    def _attribute(self, xpath, attribute):
        """Return ``attribute`` of the single element located by ``xpath``."""
        return driver.find_element(By.XPATH, xpath).get_attribute(attribute)

    def process_page(self, i):
        """Open the ``i``-th dapp of the listing, read its details and save them.

        Args:
            i: 1-based position of the dapp card in the listing.

        The browser is left on the detail page; the caller is responsible for
        navigating back.  The record is appended to ``alchemy.csv`` unless a
        row with the same title is already there.
        """
        card_locator = (By.XPATH, self.CARD_XPATH.format(i=i))
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable(card_locator)).click()
        # Short fixed pause after the click before reading the detail page.
        time.sleep(1)

        title = self._attribute(self.TITLE_XPATH, "innerHTML")
        image_url = self._attribute(self.IMAGE_XPATH, "src")
        twitter = self._attribute(self.TWITTER_XPATH, "href")
        website = self._attribute(self.WEBSITE_XPATH, "href")
        description = self._attribute(self.DESCRIPTION_XPATH, "innerHTML")
        chains = self._attribute(self.CHAINS_XPATH, "innerHTML")

        dapps = {
            "title": title,
            "image": image_url,
            "twitter": twitter,
            "website": website,
            "tags": self.get_tags(),
            "chains": chains,
            "description": description,
        }
        self.save_to_csv(dapps, list(self.CSV_COLUMNS), "alchemy")

    def get_tags(self):
        """Return the tag labels of the current detail page.

        Returns:
            A list with the ``innerHTML`` of every parent tag followed by
            every child tag.
        """
        parent_count = len(driver.find_elements(By.CLASS_NAME, self.PARENT_TAG_CLASS))
        child_count = len(driver.find_elements(By.CLASS_NAME, self.CHILD_TAG_CLASS))

        tags = []
        # Parent tags sit under div[1] of the tag container, child tags under
        # div[2]; inside each group the tags are addressed by 1-based index.
        for group, count in ((1, parent_count), (2, child_count)):
            for position in range(1, count + 1):
                xpath = f'{self.TAGS_CONTAINER_XPATH}/div[{group}]/div/div[{position}]/a/div'
                tags.append(self._attribute(xpath, "innerHTML"))
        return tags

    def is_element_present(self, driver, locator):
        """Tell whether an element becomes clickable within 20 seconds.

        Args:
            driver: Selenium WebDriver to query.
            locator: Class name handed to ``By.CLASS_NAME``.

        Returns:
            True if the element became clickable before the timeout,
            False otherwise.
        """
        # Local import: the file-level import line does not bring this name in.
        from selenium.common.exceptions import TimeoutException

        try:
            # element_to_be_clickable takes ONE locator tuple, not two arguments.
            WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CLASS_NAME, locator))
            )
        except TimeoutException:
            print("Element does not exist")
            return False
        print("Element exists")
        return True

    def _next_button_displayed(self):
        """Return True when the pagination "next" button exists and is visible."""
        try:
            return driver.find_element(By.CLASS_NAME, self.NEXT_BUTTON_CLASS).is_displayed()
        except NoSuchElementException:
            return False

    def _scrape_listed_dapps(self, page):
        """Process every dapp card currently listed, returning to the listing each time."""
        cards = driver.find_elements(By.CLASS_NAME, self.DAPP_ITEM_CLASS)
        print(f"Page {page}")
        for position in range(1, len(cards) + 1):
            print(f"Element {position}")
            self.process_page(position)
            driver.back()

    def get_data(self):
        """Scrape the whole listing, following the "next" button until it is gone.

        The current listing is always scraped at least once, so a listing
        without a visible "next" button is still processed.
        """
        page = 1
        while True:
            self._scrape_listed_dapps(page)
            # Let the listing settle after the last driver.back().
            time.sleep(self.LISTING_PAUSE_SECONDS)
            if not self._next_button_displayed():
                break
            driver.find_element(By.CLASS_NAME, self.NEXT_BUTTON_CLASS).click()
            # Give the newly requested items time to render before they are read.
            time.sleep(self.LISTING_PAUSE_SECONDS)
            page += 1
        print("done")

    def verify_duplicate_in_csv(self, dapps, csvfilename):
        """Check whether the title of ``dapps`` is already stored in the CSV file.

        Args:
            dapps: Mapping describing one dapp; only ``'title'`` is read.
            csvfilename: Path of the CSV file without the ``.csv`` extension.

        Returns:
            True if a data row has the same first column as the title,
            False otherwise.

        Raises:
            FileNotFoundError: If the CSV file does not exist.
        """
        title = dapps.get('title')
        with open(f"{csvfilename}.csv", newline='', encoding='utf-8') as f:
            reader = csv.reader(f, delimiter=",", quotechar='"')
            # Skip the headers
            next(reader, None)
            for row in reader:
                # Blank lines are parsed as empty rows; they cannot match.
                if row and row[0] == title:
                    print(f"Duplicate ID found: {title}")
                    return True
        print("No duplicate found.")
        return False

    def create_csv_if_not_exists(self, filepath, header):
        """Create ``<filepath>.csv`` with an optional header row if it is missing.

        Args:
            filepath: Path of the CSV file without the ``.csv`` extension.
            header: Sequence of column names; when empty or None the file is
                created without a header row.
        """
        csv_path = filepath + '.csv'
        if os.path.exists(csv_path):
            print(f"CSV file '{filepath}' already exists.")
            return
        # UTF-8 to match the encoding used when the file is read and appended to.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            if header:
                csv.writer(csv_file).writerow(header)
        print(f"CSV file '{filepath}' created.")

    def save_to_csv(self, dapps, columns, csvfilename):
        """Append ``dapps`` to the CSV file unless its title is already present.

        Args:
            dapps: Mapping with one value per entry of ``columns``.
            columns: Column names, used as header and as field order.
            csvfilename: Path of the CSV file without the ``.csv`` extension.
        """
        # Make sure the file (and its header) exists before reading it.
        self.create_csv_if_not_exists(csvfilename, columns)
        if self.verify_duplicate_in_csv(dapps, csvfilename=csvfilename):
            return
        with open(f'{csvfilename}.csv', 'a', newline='', encoding='UTF-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            print(dapps)
            writer.writerow(dapps)
# Script entry point: these statements run whenever the module is executed
# or imported (there is no __main__ guard). Instantiate the Alchemy class
# and call get_data() to start scraping.
alchemy = Alchemy()
alchemy.get_data()