Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

110 changes: 66 additions & 44 deletions Google-Search-API/gSearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
import itertools
import operator
import json
import pprint
import requests
import urllib


def listGen(filePath):
keyList=[]
Expand All @@ -11,54 +15,72 @@ def listGen(filePath):
file.close()
return keyList

k1 = listGen('/Users/simin/Google Drive/CS599-Simin/gSearching/Google-Search-API/keys/keys1.txt')
k2 = listGen('/Users/simin/Google Drive/CS599-Simin/gSearching/Google-Search-API/keys/keys2.txt')
k3 = listGen('/Users/simin/Google Drive/CS599-Simin/gSearching/Google-Search-API/keys/keys3.txt')
k4 = listGen('/Users/simin/Google Drive/CS599-Simin/gSearching/Google-Search-API/keys/keys4.txt')
k5 = listGen('/Users/simin/Google Drive/CS599-Simin/gSearching/Google-Search-API/keys/keys5.txt')

keywords=[]
# keywords.extend(k1)
# keywords.extend(k2)
# keywords.extend(k3)
# keywords.extend(k4)
keywords.extend(itertools.product(k1,k2))
keywords.extend(itertools.product(k1,k3))
keywords.extend(itertools.product(k1,k4))

print("Number of Search Terms:" + str(len(keywords)))
print (keywords)
def extractKeys():

num_page = 1 #how many pages to consider while searching Google - each pages about 10 urls
k1 = listGen('keys/keys1.txt')
k2 = listGen('keys/keys2.txt')
k3 = listGen('keys/keys3.txt')
k4 = listGen('keys/keys4.txt')
k5 = listGen('keys/keys5.txt')

urlDict = {}
#urlList = []
for terms in keywords:
print (terms)
#here we define the keywords order
search_results = google.search(terms[0] +" " + terms[1], num_page)
#search_results = google.search(terms, num_page)
i=1
for result in search_results:
#urlList.append(result.link)
if result.link in urlDict.keys():
urlDict[result.link][0]+=1
urlDict[result.link][1].append(i)
else:
urlDict[result.link]=[1,[i]]
i += 1
print(result.link)
urlDict = sorted(urlDict.items(), key=operator.itemgetter(1), reverse=True)
keywords = []
keywords.extend(itertools.product(k1, k2))
keywords.extend(itertools.product(k1, k3))
keywords.extend(itertools.product(k1, k4))

#print("Total number of URLs:" + str(len(urlList)))
print("Total number of unique URLs:" + str(len(urlDict)))
print("Number of Search Terms:" + str(len(keywords)))
#pprint.pprint(keywords)
return keywords

with open('top-urls.json', 'w') as fp:
json.dump(urlDict, fp)

topURLlist = open('topURLlist.txt', 'w')
for urls in urlDict:
topURLlist.write(urls[0]+'\n')
topURLlist.close()
def extractURLs():
    """Search Google for every keyword pair and tally the reachable result URLs.

    For each pair returned by ``extractKeys()`` the two terms are joined with a
    space and sent to ``google.search``. Every result link is fetched once; only
    links that answer with HTTP 200 are counted (this omits 404s and other
    broken links).

    The tally maps each URL to ``[x, Y]``:
        x = total number of times this URL has been hit
        Y = list of the positions the URL held in each result list
    e.g. ``{"URL": [2, [4, 2]]}``

    Side effects:
        * writes the tally as JSON to ``top-urls2.json``
        * writes one URL per line to ``topURLlist2.txt``
        * prints progress and link errors to stdout
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    num_page = 1  # how many pages to consider while searching Google - each page has about 10 urls
    link_timeout = 10  # seconds; keeps one dead host from stalling the whole run
    urlDict = {}
    keywords = extractKeys()

    for terms in keywords:
        print(terms)
        # define the keywords order
        search_results = google.search(terms[0] + " " + terms[1], num_page)
        # enumerate keeps the rank tied to the result's real position even
        # when an earlier link fails or is rejected.
        for rank, result in enumerate(search_results, start=1):
            link = str(result.link)
            # check that the link exists [ omit 404 errors ]
            try:
                with urllib.request.urlopen(link, timeout=link_timeout) as response:
                    reachable = response.getcode() == 200
            except Exception as err:
                # Best-effort check: any failure just drops this link.
                # Unlike a bare `except`, Ctrl-C still interrupts the run.
                print("********LINK ERROR [", link, "]********", err)
                continue

            if not reachable:
                continue

            print(link)
            if link in urlDict:
                urlDict[link][0] += 1
                urlDict[link][1].append(rank)
            else:
                urlDict[link] = [1, [rank]]

    # sorting dict based on values ([hits, ranks]), ascending
    # NOTE(review): ascending order puts the least-hit URLs first - confirm
    # this is intended for a "top URLs" listing.
    urlDict = dict(sorted(urlDict.items(), key=operator.itemgetter(1), reverse=False))
    print("Total number of unique URLs:" + str(len(urlDict)))

    with open('top-urls2.json', 'w') as fp:
        json.dump(urlDict, fp, indent=4)

    # Iterating a dict yields its keys, so each item is already the full URL.
    with open('topURLlist2.txt', 'w') as topURLlist:
        for url in urlDict:
            topURLlist.write(url + '\n')

#print(urlDict)

# Script entry point: run the search-and-tally pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    extractURLs()
235 changes: 235 additions & 0 deletions Google-Search-API/keys/searchTerms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
Number of Search Terms:234
[('Marine', 'Sensor'),
('Marine', 'Instrument'),
('Marine', 'Platform'),
('Marine', 'Probe'),
('Marine', 'Detector'),
('Marine', 'Counter'),
('Marine', 'Sampling'),
('Ocean', 'Sensor'),
('Ocean', 'Instrument'),
('Ocean', 'Platform'),
('Ocean', 'Probe'),
('Ocean', 'Detector'),
('Ocean', 'Counter'),
('Ocean', 'Sampling'),
('Coastal', 'Sensor'),
('Coastal', 'Instrument'),
('Coastal', 'Platform'),
('Coastal', 'Probe'),
('Coastal', 'Detector'),
('Coastal', 'Counter'),
('Coastal', 'Sampling'),
('Aquatic', 'Sensor'),
('Aquatic', 'Instrument'),
('Aquatic', 'Platform'),
('Aquatic', 'Probe'),
('Aquatic', 'Detector'),
('Aquatic', 'Counter'),
('Aquatic', 'Sampling'),
('Water', 'Sensor'),
('Water', 'Instrument'),
('Water', 'Platform'),
('Water', 'Probe'),
('Water', 'Detector'),
('Water', 'Counter'),
('Water', 'Sampling'),
('Seawater', 'Sensor'),
('Seawater', 'Instrument'),
('Seawater', 'Platform'),
('Seawater', 'Probe'),
('Seawater', 'Detector'),
('Seawater', 'Counter'),
('Seawater', 'Sampling'),
('Sediment', 'Sensor'),
('Sediment', 'Instrument'),
('Sediment', 'Platform'),
('Sediment', 'Probe'),
('Sediment', 'Detector'),
('Sediment', 'Counter'),
('Sediment', 'Sampling'),
('Seafloor', 'Sensor'),
('Seafloor', 'Instrument'),
('Seafloor', 'Platform'),
('Seafloor', 'Probe'),
('Seafloor', 'Detector'),
('Seafloor', 'Counter'),
('Seafloor', 'Sampling'),
('Water column', 'Sensor'),
('Water column', 'Instrument'),
('Water column', 'Platform'),
('Water column', 'Probe'),
('Water column', 'Detector'),
('Water column', 'Counter'),
('Water column', 'Sampling'),
('Marine', 'Monitoring'),
('Marine', 'Survey'),
('Marine', 'Observation'),
('Marine', 'Scanning'),
('Marine', 'Measurement'),
('Marine', 'Research'),
('Marine', 'Imaging'),
('Marine', 'Profiling'),
('Marine', 'Detecting'),
('Marine', 'Identifying'),
('Marine', 'Characterization'),
('Marine', 'Quantification'),
('Ocean', 'Monitoring'),
('Ocean', 'Survey'),
('Ocean', 'Observation'),
('Ocean', 'Scanning'),
('Ocean', 'Measurement'),
('Ocean', 'Research'),
('Ocean', 'Imaging'),
('Ocean', 'Profiling'),
('Ocean', 'Detecting'),
('Ocean', 'Identifying'),
('Ocean', 'Characterization'),
('Ocean', 'Quantification'),
('Coastal', 'Monitoring'),
('Coastal', 'Survey'),
('Coastal', 'Observation'),
('Coastal', 'Scanning'),
('Coastal', 'Measurement'),
('Coastal', 'Research'),
('Coastal', 'Imaging'),
('Coastal', 'Profiling'),
('Coastal', 'Detecting'),
('Coastal', 'Identifying'),
('Coastal', 'Characterization'),
('Coastal', 'Quantification'),
('Aquatic', 'Monitoring'),
('Aquatic', 'Survey'),
('Aquatic', 'Observation'),
('Aquatic', 'Scanning'),
('Aquatic', 'Measurement'),
('Aquatic', 'Research'),
('Aquatic', 'Imaging'),
('Aquatic', 'Profiling'),
('Aquatic', 'Detecting'),
('Aquatic', 'Identifying'),
('Aquatic', 'Characterization'),
('Aquatic', 'Quantification'),
('Water', 'Monitoring'),
('Water', 'Survey'),
('Water', 'Observation'),
('Water', 'Scanning'),
('Water', 'Measurement'),
('Water', 'Research'),
('Water', 'Imaging'),
('Water', 'Profiling'),
('Water', 'Detecting'),
('Water', 'Identifying'),
('Water', 'Characterization'),
('Water', 'Quantification'),
('Seawater', 'Monitoring'),
('Seawater', 'Survey'),
('Seawater', 'Observation'),
('Seawater', 'Scanning'),
('Seawater', 'Measurement'),
('Seawater', 'Research'),
('Seawater', 'Imaging'),
('Seawater', 'Profiling'),
('Seawater', 'Detecting'),
('Seawater', 'Identifying'),
('Seawater', 'Characterization'),
('Seawater', 'Quantification'),
('Sediment', 'Monitoring'),
('Sediment', 'Survey'),
('Sediment', 'Observation'),
('Sediment', 'Scanning'),
('Sediment', 'Measurement'),
('Sediment', 'Research'),
('Sediment', 'Imaging'),
('Sediment', 'Profiling'),
('Sediment', 'Detecting'),
('Sediment', 'Identifying'),
('Sediment', 'Characterization'),
('Sediment', 'Quantification'),
('Seafloor', 'Monitoring'),
('Seafloor', 'Survey'),
('Seafloor', 'Observation'),
('Seafloor', 'Scanning'),
('Seafloor', 'Measurement'),
('Seafloor', 'Research'),
('Seafloor', 'Imaging'),
('Seafloor', 'Profiling'),
('Seafloor', 'Detecting'),
('Seafloor', 'Identifying'),
('Seafloor', 'Characterization'),
('Seafloor', 'Quantification'),
('Water column', 'Monitoring'),
('Water column', 'Survey'),
('Water column', 'Observation'),
('Water column', 'Scanning'),
('Water column', 'Measurement'),
('Water column', 'Research'),
('Water column', 'Imaging'),
('Water column', 'Profiling'),
('Water column', 'Detecting'),
('Water column', 'Identifying'),
('Water column', 'Characterization'),
('Water column', 'Quantification'),
('Marine', 'Protocol'),
('Marine', 'Procedure'),
('Marine', 'Method'),
('Marine', 'Manual'),
('Marine', 'Guide'),
('Marine', 'Guideline'),
('Marine', 'Technique'),
('Ocean', 'Protocol'),
('Ocean', 'Procedure'),
('Ocean', 'Method'),
('Ocean', 'Manual'),
('Ocean', 'Guide'),
('Ocean', 'Guideline'),
('Ocean', 'Technique'),
('Coastal', 'Protocol'),
('Coastal', 'Procedure'),
('Coastal', 'Method'),
('Coastal', 'Manual'),
('Coastal', 'Guide'),
('Coastal', 'Guideline'),
('Coastal', 'Technique'),
('Aquatic', 'Protocol'),
('Aquatic', 'Procedure'),
('Aquatic', 'Method'),
('Aquatic', 'Manual'),
('Aquatic', 'Guide'),
('Aquatic', 'Guideline'),
('Aquatic', 'Technique'),
('Water', 'Protocol'),
('Water', 'Procedure'),
('Water', 'Method'),
('Water', 'Manual'),
('Water', 'Guide'),
('Water', 'Guideline'),
('Water', 'Technique'),
('Seawater', 'Protocol'),
('Seawater', 'Procedure'),
('Seawater', 'Method'),
('Seawater', 'Manual'),
('Seawater', 'Guide'),
('Seawater', 'Guideline'),
('Seawater', 'Technique'),
('Sediment', 'Protocol'),
('Sediment', 'Procedure'),
('Sediment', 'Method'),
('Sediment', 'Manual'),
('Sediment', 'Guide'),
('Sediment', 'Guideline'),
('Sediment', 'Technique'),
('Seafloor', 'Protocol'),
('Seafloor', 'Procedure'),
('Seafloor', 'Method'),
('Seafloor', 'Manual'),
('Seafloor', 'Guide'),
('Seafloor', 'Guideline'),
('Seafloor', 'Technique'),
('Water column', 'Protocol'),
('Water column', 'Procedure'),
('Water column', 'Method'),
('Water column', 'Manual'),
('Water column', 'Guide'),
('Water column', 'Guideline'),
('Water column', 'Technique')]
1 change: 1 addition & 0 deletions Google-Search-API/top-urls1.json

Large diffs are not rendered by default.

Loading