-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathSearch_Engine.py
More file actions
157 lines (136 loc) · 3.18 KB
/
Search_Engine.py
File metadata and controls
157 lines (136 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#Search Engine 0.2 - sushi
import urllib2
import operator
def get_next_target(page):
    """Locate the first double-quoted ``<a href=`` link in *page*.

    Returns a ``(url, end_quote)`` tuple, where ``end_quote`` is the index in
    *page* of the link's closing quote so a caller can resume scanning from
    there.  Returns ``(None, 0)`` when *page* has no ``<a href=`` marker, or
    when the marker is not followed by a complete ``"..."`` pair.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0

    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # No quote after the marker (e.g. an unquoted href).  Carrying on
        # with -1 would slice a bogus "url" out of unrelated earlier text.
        return None, 0

    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated href: there is no complete quoted target left.
        return None, 0

    return page[start_quote + 1:end_quote], end_quote
def get_all_links(page):
    """Collect every link target found in *page*, in document order.

    Repeatedly asks ``get_next_target`` for the next link and then resumes
    scanning from that link's closing quote.  Empty targets (``href=""``)
    are skipped rather than treated as the end of the page, so links that
    follow one are still collected.  The result may contain duplicates.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url is None:
            # Only None means "no more links"; '' is just an empty href.
            break
        if url:
            links.append(url)
        page = page[endpos:]
    return links
def get_page(link):
    """Download *link* and return the response body, or ``''`` on failure.

    ``mailto:`` links are never fetched.  A ``URLError``, ``HTTPError`` or
    ``ValueError`` raised while requesting the page yields ``''`` so that a
    crawl can carry on past unreachable or malformed links.
    """
    # Test the scheme only: a plain substring search for "mailto" would also
    # skip ordinary pages whose URL merely contains that word.
    if link.lstrip().lower().startswith('mailto:'):
        return ''
    try:
        req = urllib2.Request(link, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' })
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails part-way.
            response.close()
    except (urllib2.URLError, urllib2.HTTPError, ValueError):
        return ''
def split_string(source, splitlist=",!-.;/?@ #"):
    """Split *source* into words at any character found in *splitlist*.

    Runs of consecutive separators produce no empty words, and leading or
    trailing separators are ignored.  The default separators do not include
    newlines or tabs; pass a different *splitlist* to split on those too.

    Returns the list of words in the order they appear.
    """
    separators = set(splitlist)
    output = []
    current = []  # characters of the word currently being read
    for char in source:
        if char in separators:
            if current:
                output.append(''.join(current))
                current = []
        else:
            current.append(char)
    if current:
        # Flush the final word when the text does not end on a separator.
        output.append(''.join(current))
    return output
def add_to_index(index, keyword, url):
    """Record in *index* that *url* contains *keyword*.

    *index* maps each keyword to a list of URLs and is modified in place.
    A URL is stored at most once per keyword; nothing is returned.
    """
    urls = index.setdefault(keyword, [])
    if url not in urls:
        urls.append(url)
def add_page_to_index(index, url, content):
    """Index every word of *content* under *url*.

    The text is tokenised with ``split_string`` and each resulting word is
    passed to ``add_to_index`` together with *index* and *url*.
    """
    for token in split_string(content):
        add_to_index(index, token, url)
def lookup(index, keyword):
    """Return the value stored in *index* for *keyword*, or ``None`` if absent."""
    return index.get(keyword)
def lookup_ranked(index, keyword, ranks):
    """Return the URLs indexed under *keyword*, highest rank first.

    *index* maps keywords to lists of URLs and *ranks* maps each URL to its
    score.  Every URL listed for the keyword must have an entry in *ranks*,
    otherwise ``KeyError`` is raised.

    When *keyword* is not in *index* the string ``"no results :("`` is
    returned instead of a list, so callers have to handle both types.
    """
    results = index.get(keyword)
    if results is None:
        return "no results :("
    rank_of = dict((url, ranks[url]) for url in results)
    return sorted(rank_of, key=rank_of.get, reverse=True)
def union(p, q):
    """Append to list *p*, in place, each item of *q* that *p* lacks.

    Items are added in the order they occur in *q*; an item repeated inside
    *q* is added only once.  Nothing is returned.
    """
    for item in q:
        if item in p:
            continue
        p.append(item)
def compute_ranks(graph, d=0.8, numloops=10):
    """Compute a rank for every page in *graph* by repeated relaxation.

    *graph* maps each page to the list of links found on it.  Every page
    starts at ``1 / npages``.  On each pass a page's new rank is
    ``(1 - d) / npages`` plus, for every page that links to it,
    ``d * rank(linker) / len(graph[linker])``.

    Parameters:
        graph: dict of page -> list of outgoing links (links must be hashable).
        d: damping factor.
        numloops: number of relaxation passes.

    Returns a dict of page -> rank; an empty graph gives ``{}``.
    """
    npages = len(graph)
    if npages == 0:
        return {}

    # Invert the graph once so each pass only visits real inbound links
    # instead of scanning every page's outlink list for every page.
    inlinks = dict((page, []) for page in graph)
    for node in graph:
        # set(): a link repeated on one page counts as a single inbound link,
        # although the repeats still count towards len(graph[node]) below.
        for target in set(graph[node]):
            if target in inlinks:  # links to pages outside the graph are ignored
                inlinks[target].append(node)

    ranks = dict.fromkeys(graph, 1.0 / npages)
    for _ in range(numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in inlinks[page]:
                newrank += d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
def crawl_web(seed, max_depth):
    """Crawl outward from *seed*, building a keyword index and a link graph.

    Pages are handled one depth level at a time: links found on the current
    level are queued for the next one.  Crawling stops when the depth
    exceeds *max_depth* or when there is nothing left to visit.  Each page
    is fetched at most once.

    Returns ``(index, graph)``.  *index* is filled by ``add_page_to_index``;
    *graph* maps every crawled page to the list of links found on it.
    """
    tocrawl = [seed]
    crawled = set()  # used only for membership tests
    index = {}
    graph = {}
    depth = 0
    next_depth = []
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(next_depth, outlinks)
            crawled.add(page)
        if not tocrawl:
            # Current level exhausted: move on to the links it discovered.
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return index, graph
#seed='http://xkcd.com/353'
#index, graph = crawl_web(seed,1)
#print lookup(index,'contact')