#!/usr/bin/env python
#-*- coding:utf-8 -*-
import sys
import urllib2
from Queue import Queue
from bs4 import BeautifulSoup, Doctype, Comment
from time import sleep, time
#Custom Modules
from logger import *
from storage import *
from threadpool import *
from status import *
from defaults import *
class Spider(object):
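    """
    A simple multi-threaded crawler: starting from a seed URL it fetches pages
    with a pool of worker threads, follows links up to max_depth, optionally
    searches each page for a keyword and saves matching pages through the
    given storage backend.
    """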
def __init__(self, url, max_depth=10000, workers=2, storage=None, keyword=None):
self.url = url
        if self.url[-1] != '/':
            self.url += '/'
self.max_depth = max_depth
self.stat = Status()
        self.storage = storage #Storage backend used to save matched pages
self.parsed = set() #urls processed
#Set up keyword searching function
if keyword:
LOGGER.debug('Search keyword: %s', keyword)
self.keyword = keyword.decode(INPUT_ENCODING)
else:
LOGGER.debug('No search keyword')
self.keyword = None
#Single-threading
#self.queue = Queue()
#self.queue.put((self.url, 0))
#Worker pool
self.pool = ThreadPool(workers)
self.pool_stop = True
def run(self):
"""Worker pool mode"""
self.pool_stop = False
self.pool.start()
self.stat.clear()
#add 1st job
self.pool.add_job(self.worker, self.url, 0)
last_output = time()
        idle_count = 0 #To judge whether the pool is really idle
try:
LOGGER.info('++++++++++ Run! ++++++++++')
while True:
now = time()
#IDLE judgement
if self.pool.idle():
idle_count += 1
if idle_count >= IDLE_TO_QUIT:
break
else:
idle_count = 0
#Should show stat?
if now - last_output >= OUTPUT_INTERVAL:
self.print_stat()
last_output = now
sleep(2)
except KeyboardInterrupt:
LOGGER.info('++++++++++ [CTRL-c] received! ++++++++++')
finally:
self.stop()
print '\n\nFinal stat:'
self.print_stat()
LOGGER.info('++++++++++ Terminated! ++++++++++')
def stop(self):
self.pool_stop = True
self.pool.stop(immediately=True)
#UGLY: wait for storage
if self.storage and hasattr(self.storage, 'join') and callable(self.storage.join):
self.storage.join()
def worker(self, url, depth):
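        """
        Process one URL: fetch the page (collecting child links only while
        depth+1 < max_depth), record it as parsed, save it if it matched the
        keyword, and enqueue the unseen child URLs at depth+1.
        """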
LOGGER.debug('Start with %s', url)
content, urls, match = self.fetch(url, fetch_urls=(depth+1 < self.max_depth), keyword=self.keyword)
self.parsed.add(url) #Mark parsed
if not content:
self.stat.update(failed=1)
return
#Save page
if match and self.storage:
self.storage.save(url, content)
#Handle child urls
for u in urls:
if u and u not in self.parsed:
self.pool.add_job(self.worker, u, depth+1)
self.stat.update(matched=match)
LOGGER.debug('End with %s', url)
def bs4_search_keyword(self, soup, keyword):
"""
Use BeautifulSoup object to search.
It's better to put these all in a new module.
"""
def text_filter(elem):
if isinstance(elem, (Doctype, Comment)):
return False
elif elem.parent and elem.parent.name in ['script', 'noscript', 'link']:
return False
else:
return True
text = soup.find_all(text=text_filter)
for t in text:
if keyword in unicode(t).strip():
return True
return False
def fetch(self, url, fetch_urls=False, keyword=None):
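        """
        Download url, optionally search the page for keyword and collect
        normalized child links. Returns a (content, urls, match) tuple;
        content is None when the fetch failed.
        """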
LOGGER.info('-- Fetch %s', url)
urls = list()
content = None
match = False
try:
content = urllib2.urlopen(url, timeout=FETCH_TIMEOUT).read()
        except urllib2.URLError as e:
            LOGGER.error('%s %s', str(e), url)
            return (content, urls, match)
        except Exception as e:
            LOGGER.error('%s %s', str(e), url)
            return (content, urls, match)
soup = BeautifulSoup(content, 'lxml')
#Search keyword
if keyword:
match = self.bs4_search_keyword(soup, keyword)
        #Collect child URLs only if the caller allows going deeper.
if fetch_urls:
for a in soup.find_all('a'):
if a.get('href'):
u = self.normalized_url(url, a['href'])
if u:
urls.append(u)
return (content, urls, match)
def normalized_url(self, base_url, url):
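        """
        Resolve a relative link (its path only) against base_url; return
        absolute http/https/ftp URLs unchanged and drop other schemes
        (mailto:, javascript:, ...) by returning None.
        """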
u = urllib2.urlparse.urlsplit(url)
if not u.scheme and not u.netloc:
#Relative URL.
return urllib2.urlparse.urljoin(base_url, u.path)
else:
#Absolute URL.
if u.scheme in ['http', 'https', 'ftp']:
return url
def print_stat(self):
st = self.stat.get()
qsize = self.pool.jobs_count()
print 'Total: %d, Fetch-failed: %d, Matched: %d, In-Queue: %d' %(st['total'], st['failed'], st['matched'], qsize)
def load_options(args):
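    """Parse command-line options, validate URL/DEPTH/THREAD and set up logging."""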
from optparse import OptionParser
parser = OptionParser('usage: %prog -u URL -d DEPTH [options]')
    parser.add_option('-u', dest='url', type='string', help='URL to start with.', metavar='URL') #URL option
parser.add_option('-d', dest='depth', type='int', help='Parse depth.', metavar='DEPTH') #DEPTH option
parser.add_option('-l', dest='loglevel', default=3, type='int', help='Log level.', metavar='LOGLEVEL') #LOGLEVEL option
parser.add_option('-f', dest='logfile', type='string', help='Log file name.', metavar='LOGFILE') #LOGFILE option
    parser.add_option('--thread', dest='thread', type='int', default=10, help='Thread number.', metavar='THREAD') #THREAD option
    parser.add_option('--dbfile', dest='dbfile', type='string', help='SQLite database file name.', metavar='DBFILE') #DBFILE option
    parser.add_option('--key', dest='keyword', type='string', help='Search keyword.', metavar='KEYWORD') #KEYWORD option
(options, unknown) = parser.parse_args(args)
    for un in unknown[1:]: #unknown[0] is the script name, since sys.argv is passed in whole
print '[UNKNOWN] %s' % un
#URL
if not options.url:
raise ValueError('URL is required.')
#DEPTH
if not options.depth:
raise ValueError('DEPTH is required.')
#THREAD
if options.thread < 1:
        raise ValueError('THREAD must be >= 1.')
#Logging
init_logger(options.loglevel, options.logfile)
return options
if __name__=='__main__':
opts = load_options(sys.argv)
if opts.dbfile:
store = SqliteStorage(opts.dbfile)
elif FILE_STORE_DIR:
store = FileStorage(FILE_STORE_DIR)
else:
store = None
spider = Spider(url=opts.url, max_depth=opts.depth, workers=opts.thread, storage=store, keyword=opts.keyword)
spider.run()
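# Example invocation (the URL, database file and keyword below are placeholders;
# the companion logger/storage/threadpool/status/defaults modules must be importable):
#   python spider.py -u http://example.com/ -d 3 --thread 5 --dbfile pages.db --key python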