# crawler-gevent.py
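# A small single-host web crawler: it downloads pages with urllib2, mirrors them
# under OUTPUT_DIR and follows same-host links using a pool of gevent greenlets.
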
from gevent import monkey
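# patch_all() replaces blocking stdlib calls (sockets, DNS lookups, sleep) with
# cooperative gevent versions, so the urllib2 downloads below yield to other greenlets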
monkey.patch_all()
import gevent
import gevent.pool
import urllib2
import os
import threading
import optparse
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
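
# OUTPUT_DIR is where downloaded pages are mirrored on disk; the collections below
# track crawl progress:
#   pending_links - links discovered but not yet scheduled (used as a FIFO queue)
#   visited_links - links that have already been crawled
#   crawlin_links - links currently being downloaded by a greenlet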
OUTPUT_DIR = 'output'
pending_links = []
visited_links = set()
crawlin_links = set()


def download_and_save(url):
    """
    Download one page from the web and save it to OUTPUT_DIR preserving relative path
    :param url: url to download
    :return: string with raw web page content
    """
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    page = response.read()
    # get_selector() returns the path part of the URL; mirror it under OUTPUT_DIR
    rel_path = request.get_selector()
    parts = rel_path.split('/')
    # a trailing '/' leaves an empty file name; fall back to index.html so the write does not fail
    if not parts[-1]:
        parts[-1] = 'index.html'
    output_dir_path = parts[:-1]
    output_dir = os.path.join(OUTPUT_DIR, *output_dir_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(os.path.join(OUTPUT_DIR, *parts), 'w') as output_file:
        output_file.write(page)
    # print('%s' % (threading.currentThread()))
    # print('written to disk: %s' % (request.get_full_url()))
    return page


def is_new_link(current, link):
    """
    Is this a link we didn't visit before?
    :param current: current link we're crawling
    :param link: subject link
    :return: true/false
    """
    return \
        link not in visited_links and \
        link not in pending_links and \
        link not in crawlin_links and \
        current != link


def drop_params(url):
    """
    Drop query params and fragment from URL,
    e.g. 'http://host/a/b?x=1#top' becomes 'http://host/a/b'
    """
    return urlunparse(urlparse(url)[:3] + ('', '', ''))


def same_host(url1, url2):
    """
    Do these two URLs point to the same host?
    """
    return urlparse(url1)[1] == urlparse(url2)[1]


def crawl_one(url):
    """
    Crawl one page, i.e. download and collect outgoing links from one web page
    :param url: url to crawl
    :return: None
    """
    page = download_and_save(url)
    soup = BeautifulSoup(page, 'html.parser')
    tags = soup.find_all('a')
    # resolve relative links against the current URL, skipping anchors without an href
    links = [urljoin(url, tag.get('href')) for tag in tags if tag.get('href')]
    # print('%s - links crawled: %s' % (threading.currentThread(), links))
    # keep only unseen same-host links, with query params and fragments stripped
    new_links = set(filter(
        lambda l: same_host(url, l),
        (drop_params(link) for link in links if is_new_link(url, drop_params(link)))
    ))
    new_links = sorted(new_links)
    # print('%s - new links: %s' % (threading.currentThread(), new_links))
    pending_links.extend(new_links)
    visited_links.add(url)
    crawlin_links.remove(url)


def crawl(start_url, concurrency_level, visited_link_limit):
    """
    Main crawling function. Uses a pool of greenlets to get the job done
    :param start_url: URL to start crawling from
    :param concurrency_level: number of concurrent downloads
    :param visited_link_limit: maximum number of links to crawl
    :return: None
    """
    print('start crawling from %s' % start_url)
    print('concurrency level: %s, visited link limit: %s' % (concurrency_level, visited_link_limit))
    # init our pending links with start_url
    pending_links.append(start_url)
    pool = gevent.pool.Pool(concurrency_level)
    # limit number of visited links, just for testing purposes
    while len(visited_links) < visited_link_limit and (len(pending_links) > 0 or len(crawlin_links) > 0):
        # if there is nothing more to schedule, then wait for current jobs to complete and try again
        if not pending_links:
            pool.join()
            continue
        link = pending_links.pop(0)
        crawlin_links.add(link)
        pool.wait_available()
        pool.add(gevent.spawn(crawl_one, link))
        # print('%s - current visited: %s' % (threading.currentThread(), visited_links))
    pool.join()
    # print('%s - visited links: %s' % (threading.currentThread(), visited_links))
    # print('%s - pending links: %s' % (threading.currentThread(), pending_links))
    print('Done. %s links visited.' % len(visited_links))


if __name__ == '__main__':
    usage = 'usage: %prog [options] <start URL>'
    parser = optparse.OptionParser(usage)
    parser.add_option(
        '-c', '--concurrency',
        action='store', type='int', dest='concurrency', default=6
    )
    parser.add_option(
        '-l', '--limit',
        action='store', type='int', dest='limit', default=100
    )
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error('start URL for crawl was not specified')
    main = gevent.spawn(
        crawl,
        args[0],
        options.concurrency,
        options.limit
    )
    main.start()
    gevent.wait([main])
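
# Example usage (assumes Python 2 with gevent and beautifulsoup4 installed):
#   python crawler-gevent.py -c 8 -l 50 http://example.com/
# crawls up to 50 same-host pages with 8 concurrent downloads and saves them under ./output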