Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/http_crawler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
__version__ = '0.1.2'


def crawl(base_url, follow_external_links=True):
def crawl(base_url, follow_external_links=True, follow_redirects=True):
base_netloc = urlparse(base_url).netloc

seen = set([base_url])
Expand All @@ -19,7 +19,10 @@ def crawl(base_url, follow_external_links=True):

while todo:
url = todo.pop()
rsp = session.get(url)
rsp = session.get(url, allow_redirects=follow_redirects)

if rsp.status_code // 100 == 3:
continue
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should still yield a response, even if it is a redirect. For instance, a user of the library might want to know about all requests that are redirected. We should also extract any URLs from the Location header of a 3xx response.

Please could you:

  • remove these two lines, so that we continue to yield all responses;
  • add a check in this section so that if the response is a 3xx, we try to extract a URL from the Location header.

Does that make sense?

Once you've done that, you should also add yourself to AUTHORS.txt

Copy link
Author

@rkrp rkrp Oct 13, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a check in this section so that if the response is a 3xx, we try to extract a URL from the Location header.

@inglesp By doing this, wouldn't we essentially be following redirects? I am confused.

If we are proceeding in this manner, I think it would be a better approach to scrape the body of the 3xx HTTP response as well, in addition to adding the URL from the Location header.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @rkrp. Sorry for the slow reply -- have been AFK for a few days.

I understand your confusion, and I think it's because the wording of #5 is unclear. I'm sorry about that!

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking about this a bit more, follow_redirects is not the right name for the new argument, since it suggests that 3xx responses shouldn't be followed at all.

Can you think of a better name for it?

Copy link
Author

@rkrp rkrp Oct 21, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@inglesp How about pause_at_redirects or scrap_redirects?

Any thoughts on this?

I think it would be a better approach to scrape the body of the 3xx HTTP response as well, in addition to adding the URL from the Location header.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @rkrp. I'm so sorry it's taken me so long to reply to you.

What about auto_follow_redirects?

And yes, we should also scrape the body of the 3xx response.


yield rsp

Expand Down
7 changes: 7 additions & 0 deletions tests/redirect-site/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<html>
<body>
<!-- Test fixture for the redirect tests: both anchors point at
     redirect-old-path/, which the test HTTP handler answers with a
     301 whose Location header rewrites the segment to redirect-new-path/. -->
<p>This is a test page</p>
<a href="redirect-old-path/page-1.html">page-1</a>
<a href="redirect-old-path/page-2.html">page-2</a>
</body>
</html>
1 change: 1 addition & 0 deletions tests/redirect-site/redirect-new-path/page-1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello
1 change: 1 addition & 0 deletions tests/redirect-site/redirect-new-path/page-2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
World
43 changes: 42 additions & 1 deletion tests/test_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,28 @@ def serve():

serving = True

class HTTPHandler(SimpleHTTPRequestHandler):
    """Request handler that 301-redirects 'redirect-old-path' URLs.

    Any GET whose second-to-last path segment is 'redirect-old-path'
    receives a 301 pointing at the same resource under
    'redirect-new-path'; every other request is served from disk as
    usual.
    """

    def do_GET(self):
        segments = self.path.split('/')
        # Guard clause: anything outside the old path is served normally.
        if segments[-2] != 'redirect-old-path':
            return super(HTTPHandler, self).do_GET()

        # Rewrite the offending segment and answer with a redirect.
        segments[-2] = 'redirect-new-path'
        self.send_response(301)
        self.send_header('Location', '/'.join(segments))
        self.send_header('content-type', 'text/html')
        self.end_headers()

def _serve(dir, port):
    """Serve tests/<dir> forever on the given port (blocks; run in a subprocess)."""
    os.chdir(os.path.join('tests', dir))
    HTTPServer(('', port), HTTPHandler).serve_forever()

# Spin up one daemon server process per test site.
for site_dir, site_port in (('site', 8000),
                            ('external-site', 8001),
                            ('redirect-site', 8002)):
    Process(target=_serve, args=(site_dir, site_port), daemon=True).start()


def test_crawl():
Expand Down Expand Up @@ -112,3 +126,30 @@ def test_extract_urls_from_css():
'/assets/somefont.eot',
'/assets/somefont.ttf',
}


def test_with_redirect():
    """Crawling with redirect-following enabled (the default) should
    yield the post-redirect URLs, not the original 'old path' ones."""
    serve()

    rsps = list(http_crawler.crawl('http://localhost:8002/'))
    # Set comprehension / set literals instead of set([...]) wrappers.
    actual_urls = {rsp.url for rsp in rsps}
    expected_urls = {
        'http://localhost:8002/',
        'http://localhost:8002/redirect-new-path/page-1.html',
        'http://localhost:8002/redirect-new-path/page-2.html',
    }

    assert actual_urls == expected_urls


def test_without_redirect():
    """With follow_redirects=False the crawler should stop at 3xx
    responses, so only the root page is yielded."""
    serve()

    rsps = list(http_crawler.crawl('http://localhost:8002/',
                                   follow_redirects=False))
    # Set comprehension / set literal instead of set([...]) wrappers.
    actual_urls = {rsp.url for rsp in rsps}
    expected_urls = {
        'http://localhost:8002/',
    }

    assert actual_urls == expected_urls