
Commit 84119fa

gh-79638: Treat an unreachable robots.txt as "disallow all"
Disallow all access in urllib.robotparser if the robots.txt file is unreachable due to server or network errors.
1 parent cb7ef18
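In effect, RobotFileParser.read() now maps a failed robots.txt fetch to three outcomes: 401 and 403 responses set disallow_all, any other 4xx response sets allow_all (RFC 9309, Section 2.3.1.3: the crawler MAY access any resources), and any 5xx response sets disallow_all (RFC 9309, Section 2.3.1.4: the crawler MUST assume complete disallow). See the Lib/urllib/robotparser.py diff below.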

3 files changed: 63 additions & 18 deletions


Lib/test/test_robotparser.py

Lines changed: 52 additions & 17 deletions
@@ -5,7 +5,6 @@
 import urllib.robotparser
 from test import support
 from test.support import socket_helper
-from test.support import threading_helper
 from http.server import BaseHTTPRequestHandler, HTTPServer


@@ -384,26 +383,25 @@ def test_string_formatting(self):
         )
 class BaseLocalNetworkTestCase:

-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
         # clear _opener global variable
-        self.addCleanup(urllib.request.urlcleanup)
+        cls.addClassCleanup(urllib.request.urlcleanup)

-        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
+        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
+        cls.addClassCleanup(cls.server.server_close)

-        self.t = threading.Thread(
+        t = threading.Thread(
             name='HTTPServer serving',
-            target=self.server.serve_forever,
+            target=cls.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        self.t.daemon = True  # In case this function raises.
-        self.t.start()
-
-    def tearDown(self):
-        self.server.shutdown()
-        self.t.join()
-        self.server.server_close()
+        t.daemon = True  # In case this function raises.
+        t.start()
+        cls.addClassCleanup(t.join)
+        cls.addClassCleanup(cls.server.shutdown)


 SAMPLE_ROBOTS_TXT = b'''\

@@ -425,7 +423,6 @@ def do_GET(self):
         def log_message(self, format, *args):
             pass

-    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address

@@ -447,24 +444,62 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))


-class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):

         def do_GET(self):
-            self.send_error(403, "Forbidden access")
+            self.send_error(self.server.return_code)

         def log_message(self, format, *args):
             pass

-    @threading_helper.reap_threads
+    def setUp(self):
+        # Make sure that a valid code is set in the test.
+        self.server.return_code = None
+
     def testPasswordProtectedSite(self):
+        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
         parser = urllib.robotparser.RobotFileParser()
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
+
+    def testNotFound(self):
+        self.server.return_code = 404
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
+
+    def testTeapot(self):
+        self.server.return_code = 418
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
+
+    def testServiceUnavailable(self):
+        self.server.return_code = 503
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))


 @support.requires_working_socket()
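A side note on the setUpClass refactor above: unittest runs class cleanups in LIFO order, so registering server_close first and server.shutdown last reproduces the old tearDown sequence (shutdown, join the thread, close the server). A minimal sketch of that ordering, assuming only standard unittest behavior (the class and test names here are invented for illustration):

import unittest

class CleanupOrderDemo(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # addClassCleanup callbacks run in reverse registration order.
        cls.addClassCleanup(print, "3. server_close would run last")
        cls.addClassCleanup(print, "2. thread join would run next")
        cls.addClassCleanup(print, "1. shutdown would run first")

    def test_noop(self):
        pass

if __name__ == "__main__":
    unittest.main()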

Lib/urllib/robotparser.py

Lines changed: 9 additions & 1 deletion
@@ -77,9 +77,17 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
+                # If access to robots.txt has the status Unauthorized/Forbidden,
+                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif err.code >= 400 and err.code < 500:
+            elif 400 <= err.code < 500:
+                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
+                # resources on the server.
                 self.allow_all = True
+            elif 500 <= err.code < 600:
+                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
+                # complete disallow.
+                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
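A minimal end-to-end sketch of what the new 5xx branch means for callers; this is not part of the commit, and the handler name and URLs are invented for illustration:

import threading
import urllib.robotparser
from http.server import BaseHTTPRequestHandler, HTTPServer

class AlwaysFailingHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_error(503)      # simulate a server-side failure

    def log_message(self, format, *args):
        pass                      # keep the demo quiet

server = HTTPServer(("127.0.0.1", 0), AlwaysFailingHandler)
threading.Thread(target=server.serve_forever, daemon=True).start()

url = "http://127.0.0.1:%d" % server.server_address[1]
parser = urllib.robotparser.RobotFileParser(url + "/robots.txt")
parser.read()
# The 503 lands in the new 5xx branch, which sets disallow_all:
print(parser.can_fetch("*", url + "/index.html"))  # False

server.shutdown()
server.server_close()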
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
+is unreachable due to server or network errors.
