Commit 317c116

gh-79638: Treat an unreachable robots.txt as "disallow all"
Disallow all access in urllib.robotparser if the robots.txt file is unreachable due to server or network errors.
1 parent: cb7ef18
3 files changed: 61 additions & 17 deletions
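
In practice, the change means a parser whose robots.txt fetch fails with a server error now reports every path as disallowed. A minimal sketch of the new behavior (the host below is hypothetical):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.set_url("http://flaky.example.com/robots.txt")  # hypothetical host
    parser.read()  # suppose the fetch fails with a 503: disallow_all is set
    parser.can_fetch("*", "http://flaky.example.com/page.html")  # -> False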

Lib/test/test_robotparser.py

Lines changed: 50 additions & 16 deletions
@@ -384,26 +384,23 @@ def test_string_formatting(self):
         )

 class BaseLocalNetworkTestCase:

-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
         # clear _opener global variable
-        self.addCleanup(urllib.request.urlcleanup)
+        cls.addClassCleanup(urllib.request.urlcleanup)

-        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
+        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
+        cls.addClassCleanup(cls.server.server_close)

-        self.t = threading.Thread(
+        t = threading.Thread(
             name='HTTPServer serving',
-            target=self.server.serve_forever,
+            target=cls.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        self.t.daemon = True  # In case this function raises.
-        self.t.start()
-
-    def tearDown(self):
-        self.server.shutdown()
-        self.t.join()
-        self.server.server_close()
+        cls.enterClassContext(threading_helper.start_threads([t]))
+        cls.addClassCleanup(cls.server.shutdown)


 SAMPLE_ROBOTS_TXT = b'''\
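
The hunk above replaces per-test setUp/tearDown with class-level setup plus registered cleanups: the server starts once per class, and shutdown/server_close are guaranteed to run in reverse registration order (enterClassContext additionally ties a context manager's exit to a class cleanup). A minimal sketch of that unittest pattern, with an illustrative resource standing in for the server:

    import os
    import tempfile
    import unittest

    class SharedResourceTests(unittest.TestCase):
        @classmethod
        def setUpClass(cls):
            # Create the expensive resource once for the whole class
            # (a stand-in for the HTTP server in the hunk above).
            cls.workdir = tempfile.TemporaryDirectory()
            # Class cleanups run after the last test, in reverse order.
            cls.addClassCleanup(cls.workdir.cleanup)

        def test_shared(self):
            self.assertTrue(os.path.isdir(self.workdir.name))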
@@ -425,7 +422,6 @@ def do_GET(self):
         def log_message(self, format, *args):
             pass

-    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address
@@ -447,24 +443,62 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))


-class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):

         def do_GET(self):
-            self.send_error(403, "Forbidden access")
+            self.send_error(self.server.return_code)

         def log_message(self, format, *args):
             pass

-    @threading_helper.reap_threads
+    def setUp(self):
+        # Make sure that a valid code is set in the test.
+        self.server.return_code = None
+
     def testPasswordProtectedSite(self):
+        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
         parser = urllib.robotparser.RobotFileParser()
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
+
+    def testNotFound(self):
+        self.server.return_code = 404
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
+
+    def testTeapot(self):
+        self.server.return_code = 418
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
+
+    def testServiceUnavailable(self):
+        self.server.return_code = 503
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))


 @support.requires_working_socket()
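
One detail worth noting in HttpErrorsTestCase: do_GET reads a mutable attribute off the shared server object, so each test can pick the status code to return without restarting the server. The same trick in isolation (names here are illustrative):

    from http.server import HTTPServer, BaseHTTPRequestHandler

    class ErrorHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # self.server is the HTTPServer that accepted the connection,
            # so per-test state can be hung off the shared server object.
            self.send_error(self.server.return_code)

        def log_message(self, format, *args):
            pass  # keep test output quiet

    server = HTTPServer(('127.0.0.1', 0), ErrorHandler)
    server.return_code = 503  # each test sets its code before the request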

Lib/urllib/robotparser.py

Lines changed: 9 additions & 1 deletion
@@ -77,9 +77,17 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
+                # If access to robot.txt has the status Unauthorized/Forbidden,
+                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif err.code >= 400 and err.code < 500:
+            elif 400 <= err.code < 500:
+                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
+                # resources on the server.
                 self.allow_all = True
+            elif 500 <= err.code < 600:
+                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
+                # complete disallow.
+                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
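
Condensed into a standalone decision table, the status handling in read() now looks like this (a sketch for illustration, not part of the module's API):

    def robots_policy(status):
        # Mirrors the branch order in RobotFileParser.read() above.
        if status in (401, 403):
            return 'disallow all'    # auth errors likely cover the whole site
        elif 400 <= status < 500:
            return 'allow all'       # RFC 9309, 2.3.1.3: crawler MAY access all
        elif 500 <= status < 600:
            return 'disallow all'    # RFC 9309, 2.3.1.4: crawler MUST disallow
        return 'parse robots.txt'    # success: apply the file's rules

    assert robots_policy(404) == 'allow all'
    assert robots_policy(503) == 'disallow all'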
Misc/NEWS.d (news entry)

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
+is unreachable due to server or network errors.
