Commit fdf75b3

Revert "gh-79638: Treat an unreachable robots.txt as "disallow all" (GH-138555)"
This reverts commit 310fe88.
Parent commit: fbdda04

3 files changed: 17 additions & 61 deletions
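For orientation before the diffs: this revert restores RobotFileParser.read()'s long-standing HTTP error handling, in which a 401 or 403 on robots.txt sets disallow_all, any other 4xx sets allow_all, and a 5xx no longer sets either flag. A minimal sketch of the restored observable behavior against a throwaway local server (the probe() helper and the loopback address are illustrative, not part of this commit):

```python
# Sketch: what RobotFileParser reports after this revert when the server
# answers /robots.txt with a fixed HTTP error status.
import threading
import urllib.robotparser
from http.server import BaseHTTPRequestHandler, HTTPServer

def probe(status):
    """Serve `status` for every request, then ask can_fetch('*', '/')."""
    class Handler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_error(status)
        def log_message(self, format, *args):
            pass  # keep the demo quiet

    server = HTTPServer(("127.0.0.1", 0), Handler)
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url("http://127.0.0.1:%d/robots.txt"
                       % server.server_address[1])
        parser.read()
        return parser.can_fetch("*", "/")
    finally:
        server.shutdown()
        thread.join()
        server.server_close()

if __name__ == "__main__":
    print(probe(403))  # False: 401/403 -> disallow_all
    print(probe(404))  # True: any other 4xx -> allow_all
```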

Lib/test/test_robotparser.py

Lines changed: 16 additions & 50 deletions
@@ -646,23 +646,26 @@ def test_group_without_user_agent(self):
         )
 
 class BaseLocalNetworkTestCase:
-    @classmethod
-    def setUpClass(cls):
+    def setUp(self):
         # clear _opener global variable
-        cls.addClassCleanup(urllib.request.urlcleanup)
+        self.addCleanup(urllib.request.urlcleanup)
 
-        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
-        cls.addClassCleanup(cls.server.server_close)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
-        t = threading.Thread(
+        self.t = threading.Thread(
             name='HTTPServer serving',
-            target=cls.server.serve_forever,
+            target=self.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        cls.enterClassContext(threading_helper.start_threads([t]))
-        cls.addClassCleanup(cls.server.shutdown)
+        self.t.daemon = True  # In case this function raises.
+        self.t.start()
+
+    def tearDown(self):
+        self.server.shutdown()
+        self.t.join()
+        self.server.server_close()
 
 
 SAMPLE_ROBOTS_TXT = b'''\

@@ -684,6 +687,7 @@ def do_GET(self):
     def log_message(self, format, *args):
         pass
 
+    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address

@@ -705,62 +709,24 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
 
 
-class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):
 
         def do_GET(self):
-            self.send_error(self.server.return_code)
+            self.send_error(403, "Forbidden access")
 
         def log_message(self, format, *args):
             pass
 
-    def setUp(self):
-        # Make sure that a valid code is set in the test.
-        self.server.return_code = None
-
+    @threading_helper.reap_threads
    def testPasswordProtectedSite(self):
-        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
         parser = urllib.robotparser.RobotFileParser()
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
-        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
-
-    def testNotFound(self):
-        self.server.return_code = 404
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertTrue(parser.can_fetch("*", robots_url))
-        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
-
-    def testTeapot(self):
-        self.server.return_code = 418
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertTrue(parser.can_fetch("*", robots_url))
-        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
-
-    def testServiceUnavailable(self):
-        self.server.return_code = 503
-        addr = self.server.server_address
-        url = f'http://{socket_helper.HOST}:{addr[1]}'
-        robots_url = url + "/robots.txt"
-        parser = urllib.robotparser.RobotFileParser()
-        parser.set_url(url)
-        parser.read()
-        self.assertFalse(parser.can_fetch("*", robots_url))
-        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
 
 
 @support.requires_working_socket()
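The bulk of the test diff above is a fixture change: the revert trades the class-level setUpClass()/addClassCleanup() fixture for a per-test setUp()/tearDown() pair plus the threading_helper.reap_threads decorator, which joins any threads a test leaves behind. Stripped of the CPython test helpers, the restored pattern is roughly this (a sketch; the class, handler, and test names are illustrative):

```python
# Sketch of the per-test server fixture the revert restores: one fresh
# HTTPServer and serving thread per test, torn down symmetrically.
import threading
import unittest
import urllib.error
import urllib.request
from http.server import BaseHTTPRequestHandler, HTTPServer

class LocalServerTestCase(unittest.TestCase):
    class Handler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_error(403, "Forbidden access")
        def log_message(self, format, *args):
            pass

    def setUp(self):
        self.server = HTTPServer(("127.0.0.1", 0), self.Handler)
        self.t = threading.Thread(target=self.server.serve_forever,
                                  kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # in case setUp() raises before tearDown() runs
        self.t.start()

    def tearDown(self):
        self.server.shutdown()       # stop serve_forever()
        self.t.join()                # wait for the serving thread to exit
        self.server.server_close()   # release the listening socket

    def test_forbidden(self):
        url = "http://127.0.0.1:%d/" % self.server.server_address[1]
        with self.assertRaises(urllib.error.HTTPError):
            urllib.request.urlopen(url)

if __name__ == "__main__":
    unittest.main()
```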

Lib/urllib/robotparser.py

Lines changed: 1 addition & 9 deletions
@@ -65,17 +65,9 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
-                # If access to robot.txt has the status Unauthorized/Forbidden,
-                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif 400 <= err.code < 500:
-                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
-                # resources on the server.
+            elif err.code >= 400 and err.code < 500:
                 self.allow_all = True
-            elif 500 <= err.code < 600:
-                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
-                # complete disallow.
-                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
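Condensed to just the branch logic, the restored read() maps a robots.txt fetch error onto the parser flags as follows (a standalone sketch of the decision table, not the stdlib function itself):

```python
# Sketch of the restored error handling in RobotFileParser.read():
# which flag an HTTPError on robots.txt sets after this revert.
def classify(code):
    if code in (401, 403):
        return "disallow_all"        # auth-protected: treat the site as closed
    elif code >= 400 and code < 500:
        return "allow_all"           # other client errors: no robots.txt
    return None                      # 5xx: neither flag is set

assert classify(401) == "disallow_all"
assert classify(403) == "disallow_all"
assert classify(404) == "allow_all"
assert classify(418) == "allow_all"
assert classify(503) is None
```

Note that the 5xx case is not a free pass: with neither flag set and no file parsed, can_fetch() still answers False, because the parser treats an un-fetched robots.txt conservatively until a read succeeds.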

Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst

Lines changed: 0 additions & 2 deletions
This file was deleted.
