# Lib/test/test_robotparser.py

import io
import os
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer

try:
    import threading
except ImportError:
    threading = None
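

# BaseRobotTest parses the class-level robots_txt string and checks
# RobotFileParser.can_fetch() for the configured agent: every URL in "good"
# is expected to be fetchable, every URL in "bad" to be disallowed.  An entry
# in either list may also be an (agent, url) tuple to override the agent.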
class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']
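

# BaseRequestRateTest additionally compares RobotFileParser.crawl_delay() and
# request_rate() against the crawl_delay/request_rate values declared by the
# subclass; request_rate() returns a named tuple with requests and seconds.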
class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # request_rate and crawl_delay are not checked for this agent; setting
    # them to None makes the inherited test_request_rate skip those asserts
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of the User-agent entries should be respected; note that
    # this robots.txt is ill-formed because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /
User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path
User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass
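

# A robots.txt URL that answers 403 Forbidden is treated as "disallow all";
# the handler above always returns 403, so can_fetch() must return False for
# every URL served by the local test server.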
@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)
        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))
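

# NetworkTestCase exercises the parser against the live robots.txt hosted at
# http://www.pythontest.net/elsewhere/robots.txt; it only runs when the
# "network" test resource is enabled.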
class NetworkTestCase(unittest.TestCase):
    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()