
/Lib/test/test_robotparser.py

https://gitlab.com/unofficial-mirrors/cpython
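For orientation, a minimal sketch of the urllib.robotparser API this suite exercises (an illustration only, not part of the file below; the agent name and paths are invented):

    import io
    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(io.StringIO("User-agent: *\nDisallow: /private/").readlines())
    parser.can_fetch("example-bot", "/private/page.html")  # -> False
    parser.can_fetch("example-bot", "/public/page.html")   # -> True
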
import io
import os
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # these are not actually tested, but we still need to parse it
    # in order to accommodate the input parameters
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent should be correct. note
    # that this file is incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


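# Entries are matched in file order and by substring, so for the agent
# "Googlebot-Mobile" the "Googlebot" entry above matches first and
# everything is disallowed; the test below relies on that behavior.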
class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):
    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()
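
To run the suite from a CPython checkout: python -m test test_robotparser. The NetworkTestCase tests require the network resource (python -m test -u network test_robotparser) and are skipped otherwise; PasswordProtectedSiteTestCase is skipped when threading is unavailable.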