
/tests/test_spider.py

https://gitlab.com/e0/scrapy
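# Unit tests for scrapy's spider classes: the base Spider plus InitSpider,
# XMLFeedSpider, CSVFeedSpider, CrawlSpider and SitemapSpider, and the
# deprecated BaseSpider alias.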
import gzip
import inspect
import re
import warnings
from io import BytesIO

from testfixtures import LogCapture
from twisted.trial import unittest

from scrapy import signals
from scrapy.settings import Settings
from scrapy.http import Request, Response, TextResponse, XmlResponse, HtmlResponse
from scrapy.spiders.init import InitSpider
from scrapy.spiders import Spider, BaseSpider, CrawlSpider, Rule, XMLFeedSpider, \
    CSVFeedSpider, SitemapSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.trackref import object_ref
from scrapy.utils.test import get_crawler
from tests import mock
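
# SpiderTest exercises the base Spider API; the subclasses further down
# rerun the same cases against each spider subclass by overriding
# spider_class.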
class SpiderTest(unittest.TestCase):

    spider_class = Spider

    def setUp(self):
        warnings.simplefilter("always")

    def tearDown(self):
        warnings.resetwarnings()

    def test_base_spider(self):
        spider = self.spider_class("example.com")
        self.assertEqual(spider.name, 'example.com')
        self.assertEqual(spider.start_urls, [])

    def test_start_requests(self):
        spider = self.spider_class('example.com')
        start_requests = spider.start_requests()
        self.assertTrue(inspect.isgenerator(start_requests))
        self.assertEqual(list(start_requests), [])

    def test_spider_args(self):
        """Constructor arguments are assigned to spider attributes"""
        spider = self.spider_class('example.com', foo='bar')
        self.assertEqual(spider.foo, 'bar')

    def test_spider_without_name(self):
        """Constructing a spider without a name raises ValueError"""
        self.assertRaises(ValueError, self.spider_class)
        self.assertRaises(ValueError, self.spider_class, somearg='foo')

    def test_deprecated_set_crawler_method(self):
        spider = self.spider_class('example.com')
        crawler = get_crawler()
        with warnings.catch_warnings(record=True) as w:
            spider.set_crawler(crawler)
            self.assertIn("set_crawler", str(w[0].message))
        self.assertTrue(hasattr(spider, 'crawler'))
        self.assertIs(spider.crawler, crawler)
        self.assertTrue(hasattr(spider, 'settings'))
        self.assertIs(spider.settings, crawler.settings)
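
    # from_crawler() is the supported replacement for the deprecated
    # set_crawler() above; both must leave the spider bound to the crawler
    # and its settings.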
    def test_from_crawler_crawler_and_settings_population(self):
        crawler = get_crawler()
        spider = self.spider_class.from_crawler(crawler, 'example.com')
        self.assertTrue(hasattr(spider, 'crawler'))
        self.assertIs(spider.crawler, crawler)
        self.assertTrue(hasattr(spider, 'settings'))
        self.assertIs(spider.settings, crawler.settings)

    def test_from_crawler_init_call(self):
        with mock.patch.object(self.spider_class, '__init__',
                               return_value=None) as mock_init:
            self.spider_class.from_crawler(get_crawler(), 'example.com',
                                           foo='bar')
            mock_init.assert_called_once_with('example.com', foo='bar')

    def test_closed_signal_call(self):
        class TestSpider(self.spider_class):
            closed_called = False

            def closed(self, reason):
                self.closed_called = True

        crawler = get_crawler()
        spider = TestSpider.from_crawler(crawler, 'example.com')
        crawler.signals.send_catch_log(signal=signals.spider_opened,
                                       spider=spider)
        crawler.signals.send_catch_log(signal=signals.spider_closed,
                                       spider=spider, reason=None)
        self.assertTrue(spider.closed_called)

    def test_update_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}
        self.spider_class.custom_settings = spider_settings
        settings = Settings(project_settings, priority='project')
        self.spider_class.update_settings(settings)
        self.assertEqual(settings.get('TEST1'), 'spider')
        self.assertEqual(settings.get('TEST2'), 'spider')
        self.assertEqual(settings.get('TEST3'), 'project')
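
    # custom_settings are merged at 'spider' priority, which outranks the
    # 'project' priority used above, so TEST1 resolves to the spider value.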
    def test_logger(self):
        spider = self.spider_class('example.com')
        with LogCapture() as l:
            spider.logger.info('test log msg')
            l.check(('example.com', 'INFO', 'test log msg'))
        record = l.records[0]
        self.assertIn('spider', record.__dict__)
        self.assertIs(record.spider, spider)

    def test_log(self):
        spider = self.spider_class('example.com')
        with mock.patch('scrapy.spiders.Spider.logger') as mock_logger:
            spider.log('test log msg', 'INFO')
        mock_logger.log.assert_called_once_with('INFO', 'test log msg')
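
# Each subclass below inherits every SpiderTest case and reruns it with a
# different spider_class.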
class InitSpiderTest(SpiderTest):

    spider_class = InitSpider


class XMLFeedSpiderTest(SpiderTest):

    spider_class = XMLFeedSpider

    def test_register_namespace(self):
        body = b"""<?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns:x="http://www.google.com/schemas/sitemap/0.84"
                xmlns:y="http://www.example.com/schemas/extras/1.0">
        <url><x:loc>http://www.example.com/Special-Offers.html</x:loc><y:updated>2009-08-16</y:updated><other value="bar" y:custom="fuu"/></url>
        <url><loc>http://www.example.com/</loc><y:updated>2009-08-16</y:updated><other value="foo"/></url>
        </urlset>"""
        response = XmlResponse(url='http://example.com/sitemap.xml', body=body)

        class _XMLSpider(self.spider_class):
            itertag = 'url'
            namespaces = (
                ('a', 'http://www.google.com/schemas/sitemap/0.84'),
                ('b', 'http://www.example.com/schemas/extras/1.0'),
            )

            def parse_node(self, response, selector):
                yield {
                    'loc': selector.xpath('a:loc/text()').extract(),
                    'updated': selector.xpath('b:updated/text()').extract(),
                    'other': selector.xpath('other/@value').extract(),
                    'custom': selector.xpath('other/@b:custom').extract(),
                }

        for iterator in ('iternodes', 'xml'):
            spider = _XMLSpider('example', iterator=iterator)
            output = list(spider.parse(response))
            self.assertEqual(len(output), 2, iterator)
            self.assertEqual(output, [
                {'loc': [u'http://www.example.com/Special-Offers.html'],
                 'updated': [u'2009-08-16'],
                 'custom': [u'fuu'],
                 'other': [u'bar']},
                {'loc': [],
                 'updated': [u'2009-08-16'],
                 'other': [u'foo'],
                 'custom': []},
            ], iterator)
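
# CSVFeedSpider only reruns the shared base cases; CrawlSpiderTest adds
# link-extraction tests on top of them.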
class CSVFeedSpiderTest(SpiderTest):

    spider_class = CSVFeedSpider


class CrawlSpiderTest(SpiderTest):

    test_body = b"""<html><head><title>Page title</title>
    <body>
    <p><a href="item/12.html">Item 12</a></p>
    <div class='links'>
    <p><a href="/about.html">About us</a></p>
    </div>
    <div>
    <p><a href="/nofollow.html">This shouldn't be followed</a></p>
    </div>
    </body></html>"""

    spider_class = CrawlSpider

    def test_process_links(self):
        response = HtmlResponse("http://example.org/somepage/index.html",
                                body=self.test_body)

        class _CrawlSpider(self.spider_class):
            name = "test"
            allowed_domains = ['example.org']
            rules = (
                Rule(LinkExtractor(), process_links="dummy_process_links"),
            )

            def dummy_process_links(self, links):
                return links

        spider = _CrawlSpider()
        output = list(spider._requests_to_follow(response))
        self.assertEqual(len(output), 3)
        self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
        self.assertEqual([r.url for r in output],
                         ['http://example.org/somepage/item/12.html',
                          'http://example.org/about.html',
                          'http://example.org/nofollow.html'])

    def test_process_links_filter(self):
        response = HtmlResponse("http://example.org/somepage/index.html",
                                body=self.test_body)

        class _CrawlSpider(self.spider_class):
            name = "test"
            allowed_domains = ['example.org']
            rules = (
                Rule(LinkExtractor(), process_links="filter_process_links"),
            )
            _test_regex = re.compile('nofollow')

            def filter_process_links(self, links):
                return [link for link in links
                        if not self._test_regex.search(link.url)]

        spider = _CrawlSpider()
        output = list(spider._requests_to_follow(response))
        self.assertEqual(len(output), 2)
        self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
        self.assertEqual([r.url for r in output],
                         ['http://example.org/somepage/item/12.html',
                          'http://example.org/about.html'])

    def test_process_links_generator(self):
        response = HtmlResponse("http://example.org/somepage/index.html",
                                body=self.test_body)

        class _CrawlSpider(self.spider_class):
            name = "test"
            allowed_domains = ['example.org']
            rules = (
                Rule(LinkExtractor(), process_links="dummy_process_links"),
            )

            def dummy_process_links(self, links):
                for link in links:
                    yield link

        spider = _CrawlSpider()
        output = list(spider._requests_to_follow(response))
        self.assertEqual(len(output), 3)
        self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
        self.assertEqual([r.url for r in output],
                         ['http://example.org/somepage/item/12.html',
                          'http://example.org/about.html',
                          'http://example.org/nofollow.html'])
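
    # The _follow_links flag is derived from the CRAWLSPIDER_FOLLOW_LINKS
    # setting, both via from_crawler() and via the deprecated set_crawler().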
    def test_follow_links_attribute_population(self):
        crawler = get_crawler()
        spider = self.spider_class.from_crawler(crawler, 'example.com')
        self.assertTrue(hasattr(spider, '_follow_links'))
        self.assertTrue(spider._follow_links)

        settings_dict = {'CRAWLSPIDER_FOLLOW_LINKS': False}
        crawler = get_crawler(settings_dict=settings_dict)
        spider = self.spider_class.from_crawler(crawler, 'example.com')
        self.assertTrue(hasattr(spider, '_follow_links'))
        self.assertFalse(spider._follow_links)

    def test_follow_links_attribute_deprecated_population(self):
        spider = self.spider_class('example.com')
        self.assertFalse(hasattr(spider, '_follow_links'))

        spider.set_crawler(get_crawler())
        self.assertTrue(hasattr(spider, '_follow_links'))
        self.assertTrue(spider._follow_links)

        spider = self.spider_class('example.com')
        settings_dict = {'CRAWLSPIDER_FOLLOW_LINKS': False}
        spider.set_crawler(get_crawler(settings_dict=settings_dict))
        self.assertTrue(hasattr(spider, '_follow_links'))
        self.assertFalse(spider._follow_links)
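
# SitemapSpiderTest compresses BODY once at class-definition time so the
# gzip decompression paths can be exercised below.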
class SitemapSpiderTest(SpiderTest):

    spider_class = SitemapSpider

    BODY = b"SITEMAP"
    f = BytesIO()
    g = gzip.GzipFile(fileobj=f, mode='w+b')
    g.write(BODY)
    g.close()
    GZBODY = f.getvalue()

    def assertSitemapBody(self, response, body):
        spider = self.spider_class("example.com")
        self.assertEqual(spider._get_sitemap_body(response), body)

    def test_get_sitemap_body(self):
        r = XmlResponse(url="http://www.example.com/", body=self.BODY)
        self.assertSitemapBody(r, self.BODY)

        r = HtmlResponse(url="http://www.example.com/", body=self.BODY)
        self.assertSitemapBody(r, None)

        r = Response(url="http://www.example.com/favicon.ico", body=self.BODY)
        self.assertSitemapBody(r, None)
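
    # Compressed sitemaps are recognised either by the Content-Type header
    # or by a .gz URL extension; both paths are covered below.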
    def test_get_sitemap_body_gzip_headers(self):
        r = Response(url="http://www.example.com/sitemap", body=self.GZBODY,
                     headers={"content-type": "application/gzip"})
        self.assertSitemapBody(r, self.BODY)

    def test_get_sitemap_body_xml_url(self):
        r = TextResponse(url="http://www.example.com/sitemap.xml", body=self.BODY)
        self.assertSitemapBody(r, self.BODY)

    def test_get_sitemap_body_xml_url_compressed(self):
        r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
        self.assertSitemapBody(r, self.BODY)

    def test_get_sitemap_urls_from_robotstxt(self):
        robots = b"""# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
"""
        r = TextResponse(url="http://www.example.com/robots.txt", body=robots)
        spider = self.spider_class("example.com")
        self.assertEqual([req.url for req in spider._parse_sitemap(r)],
                         ['http://example.com/sitemap.xml',
                          'http://example.com/sitemap-product-index.xml'])
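
# BaseSpider is a deprecated alias for Spider: instantiating a direct
# subclass must warn once, while issubclass/isinstance checks keep working.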
class BaseSpiderDeprecationTest(unittest.TestCase):

    def test_basespider_is_deprecated(self):
        with warnings.catch_warnings(record=True) as w:
            class MySpider1(BaseSpider):
                pass
        self.assertEqual(len(w), 1)
        self.assertEqual(w[0].category, ScrapyDeprecationWarning)
        self.assertEqual(w[0].lineno, inspect.getsourcelines(MySpider1)[1])

    def test_basespider_issubclass(self):
        class MySpider2(Spider):
            pass

        class MySpider2a(MySpider2):
            pass

        class Foo(object):
            pass

        class Foo2(object_ref):
            pass

        assert issubclass(MySpider2, BaseSpider)
        assert issubclass(MySpider2a, BaseSpider)
        assert not issubclass(Foo, BaseSpider)
        assert not issubclass(Foo2, BaseSpider)

    def test_basespider_isinstance(self):
        class MySpider3(Spider):
            name = 'myspider3'

        class MySpider3a(MySpider3):
            pass

        class Foo(object):
            pass

        class Foo2(object_ref):
            pass

        assert isinstance(MySpider3(), BaseSpider)
        assert isinstance(MySpider3a(), BaseSpider)
        assert not isinstance(Foo(), BaseSpider)
        assert not isinstance(Foo2(), BaseSpider)

    def test_crawl_spider(self):
        assert issubclass(CrawlSpider, Spider)
        assert issubclass(CrawlSpider, BaseSpider)
        assert isinstance(CrawlSpider(name='foo'), Spider)
        assert isinstance(CrawlSpider(name='foo'), BaseSpider)


if __name__ == '__main__':
    unittest.main()