PageRenderTime 55ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/tests/test_utils_sitemap.py

https://gitlab.com/oytunistrator/scrapy
Python | 215 lines | 212 code | 3 blank | 0 comment | 0 complexity | b44408b50aeff4a6367e04f7773277bd MD5 | raw file
  1. import unittest
  2. from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
  3. class SitemapTest(unittest.TestCase):
  4. def test_sitemap(self):
  5. s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
  6. <urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  7. <url>
  8. <loc>http://www.example.com/</loc>
  9. <lastmod>2009-08-16</lastmod>
  10. <changefreq>daily</changefreq>
  11. <priority>1</priority>
  12. </url>
  13. <url>
  14. <loc>http://www.example.com/Special-Offers.html</loc>
  15. <lastmod>2009-08-16</lastmod>
  16. <changefreq>weekly</changefreq>
  17. <priority>0.8</priority>
  18. </url>
  19. </urlset>""")
  20. assert s.type == 'urlset'
  21. self.assertEqual(list(s),
  22. [{'priority': '1', 'loc': 'http://www.example.com/', 'lastmod': '2009-08-16', 'changefreq': 'daily'}, {'priority': '0.8', 'loc': 'http://www.example.com/Special-Offers.html', 'lastmod': '2009-08-16', 'changefreq': 'weekly'}])
  23. def test_sitemap_index(self):
  24. s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
  25. <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  26. <sitemap>
  27. <loc>http://www.example.com/sitemap1.xml.gz</loc>
  28. <lastmod>2004-10-01T18:23:17+00:00</lastmod>
  29. </sitemap>
  30. <sitemap>
  31. <loc>http://www.example.com/sitemap2.xml.gz</loc>
  32. <lastmod>2005-01-01</lastmod>
  33. </sitemap>
  34. </sitemapindex>""")
  35. assert s.type == 'sitemapindex'
  36. self.assertEqual(list(s), [{'loc': 'http://www.example.com/sitemap1.xml.gz', 'lastmod': '2004-10-01T18:23:17+00:00'}, {'loc': 'http://www.example.com/sitemap2.xml.gz', 'lastmod': '2005-01-01'}])
  37. def test_sitemap_strip(self):
  38. """Assert we can deal with trailing spaces inside <loc> tags - we've
  39. seen those
  40. """
  41. s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
  42. <urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  43. <url>
  44. <loc> http://www.example.com/</loc>
  45. <lastmod>2009-08-16</lastmod>
  46. <changefreq>daily</changefreq>
  47. <priority>1</priority>
  48. </url>
  49. <url>
  50. <loc> http://www.example.com/2</loc>
  51. <lastmod />
  52. </url>
  53. </urlset>
  54. """)
  55. self.assertEqual(list(s),
  56. [{'priority': '1', 'loc': 'http://www.example.com/', 'lastmod': '2009-08-16', 'changefreq': 'daily'},
  57. {'loc': 'http://www.example.com/2', 'lastmod': ''},
  58. ])
  59. def test_sitemap_wrong_ns(self):
  60. """We have seen sitemaps with wrongs ns. Presumably, Google still works
  61. with these, though is not 100% confirmed"""
  62. s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
  63. <urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  64. <url xmlns="">
  65. <loc> http://www.example.com/</loc>
  66. <lastmod>2009-08-16</lastmod>
  67. <changefreq>daily</changefreq>
  68. <priority>1</priority>
  69. </url>
  70. <url xmlns="">
  71. <loc> http://www.example.com/2</loc>
  72. <lastmod />
  73. </url>
  74. </urlset>
  75. """)
  76. self.assertEqual(list(s),
  77. [{'priority': '1', 'loc': 'http://www.example.com/', 'lastmod': '2009-08-16', 'changefreq': 'daily'},
  78. {'loc': 'http://www.example.com/2', 'lastmod': ''},
  79. ])
  80. def test_sitemap_wrong_ns2(self):
  81. """We have seen sitemaps with wrongs ns. Presumably, Google still works
  82. with these, though is not 100% confirmed"""
  83. s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
  84. <urlset>
  85. <url xmlns="">
  86. <loc> http://www.example.com/</loc>
  87. <lastmod>2009-08-16</lastmod>
  88. <changefreq>daily</changefreq>
  89. <priority>1</priority>
  90. </url>
  91. <url xmlns="">
  92. <loc> http://www.example.com/2</loc>
  93. <lastmod />
  94. </url>
  95. </urlset>
  96. """)
  97. assert s.type == 'urlset'
  98. self.assertEqual(list(s),
  99. [{'priority': '1', 'loc': 'http://www.example.com/', 'lastmod': '2009-08-16', 'changefreq': 'daily'},
  100. {'loc': 'http://www.example.com/2', 'lastmod': ''},
  101. ])
  102. def test_sitemap_urls_from_robots(self):
  103. robots = """User-agent: *
  104. Disallow: /aff/
  105. Disallow: /wl/
  106. # Search and shopping refining
  107. Disallow: /s*/*facet
  108. Disallow: /s*/*tags
  109. # Sitemap files
  110. Sitemap: http://example.com/sitemap.xml
  111. Sitemap: http://example.com/sitemap-product-index.xml
  112. # Forums
  113. Disallow: /forum/search/
  114. Disallow: /forum/active/
  115. """
  116. self.assertEqual(list(sitemap_urls_from_robots(robots)),
  117. ['http://example.com/sitemap.xml', 'http://example.com/sitemap-product-index.xml'])
  118. def test_sitemap_blanklines(self):
  119. """Assert we can deal with starting blank lines before <xml> tag"""
  120. s = Sitemap(b"""\
  121. <?xml version="1.0" encoding="UTF-8"?>
  122. <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  123. <!-- cache: cached = yes name = sitemap_jspCache key = sitemap -->
  124. <sitemap>
  125. <loc>http://www.example.com/sitemap1.xml</loc>
  126. <lastmod>2013-07-15</lastmod>
  127. </sitemap>
  128. <sitemap>
  129. <loc>http://www.example.com/sitemap2.xml</loc>
  130. <lastmod>2013-07-15</lastmod>
  131. </sitemap>
  132. <sitemap>
  133. <loc>http://www.example.com/sitemap3.xml</loc>
  134. <lastmod>2013-07-15</lastmod>
  135. </sitemap>
  136. <!-- end cache -->
  137. </sitemapindex>
  138. """)
  139. self.assertEqual(list(s), [
  140. {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap1.xml'},
  141. {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap2.xml'},
  142. {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap3.xml'},
  143. ])
  144. def test_comment(self):
  145. s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
  146. <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
  147. xmlns:xhtml="http://www.w3.org/1999/xhtml">
  148. <url>
  149. <loc>http://www.example.com/</loc>
  150. <!-- this is a comment on which the parser might raise an exception if implemented incorrectly -->
  151. </url>
  152. </urlset>""")
  153. self.assertEqual(list(s), [
  154. {'loc': 'http://www.example.com/'}
  155. ])
  156. def test_alternate(self):
  157. s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
  158. <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
  159. xmlns:xhtml="http://www.w3.org/1999/xhtml">
  160. <url>
  161. <loc>http://www.example.com/english/</loc>
  162. <xhtml:link rel="alternate" hreflang="de"
  163. href="http://www.example.com/deutsch/"/>
  164. <xhtml:link rel="alternate" hreflang="de-ch"
  165. href="http://www.example.com/schweiz-deutsch/"/>
  166. <xhtml:link rel="alternate" hreflang="en"
  167. href="http://www.example.com/english/"/>
  168. <xhtml:link rel="alternate" hreflang="en"/><!-- wrong tag without href -->
  169. </url>
  170. </urlset>""")
  171. self.assertEqual(list(s), [
  172. {'loc': 'http://www.example.com/english/',
  173. 'alternate': ['http://www.example.com/deutsch/', 'http://www.example.com/schweiz-deutsch/', 'http://www.example.com/english/']
  174. }
  175. ])
  176. def test_xml_entity_expansion(self):
  177. s = Sitemap(b"""<?xml version="1.0" encoding="utf-8"?>
  178. <!DOCTYPE foo [
  179. <!ELEMENT foo ANY >
  180. <!ENTITY xxe SYSTEM "file:///etc/passwd" >
  181. ]>
  182. <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  183. <url>
  184. <loc>http://127.0.0.1:8000/&xxe;</loc>
  185. </url>
  186. </urlset>
  187. """)
  188. self.assertEqual(list(s), [{'loc': 'http://127.0.0.1:8000/'}])
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()