PageRenderTime 52ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/lib-python/2.7/test/test_sgmllib.py

https://bitbucket.org/pwaller/pypy
Python | 439 lines | 387 code | 38 blank | 14 comment | 11 complexity | 0b06cb621349ca24ebce3d85cee0d3d3 MD5 | raw file
  1. import pprint
  2. import re
  3. import unittest
  4. from test import test_support
  5. sgmllib = test_support.import_module('sgmllib', deprecated=True)
  6. class EventCollector(sgmllib.SGMLParser):
  7. def __init__(self):
  8. self.events = []
  9. self.append = self.events.append
  10. sgmllib.SGMLParser.__init__(self)
  11. def get_events(self):
  12. # Normalize the list of events so that buffer artefacts don't
  13. # separate runs of contiguous characters.
  14. L = []
  15. prevtype = None
  16. for event in self.events:
  17. type = event[0]
  18. if type == prevtype == "data":
  19. L[-1] = ("data", L[-1][1] + event[1])
  20. else:
  21. L.append(event)
  22. prevtype = type
  23. self.events = L
  24. return L
  25. # structure markup
  26. def unknown_starttag(self, tag, attrs):
  27. self.append(("starttag", tag, attrs))
  28. def unknown_endtag(self, tag):
  29. self.append(("endtag", tag))
  30. # all other markup
  31. def handle_comment(self, data):
  32. self.append(("comment", data))
  33. def handle_charref(self, data):
  34. self.append(("charref", data))
  35. def handle_data(self, data):
  36. self.append(("data", data))
  37. def handle_decl(self, decl):
  38. self.append(("decl", decl))
  39. def handle_entityref(self, data):
  40. self.append(("entityref", data))
  41. def handle_pi(self, data):
  42. self.append(("pi", data))
  43. def unknown_decl(self, decl):
  44. self.append(("unknown decl", decl))
  45. class CDATAEventCollector(EventCollector):
  46. def start_cdata(self, attrs):
  47. self.append(("starttag", "cdata", attrs))
  48. self.setliteral()
  49. class HTMLEntityCollector(EventCollector):
  50. entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
  51. '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
  52. def convert_charref(self, name):
  53. self.append(("charref", "convert", name))
  54. if name[0] != "x":
  55. return EventCollector.convert_charref(self, name)
  56. def convert_codepoint(self, codepoint):
  57. self.append(("codepoint", "convert", codepoint))
  58. EventCollector.convert_codepoint(self, codepoint)
  59. def convert_entityref(self, name):
  60. self.append(("entityref", "convert", name))
  61. return EventCollector.convert_entityref(self, name)
  62. # These to record that they were called, then pass the call along
  63. # to the default implementation so that it's actions can be
  64. # recorded.
  65. def handle_charref(self, data):
  66. self.append(("charref", data))
  67. sgmllib.SGMLParser.handle_charref(self, data)
  68. def handle_entityref(self, data):
  69. self.append(("entityref", data))
  70. sgmllib.SGMLParser.handle_entityref(self, data)
  71. class SGMLParserTestCase(unittest.TestCase):
  72. collector = EventCollector
  73. def get_events(self, source):
  74. parser = self.collector()
  75. try:
  76. for s in source:
  77. parser.feed(s)
  78. parser.close()
  79. except:
  80. #self.events = parser.events
  81. raise
  82. return parser.get_events()
  83. def check_events(self, source, expected_events):
  84. try:
  85. events = self.get_events(source)
  86. except:
  87. #import sys
  88. #print >>sys.stderr, pprint.pformat(self.events)
  89. raise
  90. if events != expected_events:
  91. self.fail("received events did not match expected events\n"
  92. "Expected:\n" + pprint.pformat(expected_events) +
  93. "\nReceived:\n" + pprint.pformat(events))
  94. def check_parse_error(self, source):
  95. parser = EventCollector()
  96. try:
  97. parser.feed(source)
  98. parser.close()
  99. except sgmllib.SGMLParseError:
  100. pass
  101. else:
  102. self.fail("expected SGMLParseError for %r\nReceived:\n%s"
  103. % (source, pprint.pformat(parser.get_events())))
  104. def test_doctype_decl_internal(self):
  105. inside = """\
  106. DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
  107. SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
  108. <!ELEMENT html - O EMPTY>
  109. <!ATTLIST html
  110. version CDATA #IMPLIED
  111. profile CDATA 'DublinCore'>
  112. <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  113. <!ENTITY myEntity 'internal parsed entity'>
  114. <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  115. <!ENTITY % paramEntity 'name|name|name'>
  116. %paramEntity;
  117. <!-- comment -->
  118. ]"""
  119. self.check_events(["<!%s>" % inside], [
  120. ("decl", inside),
  121. ])
  122. def test_doctype_decl_external(self):
  123. inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
  124. self.check_events("<!%s>" % inside, [
  125. ("decl", inside),
  126. ])
  127. def test_underscore_in_attrname(self):
  128. # SF bug #436621
  129. """Make sure attribute names with underscores are accepted"""
  130. self.check_events("<a has_under _under>", [
  131. ("starttag", "a", [("has_under", "has_under"),
  132. ("_under", "_under")]),
  133. ])
  134. def test_underscore_in_tagname(self):
  135. # SF bug #436621
  136. """Make sure tag names with underscores are accepted"""
  137. self.check_events("<has_under></has_under>", [
  138. ("starttag", "has_under", []),
  139. ("endtag", "has_under"),
  140. ])
  141. def test_quotes_in_unquoted_attrs(self):
  142. # SF bug #436621
  143. """Be sure quotes in unquoted attributes are made part of the value"""
  144. self.check_events("<a href=foo'bar\"baz>", [
  145. ("starttag", "a", [("href", "foo'bar\"baz")]),
  146. ])
  147. def test_xhtml_empty_tag(self):
  148. """Handling of XHTML-style empty start tags"""
  149. self.check_events("<br />text<i></i>", [
  150. ("starttag", "br", []),
  151. ("data", "text"),
  152. ("starttag", "i", []),
  153. ("endtag", "i"),
  154. ])
  155. def test_processing_instruction_only(self):
  156. self.check_events("<?processing instruction>", [
  157. ("pi", "processing instruction"),
  158. ])
  159. def test_bad_nesting(self):
  160. self.check_events("<a><b></a></b>", [
  161. ("starttag", "a", []),
  162. ("starttag", "b", []),
  163. ("endtag", "a"),
  164. ("endtag", "b"),
  165. ])
  166. def test_bare_ampersands(self):
  167. self.check_events("this text & contains & ampersands &", [
  168. ("data", "this text & contains & ampersands &"),
  169. ])
  170. def test_bare_pointy_brackets(self):
  171. self.check_events("this < text > contains < bare>pointy< brackets", [
  172. ("data", "this < text > contains < bare>pointy< brackets"),
  173. ])
  174. def test_attr_syntax(self):
  175. output = [
  176. ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
  177. ]
  178. self.check_events("""<a b='v' c="v" d=v e>""", output)
  179. self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
  180. self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
  181. self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
  182. def test_attr_values(self):
  183. self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
  184. [("starttag", "a", [("b", "xxx\n\txxx"),
  185. ("c", "yyy\t\nyyy"),
  186. ("d", "\txyz\n")])
  187. ])
  188. self.check_events("""<a b='' c="">""", [
  189. ("starttag", "a", [("b", ""), ("c", "")]),
  190. ])
  191. # URL construction stuff from RFC 1808:
  192. safe = "$-_.+"
  193. extra = "!*'(),"
  194. reserved = ";/?:@&="
  195. url = "http://example.com:8080/path/to/file?%s%s%s" % (
  196. safe, extra, reserved)
  197. self.check_events("""<e a=%s>""" % url, [
  198. ("starttag", "e", [("a", url)]),
  199. ])
  200. # Regression test for SF patch #669683.
  201. self.check_events("<e a=rgb(1,2,3)>", [
  202. ("starttag", "e", [("a", "rgb(1,2,3)")]),
  203. ])
  204. def test_attr_values_entities(self):
  205. """Substitution of entities and charrefs in attribute values"""
  206. # SF bug #1452246
  207. self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
  208. f="&xxx;" g='&#32;&#33;' h='&#500;'
  209. i='x?a=b&c=d;'
  210. j='&amp;#42;' k='&#38;#42;'>""",
  211. [("starttag", "a", [("b", "<"),
  212. ("c", "<>"),
  213. ("d", "&lt->"),
  214. ("e", "< "),
  215. ("f", "&xxx;"),
  216. ("g", " !"),
  217. ("h", "&#500;"),
  218. ("i", "x?a=b&c=d;"),
  219. ("j", "&#42;"),
  220. ("k", "&#42;"),
  221. ])])
  222. def test_convert_overrides(self):
  223. # This checks that the character and entity reference
  224. # conversion helpers are called at the documented times. No
  225. # attempt is made to really change what the parser accepts.
  226. #
  227. self.collector = HTMLEntityCollector
  228. self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
  229. '&foobar;&#42;'), [
  230. ('entityref', 'convert', 'ldquo'),
  231. ('charref', 'convert', 'x201d'),
  232. ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
  233. ('data', 'foo'),
  234. ('endtag', 'a'),
  235. ('entityref', 'foobar'),
  236. ('entityref', 'convert', 'foobar'),
  237. ('charref', '42'),
  238. ('charref', 'convert', '42'),
  239. ('codepoint', 'convert', 42),
  240. ])
  241. def test_attr_funky_names(self):
  242. self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
  243. ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
  244. ])
  245. def test_attr_value_ip6_url(self):
  246. # http://www.python.org/sf/853506
  247. self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
  248. "<a href=http://[1080::8:800:200C:417A]/>"), [
  249. ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
  250. ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
  251. ])
  252. def test_weird_starttags(self):
  253. self.check_events("<a<a>", [
  254. ("starttag", "a", []),
  255. ("starttag", "a", []),
  256. ])
  257. self.check_events("</a<a>", [
  258. ("endtag", "a"),
  259. ("starttag", "a", []),
  260. ])
  261. def test_declaration_junk_chars(self):
  262. self.check_parse_error("<!DOCTYPE foo $ >")
  263. def test_get_starttag_text(self):
  264. s = """<foobar \n one="1"\ttwo=2 >"""
  265. self.check_events(s, [
  266. ("starttag", "foobar", [("one", "1"), ("two", "2")]),
  267. ])
  268. def test_cdata_content(self):
  269. s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
  270. "<notcdata> <!-- comment --> </notcdata>")
  271. self.collector = CDATAEventCollector
  272. self.check_events(s, [
  273. ("starttag", "cdata", []),
  274. ("data", " <!-- not a comment --> &not-an-entity-ref; "),
  275. ("endtag", "cdata"),
  276. ("starttag", "notcdata", []),
  277. ("data", " "),
  278. ("comment", " comment "),
  279. ("data", " "),
  280. ("endtag", "notcdata"),
  281. ])
  282. s = """<cdata> <not a='start tag'> </cdata>"""
  283. self.check_events(s, [
  284. ("starttag", "cdata", []),
  285. ("data", " <not a='start tag'> "),
  286. ("endtag", "cdata"),
  287. ])
  288. def test_illegal_declarations(self):
  289. s = 'abc<!spacer type="block" height="25">def'
  290. self.check_events(s, [
  291. ("data", "abc"),
  292. ("unknown decl", 'spacer type="block" height="25"'),
  293. ("data", "def"),
  294. ])
  295. def test_enumerated_attr_type(self):
  296. s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
  297. self.check_events(s, [
  298. ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
  299. ])
  300. def test_read_chunks(self):
  301. # SF bug #1541697, this caused sgml parser to hang
  302. # Just verify this code doesn't cause a hang.
  303. CHUNK = 1024 # increasing this to 8212 makes the problem go away
  304. f = open(test_support.findfile('sgml_input.html'))
  305. fp = sgmllib.SGMLParser()
  306. while 1:
  307. data = f.read(CHUNK)
  308. fp.feed(data)
  309. if len(data) != CHUNK:
  310. break
  311. def test_only_decode_ascii(self):
  312. # SF bug #1651995, make sure non-ascii character references are not decoded
  313. s = '<signs exclamation="&#33" copyright="&#169" quoteleft="&#8216;">'
  314. self.check_events(s, [
  315. ('starttag', 'signs',
  316. [('exclamation', '!'), ('copyright', '&#169'),
  317. ('quoteleft', '&#8216;')]),
  318. ])
  319. # XXX These tests have been disabled by prefixing their names with
  320. # an underscore. The first two exercise outstanding bugs in the
  321. # sgmllib module, and the third exhibits questionable behavior
  322. # that needs to be carefully considered before changing it.
  323. def _test_starttag_end_boundary(self):
  324. self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
  325. self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])
  326. def _test_buffer_artefacts(self):
  327. output = [("starttag", "a", [("b", "<")])]
  328. self.check_events(["<a b='<'>"], output)
  329. self.check_events(["<a ", "b='<'>"], output)
  330. self.check_events(["<a b", "='<'>"], output)
  331. self.check_events(["<a b=", "'<'>"], output)
  332. self.check_events(["<a b='<", "'>"], output)
  333. self.check_events(["<a b='<'", ">"], output)
  334. output = [("starttag", "a", [("b", ">")])]
  335. self.check_events(["<a b='>'>"], output)
  336. self.check_events(["<a ", "b='>'>"], output)
  337. self.check_events(["<a b", "='>'>"], output)
  338. self.check_events(["<a b=", "'>'>"], output)
  339. self.check_events(["<a b='>", "'>"], output)
  340. self.check_events(["<a b='>'", ">"], output)
  341. output = [("comment", "abc")]
  342. self.check_events(["", "<!--abc-->"], output)
  343. self.check_events(["<", "!--abc-->"], output)
  344. self.check_events(["<!", "--abc-->"], output)
  345. self.check_events(["<!-", "-abc-->"], output)
  346. self.check_events(["<!--", "abc-->"], output)
  347. self.check_events(["<!--a", "bc-->"], output)
  348. self.check_events(["<!--ab", "c-->"], output)
  349. self.check_events(["<!--abc", "-->"], output)
  350. self.check_events(["<!--abc-", "->"], output)
  351. self.check_events(["<!--abc--", ">"], output)
  352. self.check_events(["<!--abc-->", ""], output)
  353. def _test_starttag_junk_chars(self):
  354. self.check_parse_error("<")
  355. self.check_parse_error("<>")
  356. self.check_parse_error("</$>")
  357. self.check_parse_error("</")
  358. self.check_parse_error("</a")
  359. self.check_parse_error("<$")
  360. self.check_parse_error("<$>")
  361. self.check_parse_error("<!")
  362. self.check_parse_error("<a $>")
  363. self.check_parse_error("<a")
  364. self.check_parse_error("<a foo='bar'")
  365. self.check_parse_error("<a foo='bar")
  366. self.check_parse_error("<a foo='>'")
  367. self.check_parse_error("<a foo='>")
  368. self.check_parse_error("<a foo=>")
  369. def test_main():
  370. test_support.run_unittest(SGMLParserTestCase)
  371. if __name__ == "__main__":
  372. test_main()