PageRenderTime 55ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/lib-python/2.7/HTMLParser.py

https://bitbucket.org/ltratt/pypy
Python | 475 lines | 422 code | 20 blank | 33 comment | 58 complexity | c34d903ae5efe2f115d655dde17b7651 MD5 | raw file
Possible License(s): Apache-2.0, AGPL-3.0, BSD-3-Clause
  1. """A parser for HTML and XHTML."""
  2. # This file is based on sgmllib.py, but the API is slightly different.
  3. # XXX There should be a way to distinguish between PCDATA (parsed
  4. # character data -- the normal case), RCDATA (replaceable character
  5. # data -- only char and entity references and end tags are special)
  6. # and CDATA (character data -- only end tags are special).
  7. import markupbase
  8. import re
  9. # Regular expressions used for parsing
  10. interesting_normal = re.compile('[&<]')
  11. incomplete = re.compile('&[a-zA-Z#]')
  12. entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  13. charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  14. starttagopen = re.compile('<[a-zA-Z]')
  15. piclose = re.compile('>')
  16. commentclose = re.compile(r'--\s*>')
  17. # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
  18. # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
  19. # note: if you change tagfind/attrfind remember to update locatestarttagend too
  20. tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
  21. # this regex is currently unused, but left for backward compatibility
  22. tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
  23. attrfind = re.compile(
  24. r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
  25. r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
  26. locatestarttagend = re.compile(r"""
  27. <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
  28. (?:[\s/]* # optional whitespace before attribute name
  29. (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
  30. (?:\s*=+\s* # value indicator
  31. (?:'[^']*' # LITA-enclosed value
  32. |"[^"]*" # LIT-enclosed value
  33. |(?!['"])[^>\s]* # bare value
  34. )
  35. )?(?:\s|/(?!>))*
  36. )*
  37. )?
  38. \s* # trailing whitespace
  39. """, re.VERBOSE)
  40. endendtag = re.compile('>')
  41. # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
  42. # </ and the tag name, so maybe this should be fixed
  43. endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  44. class HTMLParseError(Exception):
  45. """Exception raised for all parse errors."""
  46. def __init__(self, msg, position=(None, None)):
  47. assert msg
  48. self.msg = msg
  49. self.lineno = position[0]
  50. self.offset = position[1]
  51. def __str__(self):
  52. result = self.msg
  53. if self.lineno is not None:
  54. result = result + ", at line %d" % self.lineno
  55. if self.offset is not None:
  56. result = result + ", column %d" % (self.offset + 1)
  57. return result
  58. class HTMLParser(markupbase.ParserBase):
  59. """Find tags and other markup and call handler functions.
  60. Usage:
  61. p = HTMLParser()
  62. p.feed(data)
  63. ...
  64. p.close()
  65. Start tags are handled by calling self.handle_starttag() or
  66. self.handle_startendtag(); end tags by self.handle_endtag(). The
  67. data between tags is passed from the parser to the derived class
  68. by calling self.handle_data() with the data as argument (the data
  69. may be split up in arbitrary chunks). Entity references are
  70. passed by calling self.handle_entityref() with the entity
  71. reference as the argument. Numeric character references are
  72. passed to self.handle_charref() with the string containing the
  73. reference as the argument.
  74. """
  75. CDATA_CONTENT_ELEMENTS = ("script", "style")
  76. def __init__(self):
  77. """Initialize and reset this instance."""
  78. self.reset()
  79. def reset(self):
  80. """Reset this instance. Loses all unprocessed data."""
  81. self.rawdata = ''
  82. self.lasttag = '???'
  83. self.interesting = interesting_normal
  84. self.cdata_elem = None
  85. markupbase.ParserBase.reset(self)
  86. def feed(self, data):
  87. r"""Feed data to the parser.
  88. Call this as often as you want, with as little or as much text
  89. as you want (may include '\n').
  90. """
  91. self.rawdata = self.rawdata + data
  92. self.goahead(0)
  93. def close(self):
  94. """Handle any buffered data."""
  95. self.goahead(1)
  96. def error(self, message):
  97. raise HTMLParseError(message, self.getpos())
  98. __starttag_text = None
  99. def get_starttag_text(self):
  100. """Return full source of start tag: '<...>'."""
  101. return self.__starttag_text
  102. def set_cdata_mode(self, elem):
  103. self.cdata_elem = elem.lower()
  104. self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
  105. def clear_cdata_mode(self):
  106. self.interesting = interesting_normal
  107. self.cdata_elem = None
  108. # Internal -- handle data as far as reasonable. May leave state
  109. # and data to be processed by a subsequent call. If 'end' is
  110. # true, force handling all data as if followed by EOF marker.
  111. def goahead(self, end):
  112. rawdata = self.rawdata
  113. i = 0
  114. n = len(rawdata)
  115. while i < n:
  116. match = self.interesting.search(rawdata, i) # < or &
  117. if match:
  118. j = match.start()
  119. else:
  120. if self.cdata_elem:
  121. break
  122. j = n
  123. if i < j: self.handle_data(rawdata[i:j])
  124. i = self.updatepos(i, j)
  125. if i == n: break
  126. startswith = rawdata.startswith
  127. if startswith('<', i):
  128. if starttagopen.match(rawdata, i): # < + letter
  129. k = self.parse_starttag(i)
  130. elif startswith("</", i):
  131. k = self.parse_endtag(i)
  132. elif startswith("<!--", i):
  133. k = self.parse_comment(i)
  134. elif startswith("<?", i):
  135. k = self.parse_pi(i)
  136. elif startswith("<!", i):
  137. k = self.parse_html_declaration(i)
  138. elif (i + 1) < n:
  139. self.handle_data("<")
  140. k = i + 1
  141. else:
  142. break
  143. if k < 0:
  144. if not end:
  145. break
  146. k = rawdata.find('>', i + 1)
  147. if k < 0:
  148. k = rawdata.find('<', i + 1)
  149. if k < 0:
  150. k = i + 1
  151. else:
  152. k += 1
  153. self.handle_data(rawdata[i:k])
  154. i = self.updatepos(i, k)
  155. elif startswith("&#", i):
  156. match = charref.match(rawdata, i)
  157. if match:
  158. name = match.group()[2:-1]
  159. self.handle_charref(name)
  160. k = match.end()
  161. if not startswith(';', k-1):
  162. k = k - 1
  163. i = self.updatepos(i, k)
  164. continue
  165. else:
  166. if ";" in rawdata[i:]: # bail by consuming '&#'
  167. self.handle_data(rawdata[i:i+2])
  168. i = self.updatepos(i, i+2)
  169. break
  170. elif startswith('&', i):
  171. match = entityref.match(rawdata, i)
  172. if match:
  173. name = match.group(1)
  174. self.handle_entityref(name)
  175. k = match.end()
  176. if not startswith(';', k-1):
  177. k = k - 1
  178. i = self.updatepos(i, k)
  179. continue
  180. match = incomplete.match(rawdata, i)
  181. if match:
  182. # match.group() will contain at least 2 chars
  183. if end and match.group() == rawdata[i:]:
  184. self.error("EOF in middle of entity or char ref")
  185. # incomplete
  186. break
  187. elif (i + 1) < n:
  188. # not the end of the buffer, and can't be confused
  189. # with some other construct
  190. self.handle_data("&")
  191. i = self.updatepos(i, i + 1)
  192. else:
  193. break
  194. else:
  195. assert 0, "interesting.search() lied"
  196. # end while
  197. if end and i < n and not self.cdata_elem:
  198. self.handle_data(rawdata[i:n])
  199. i = self.updatepos(i, n)
  200. self.rawdata = rawdata[i:]
  201. # Internal -- parse html declarations, return length or -1 if not terminated
  202. # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
  203. # See also parse_declaration in _markupbase
  204. def parse_html_declaration(self, i):
  205. rawdata = self.rawdata
  206. if rawdata[i:i+2] != '<!':
  207. self.error('unexpected call to parse_html_declaration()')
  208. if rawdata[i:i+4] == '<!--':
  209. # this case is actually already handled in goahead()
  210. return self.parse_comment(i)
  211. elif rawdata[i:i+3] == '<![':
  212. return self.parse_marked_section(i)
  213. elif rawdata[i:i+9].lower() == '<!doctype':
  214. # find the closing >
  215. gtpos = rawdata.find('>', i+9)
  216. if gtpos == -1:
  217. return -1
  218. self.handle_decl(rawdata[i+2:gtpos])
  219. return gtpos+1
  220. else:
  221. return self.parse_bogus_comment(i)
  222. # Internal -- parse bogus comment, return length or -1 if not terminated
  223. # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
  224. def parse_bogus_comment(self, i, report=1):
  225. rawdata = self.rawdata
  226. if rawdata[i:i+2] not in ('<!', '</'):
  227. self.error('unexpected call to parse_comment()')
  228. pos = rawdata.find('>', i+2)
  229. if pos == -1:
  230. return -1
  231. if report:
  232. self.handle_comment(rawdata[i+2:pos])
  233. return pos + 1
  234. # Internal -- parse processing instr, return end or -1 if not terminated
  235. def parse_pi(self, i):
  236. rawdata = self.rawdata
  237. assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
  238. match = piclose.search(rawdata, i+2) # >
  239. if not match:
  240. return -1
  241. j = match.start()
  242. self.handle_pi(rawdata[i+2: j])
  243. j = match.end()
  244. return j
  245. # Internal -- handle starttag, return end or -1 if not terminated
  246. def parse_starttag(self, i):
  247. self.__starttag_text = None
  248. endpos = self.check_for_whole_start_tag(i)
  249. if endpos < 0:
  250. return endpos
  251. rawdata = self.rawdata
  252. self.__starttag_text = rawdata[i:endpos]
  253. # Now parse the data between i+1 and j into a tag and attrs
  254. attrs = []
  255. match = tagfind.match(rawdata, i+1)
  256. assert match, 'unexpected call to parse_starttag()'
  257. k = match.end()
  258. self.lasttag = tag = match.group(1).lower()
  259. while k < endpos:
  260. m = attrfind.match(rawdata, k)
  261. if not m:
  262. break
  263. attrname, rest, attrvalue = m.group(1, 2, 3)
  264. if not rest:
  265. attrvalue = None
  266. elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
  267. attrvalue[:1] == '"' == attrvalue[-1:]:
  268. attrvalue = attrvalue[1:-1]
  269. if attrvalue:
  270. attrvalue = self.unescape(attrvalue)
  271. attrs.append((attrname.lower(), attrvalue))
  272. k = m.end()
  273. end = rawdata[k:endpos].strip()
  274. if end not in (">", "/>"):
  275. lineno, offset = self.getpos()
  276. if "\n" in self.__starttag_text:
  277. lineno = lineno + self.__starttag_text.count("\n")
  278. offset = len(self.__starttag_text) \
  279. - self.__starttag_text.rfind("\n")
  280. else:
  281. offset = offset + len(self.__starttag_text)
  282. self.handle_data(rawdata[i:endpos])
  283. return endpos
  284. if end.endswith('/>'):
  285. # XHTML-style empty tag: <span attr="value" />
  286. self.handle_startendtag(tag, attrs)
  287. else:
  288. self.handle_starttag(tag, attrs)
  289. if tag in self.CDATA_CONTENT_ELEMENTS:
  290. self.set_cdata_mode(tag)
  291. return endpos
  292. # Internal -- check to see if we have a complete starttag; return end
  293. # or -1 if incomplete.
  294. def check_for_whole_start_tag(self, i):
  295. rawdata = self.rawdata
  296. m = locatestarttagend.match(rawdata, i)
  297. if m:
  298. j = m.end()
  299. next = rawdata[j:j+1]
  300. if next == ">":
  301. return j + 1
  302. if next == "/":
  303. if rawdata.startswith("/>", j):
  304. return j + 2
  305. if rawdata.startswith("/", j):
  306. # buffer boundary
  307. return -1
  308. # else bogus input
  309. self.updatepos(i, j + 1)
  310. self.error("malformed empty start tag")
  311. if next == "":
  312. # end of input
  313. return -1
  314. if next in ("abcdefghijklmnopqrstuvwxyz=/"
  315. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
  316. # end of input in or before attribute value, or we have the
  317. # '/' from a '/>' ending
  318. return -1
  319. if j > i:
  320. return j
  321. else:
  322. return i + 1
  323. raise AssertionError("we should not get here!")
  324. # Internal -- parse endtag, return end or -1 if incomplete
  325. def parse_endtag(self, i):
  326. rawdata = self.rawdata
  327. assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
  328. match = endendtag.search(rawdata, i+1) # >
  329. if not match:
  330. return -1
  331. gtpos = match.end()
  332. match = endtagfind.match(rawdata, i) # </ + tag + >
  333. if not match:
  334. if self.cdata_elem is not None:
  335. self.handle_data(rawdata[i:gtpos])
  336. return gtpos
  337. # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
  338. namematch = tagfind.match(rawdata, i+2)
  339. if not namematch:
  340. # w3.org/TR/html5/tokenization.html#end-tag-open-state
  341. if rawdata[i:i+3] == '</>':
  342. return i+3
  343. else:
  344. return self.parse_bogus_comment(i)
  345. tagname = namematch.group(1).lower()
  346. # consume and ignore other stuff between the name and the >
  347. # Note: this is not 100% correct, since we might have things like
  348. # </tag attr=">">, but looking for > after tha name should cover
  349. # most of the cases and is much simpler
  350. gtpos = rawdata.find('>', namematch.end())
  351. self.handle_endtag(tagname)
  352. return gtpos+1
  353. elem = match.group(1).lower() # script or style
  354. if self.cdata_elem is not None:
  355. if elem != self.cdata_elem:
  356. self.handle_data(rawdata[i:gtpos])
  357. return gtpos
  358. self.handle_endtag(elem)
  359. self.clear_cdata_mode()
  360. return gtpos
  361. # Overridable -- finish processing of start+end tag: <tag.../>
  362. def handle_startendtag(self, tag, attrs):
  363. self.handle_starttag(tag, attrs)
  364. self.handle_endtag(tag)
  365. # Overridable -- handle start tag
  366. def handle_starttag(self, tag, attrs):
  367. pass
  368. # Overridable -- handle end tag
  369. def handle_endtag(self, tag):
  370. pass
  371. # Overridable -- handle character reference
  372. def handle_charref(self, name):
  373. pass
  374. # Overridable -- handle entity reference
  375. def handle_entityref(self, name):
  376. pass
  377. # Overridable -- handle data
  378. def handle_data(self, data):
  379. pass
  380. # Overridable -- handle comment
  381. def handle_comment(self, data):
  382. pass
  383. # Overridable -- handle declaration
  384. def handle_decl(self, decl):
  385. pass
  386. # Overridable -- handle processing instruction
  387. def handle_pi(self, data):
  388. pass
  389. def unknown_decl(self, data):
  390. pass
  391. # Internal -- helper to remove special character quoting
  392. entitydefs = None
  393. def unescape(self, s):
  394. if '&' not in s:
  395. return s
  396. def replaceEntities(s):
  397. s = s.groups()[0]
  398. try:
  399. if s[0] == "#":
  400. s = s[1:]
  401. if s[0] in ['x','X']:
  402. c = int(s[1:], 16)
  403. else:
  404. c = int(s)
  405. return unichr(c)
  406. except ValueError:
  407. return '&#'+s+';'
  408. else:
  409. # Cannot use name2codepoint directly, because HTMLParser supports apos,
  410. # which is not part of HTML 4
  411. import htmlentitydefs
  412. if HTMLParser.entitydefs is None:
  413. entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
  414. for k, v in htmlentitydefs.name2codepoint.iteritems():
  415. entitydefs[k] = unichr(v)
  416. try:
  417. return self.entitydefs[s]
  418. except KeyError:
  419. return '&'+s+';'
  420. return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)