# lib-python/2.7/HTMLParser.py
# Origin: PyPy (https://bitbucket.org/squeaky/pypy)
# Possible license(s): Apache-2.0
"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).

import markupbase
import re

# Regular expressions used for parsing

# In normal (non-CDATA) mode, only '&' and '<' start markup.
interesting_normal = re.compile('[&<]')
# '&' followed by a letter or '#': a reference that may still be incomplete.
incomplete = re.compile('&[a-zA-Z#]')

# Named entity reference, e.g. '&amp;' (trailing char need not be ';').
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric character reference, decimal or hex, e.g. '&#38;' or '&#x26;'.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# Attribute name plus optional '=value' (quoted or bare), tolerant of
# stray '/' and repeated '=' signs.
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  42. class HTMLParseError(Exception):
  43. """Exception raised for all parse errors."""
  44. def __init__(self, msg, position=(None, None)):
  45. assert msg
  46. self.msg = msg
  47. self.lineno = position[0]
  48. self.offset = position[1]
  49. def __str__(self):
  50. result = self.msg
  51. if self.lineno is not None:
  52. result = result + ", at line %d" % self.lineno
  53. if self.offset is not None:
  54. result = result + ", column %d" % (self.offset + 1)
  55. return result
  56. class HTMLParser(markupbase.ParserBase):
  57. """Find tags and other markup and call handler functions.
  58. Usage:
  59. p = HTMLParser()
  60. p.feed(data)
  61. ...
  62. p.close()
  63. Start tags are handled by calling self.handle_starttag() or
  64. self.handle_startendtag(); end tags by self.handle_endtag(). The
  65. data between tags is passed from the parser to the derived class
  66. by calling self.handle_data() with the data as argument (the data
  67. may be split up in arbitrary chunks). Entity references are
  68. passed by calling self.handle_entityref() with the entity
  69. reference as the argument. Numeric character references are
  70. passed to self.handle_charref() with the string containing the
  71. reference as the argument.
  72. """
  73. CDATA_CONTENT_ELEMENTS = ("script", "style")
  74. def __init__(self):
  75. """Initialize and reset this instance."""
  76. self.reset()
  77. def reset(self):
  78. """Reset this instance. Loses all unprocessed data."""
  79. self.rawdata = ''
  80. self.lasttag = '???'
  81. self.interesting = interesting_normal
  82. self.cdata_elem = None
  83. markupbase.ParserBase.reset(self)
  84. def feed(self, data):
  85. r"""Feed data to the parser.
  86. Call this as often as you want, with as little or as much text
  87. as you want (may include '\n').
  88. """
  89. self.rawdata = self.rawdata + data
  90. self.goahead(0)
  91. def close(self):
  92. """Handle any buffered data."""
  93. self.goahead(1)
  94. def error(self, message):
  95. raise HTMLParseError(message, self.getpos())
  96. __starttag_text = None
  97. def get_starttag_text(self):
  98. """Return full source of start tag: '<...>'."""
  99. return self.__starttag_text
  100. def set_cdata_mode(self, elem):
  101. self.cdata_elem = elem.lower()
  102. self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
  103. def clear_cdata_mode(self):
  104. self.interesting = interesting_normal
  105. self.cdata_elem = None
  106. # Internal -- handle data as far as reasonable. May leave state
  107. # and data to be processed by a subsequent call. If 'end' is
  108. # true, force handling all data as if followed by EOF marker.
  109. def goahead(self, end):
  110. rawdata = self.rawdata
  111. i = 0
  112. n = len(rawdata)
  113. while i < n:
  114. match = self.interesting.search(rawdata, i) # < or &
  115. if match:
  116. j = match.start()
  117. else:
  118. if self.cdata_elem:
  119. break
  120. j = n
  121. if i < j: self.handle_data(rawdata[i:j])
  122. i = self.updatepos(i, j)
  123. if i == n: break
  124. startswith = rawdata.startswith
  125. if startswith('<', i):
  126. if starttagopen.match(rawdata, i): # < + letter
  127. k = self.parse_starttag(i)
  128. elif startswith("</", i):
  129. k = self.parse_endtag(i)
  130. elif startswith("<!--", i):
  131. k = self.parse_comment(i)
  132. elif startswith("<?", i):
  133. k = self.parse_pi(i)
  134. elif startswith("<!", i):
  135. k = self.parse_html_declaration(i)
  136. elif (i + 1) < n:
  137. self.handle_data("<")
  138. k = i + 1
  139. else:
  140. break
  141. if k < 0:
  142. if not end:
  143. break
  144. k = rawdata.find('>', i + 1)
  145. if k < 0:
  146. k = rawdata.find('<', i + 1)
  147. if k < 0:
  148. k = i + 1
  149. else:
  150. k += 1
  151. self.handle_data(rawdata[i:k])
  152. i = self.updatepos(i, k)
  153. elif startswith("&#", i):
  154. match = charref.match(rawdata, i)
  155. if match:
  156. name = match.group()[2:-1]
  157. self.handle_charref(name)
  158. k = match.end()
  159. if not startswith(';', k-1):
  160. k = k - 1
  161. i = self.updatepos(i, k)
  162. continue
  163. else:
  164. if ";" in rawdata[i:]: #bail by consuming &#
  165. self.handle_data(rawdata[0:2])
  166. i = self.updatepos(i, 2)
  167. break
  168. elif startswith('&', i):
  169. match = entityref.match(rawdata, i)
  170. if match:
  171. name = match.group(1)
  172. self.handle_entityref(name)
  173. k = match.end()
  174. if not startswith(';', k-1):
  175. k = k - 1
  176. i = self.updatepos(i, k)
  177. continue
  178. match = incomplete.match(rawdata, i)
  179. if match:
  180. # match.group() will contain at least 2 chars
  181. if end and match.group() == rawdata[i:]:
  182. self.error("EOF in middle of entity or char ref")
  183. # incomplete
  184. break
  185. elif (i + 1) < n:
  186. # not the end of the buffer, and can't be confused
  187. # with some other construct
  188. self.handle_data("&")
  189. i = self.updatepos(i, i + 1)
  190. else:
  191. break
  192. else:
  193. assert 0, "interesting.search() lied"
  194. # end while
  195. if end and i < n and not self.cdata_elem:
  196. self.handle_data(rawdata[i:n])
  197. i = self.updatepos(i, n)
  198. self.rawdata = rawdata[i:]
  199. # Internal -- parse html declarations, return length or -1 if not terminated
  200. # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
  201. # See also parse_declaration in _markupbase
  202. def parse_html_declaration(self, i):
  203. rawdata = self.rawdata
  204. if rawdata[i:i+2] != '<!':
  205. self.error('unexpected call to parse_html_declaration()')
  206. if rawdata[i:i+4] == '<!--':
  207. # this case is actually already handled in goahead()
  208. return self.parse_comment(i)
  209. elif rawdata[i:i+3] == '<![':
  210. return self.parse_marked_section(i)
  211. elif rawdata[i:i+9].lower() == '<!doctype':
  212. # find the closing >
  213. gtpos = rawdata.find('>', i+9)
  214. if gtpos == -1:
  215. return -1
  216. self.handle_decl(rawdata[i+2:gtpos])
  217. return gtpos+1
  218. else:
  219. return self.parse_bogus_comment(i)
  220. # Internal -- parse bogus comment, return length or -1 if not terminated
  221. # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
  222. def parse_bogus_comment(self, i, report=1):
  223. rawdata = self.rawdata
  224. if rawdata[i:i+2] not in ('<!', '</'):
  225. self.error('unexpected call to parse_comment()')
  226. pos = rawdata.find('>', i+2)
  227. if pos == -1:
  228. return -1
  229. if report:
  230. self.handle_comment(rawdata[i+2:pos])
  231. return pos + 1
  232. # Internal -- parse processing instr, return end or -1 if not terminated
  233. def parse_pi(self, i):
  234. rawdata = self.rawdata
  235. assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
  236. match = piclose.search(rawdata, i+2) # >
  237. if not match:
  238. return -1
  239. j = match.start()
  240. self.handle_pi(rawdata[i+2: j])
  241. j = match.end()
  242. return j
  243. # Internal -- handle starttag, return end or -1 if not terminated
  244. def parse_starttag(self, i):
  245. self.__starttag_text = None
  246. endpos = self.check_for_whole_start_tag(i)
  247. if endpos < 0:
  248. return endpos
  249. rawdata = self.rawdata
  250. self.__starttag_text = rawdata[i:endpos]
  251. # Now parse the data between i+1 and j into a tag and attrs
  252. attrs = []
  253. match = tagfind.match(rawdata, i+1)
  254. assert match, 'unexpected call to parse_starttag()'
  255. k = match.end()
  256. self.lasttag = tag = match.group(1).lower()
  257. while k < endpos:
  258. m = attrfind.match(rawdata, k)
  259. if not m:
  260. break
  261. attrname, rest, attrvalue = m.group(1, 2, 3)
  262. if not rest:
  263. attrvalue = None
  264. elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
  265. attrvalue[:1] == '"' == attrvalue[-1:]:
  266. attrvalue = attrvalue[1:-1]
  267. if attrvalue:
  268. attrvalue = self.unescape(attrvalue)
  269. attrs.append((attrname.lower(), attrvalue))
  270. k = m.end()
  271. end = rawdata[k:endpos].strip()
  272. if end not in (">", "/>"):
  273. lineno, offset = self.getpos()
  274. if "\n" in self.__starttag_text:
  275. lineno = lineno + self.__starttag_text.count("\n")
  276. offset = len(self.__starttag_text) \
  277. - self.__starttag_text.rfind("\n")
  278. else:
  279. offset = offset + len(self.__starttag_text)
  280. self.handle_data(rawdata[i:endpos])
  281. return endpos
  282. if end.endswith('/>'):
  283. # XHTML-style empty tag: <span attr="value" />
  284. self.handle_startendtag(tag, attrs)
  285. else:
  286. self.handle_starttag(tag, attrs)
  287. if tag in self.CDATA_CONTENT_ELEMENTS:
  288. self.set_cdata_mode(tag)
  289. return endpos
  290. # Internal -- check to see if we have a complete starttag; return end
  291. # or -1 if incomplete.
  292. def check_for_whole_start_tag(self, i):
  293. rawdata = self.rawdata
  294. m = locatestarttagend.match(rawdata, i)
  295. if m:
  296. j = m.end()
  297. next = rawdata[j:j+1]
  298. if next == ">":
  299. return j + 1
  300. if next == "/":
  301. if rawdata.startswith("/>", j):
  302. return j + 2
  303. if rawdata.startswith("/", j):
  304. # buffer boundary
  305. return -1
  306. # else bogus input
  307. self.updatepos(i, j + 1)
  308. self.error("malformed empty start tag")
  309. if next == "":
  310. # end of input
  311. return -1
  312. if next in ("abcdefghijklmnopqrstuvwxyz=/"
  313. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
  314. # end of input in or before attribute value, or we have the
  315. # '/' from a '/>' ending
  316. return -1
  317. if j > i:
  318. return j
  319. else:
  320. return i + 1
  321. raise AssertionError("we should not get here!")
  322. # Internal -- parse endtag, return end or -1 if incomplete
  323. def parse_endtag(self, i):
  324. rawdata = self.rawdata
  325. assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
  326. match = endendtag.search(rawdata, i+1) # >
  327. if not match:
  328. return -1
  329. gtpos = match.end()
  330. match = endtagfind.match(rawdata, i) # </ + tag + >
  331. if not match:
  332. if self.cdata_elem is not None:
  333. self.handle_data(rawdata[i:gtpos])
  334. return gtpos
  335. # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
  336. namematch = tagfind_tolerant.match(rawdata, i+2)
  337. if not namematch:
  338. # w3.org/TR/html5/tokenization.html#end-tag-open-state
  339. if rawdata[i:i+3] == '</>':
  340. return i+3
  341. else:
  342. return self.parse_bogus_comment(i)
  343. tagname = namematch.group().lower()
  344. # consume and ignore other stuff between the name and the >
  345. # Note: this is not 100% correct, since we might have things like
  346. # </tag attr=">">, but looking for > after tha name should cover
  347. # most of the cases and is much simpler
  348. gtpos = rawdata.find('>', namematch.end())
  349. self.handle_endtag(tagname)
  350. return gtpos+1
  351. elem = match.group(1).lower() # script or style
  352. if self.cdata_elem is not None:
  353. if elem != self.cdata_elem:
  354. self.handle_data(rawdata[i:gtpos])
  355. return gtpos
  356. self.handle_endtag(elem)
  357. self.clear_cdata_mode()
  358. return gtpos
  359. # Overridable -- finish processing of start+end tag: <tag.../>
  360. def handle_startendtag(self, tag, attrs):
  361. self.handle_starttag(tag, attrs)
  362. self.handle_endtag(tag)
  363. # Overridable -- handle start tag
  364. def handle_starttag(self, tag, attrs):
  365. pass
  366. # Overridable -- handle end tag
  367. def handle_endtag(self, tag):
  368. pass
  369. # Overridable -- handle character reference
  370. def handle_charref(self, name):
  371. pass
  372. # Overridable -- handle entity reference
  373. def handle_entityref(self, name):
  374. pass
  375. # Overridable -- handle data
  376. def handle_data(self, data):
  377. pass
  378. # Overridable -- handle comment
  379. def handle_comment(self, data):
  380. pass
  381. # Overridable -- handle declaration
  382. def handle_decl(self, decl):
  383. pass
  384. # Overridable -- handle processing instruction
  385. def handle_pi(self, data):
  386. pass
  387. def unknown_decl(self, data):
  388. pass
  389. # Internal -- helper to remove special character quoting
  390. entitydefs = None
  391. def unescape(self, s):
  392. if '&' not in s:
  393. return s
  394. def replaceEntities(s):
  395. s = s.groups()[0]
  396. try:
  397. if s[0] == "#":
  398. s = s[1:]
  399. if s[0] in ['x','X']:
  400. c = int(s[1:], 16)
  401. else:
  402. c = int(s)
  403. return unichr(c)
  404. except ValueError:
  405. return '&#'+s+';'
  406. else:
  407. # Cannot use name2codepoint directly, because HTMLParser supports apos,
  408. # which is not part of HTML 4
  409. import htmlentitydefs
  410. if HTMLParser.entitydefs is None:
  411. entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
  412. for k, v in htmlentitydefs.name2codepoint.iteritems():
  413. entitydefs[k] = unichr(v)
  414. try:
  415. return self.entitydefs[s]
  416. except KeyError:
  417. return '&'+s+';'
  418. return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)