PageRenderTime 76ms CodeModel.GetById 45ms RepoModel.GetById 0ms app.codeStats 0ms

/extern/python/closured/lib/python2.7/HTMLParser.py

https://github.com/atoun/jsrepl
Python | 393 lines | 355 code | 16 blank | 22 comment | 47 complexity | be37e4a371c6dc5700ea5be7dbcf72c8 MD5 | raw file
  1. """A parser for HTML and XHTML."""
  2. # This file is based on sgmllib.py, but the API is slightly different.
  3. # XXX There should be a way to distinguish between PCDATA (parsed
  4. # character data -- the normal case), RCDATA (replaceable character
  5. # data -- only char and entity references and end tags are special)
  6. # and CDATA (character data -- only end tags are special).
  7. import markupbase
  8. import re
  9. # Regular expressions used for parsing
  10. interesting_normal = re.compile('[&<]')
  11. interesting_cdata = re.compile(r'<(/|\Z)')
  12. incomplete = re.compile('&[a-zA-Z#]')
  13. entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  14. charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  15. starttagopen = re.compile('<[a-zA-Z]')
  16. piclose = re.compile('>')
  17. commentclose = re.compile(r'--\s*>')
  18. tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  19. attrfind = re.compile(
  20. r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  21. r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
  22. locatestarttagend = re.compile(r"""
  23. <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
  24. (?:\s+ # whitespace before attribute name
  25. (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
  26. (?:\s*=\s* # value indicator
  27. (?:'[^']*' # LITA-enclosed value
  28. |\"[^\"]*\" # LIT-enclosed value
  29. |[^'\">\s]+ # bare value
  30. )
  31. )?
  32. )
  33. )*
  34. \s* # trailing whitespace
  35. """, re.VERBOSE)
  36. endendtag = re.compile('>')
  37. endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  38. class HTMLParseError(Exception):
  39. """Exception raised for all parse errors."""
  40. def __init__(self, msg, position=(None, None)):
  41. assert msg
  42. self.msg = msg
  43. self.lineno = position[0]
  44. self.offset = position[1]
  45. def __str__(self):
  46. result = self.msg
  47. if self.lineno is not None:
  48. result = result + ", at line %d" % self.lineno
  49. if self.offset is not None:
  50. result = result + ", column %d" % (self.offset + 1)
  51. return result
  52. class HTMLParser(markupbase.ParserBase):
  53. """Find tags and other markup and call handler functions.
  54. Usage:
  55. p = HTMLParser()
  56. p.feed(data)
  57. ...
  58. p.close()
  59. Start tags are handled by calling self.handle_starttag() or
  60. self.handle_startendtag(); end tags by self.handle_endtag(). The
  61. data between tags is passed from the parser to the derived class
  62. by calling self.handle_data() with the data as argument (the data
  63. may be split up in arbitrary chunks). Entity references are
  64. passed by calling self.handle_entityref() with the entity
  65. reference as the argument. Numeric character references are
  66. passed to self.handle_charref() with the string containing the
  67. reference as the argument.
  68. """
  69. CDATA_CONTENT_ELEMENTS = ("script", "style")
  70. def __init__(self):
  71. """Initialize and reset this instance."""
  72. self.reset()
  73. def reset(self):
  74. """Reset this instance. Loses all unprocessed data."""
  75. self.rawdata = ''
  76. self.lasttag = '???'
  77. self.interesting = interesting_normal
  78. markupbase.ParserBase.reset(self)
  79. def feed(self, data):
  80. r"""Feed data to the parser.
  81. Call this as often as you want, with as little or as much text
  82. as you want (may include '\n').
  83. """
  84. self.rawdata = self.rawdata + data
  85. self.goahead(0)
  86. def close(self):
  87. """Handle any buffered data."""
  88. self.goahead(1)
  89. def error(self, message):
  90. raise HTMLParseError(message, self.getpos())
  91. __starttag_text = None
  92. def get_starttag_text(self):
  93. """Return full source of start tag: '<...>'."""
  94. return self.__starttag_text
  95. def set_cdata_mode(self):
  96. self.interesting = interesting_cdata
  97. def clear_cdata_mode(self):
  98. self.interesting = interesting_normal
  99. # Internal -- handle data as far as reasonable. May leave state
  100. # and data to be processed by a subsequent call. If 'end' is
  101. # true, force handling all data as if followed by EOF marker.
  102. def goahead(self, end):
  103. rawdata = self.rawdata
  104. i = 0
  105. n = len(rawdata)
  106. while i < n:
  107. match = self.interesting.search(rawdata, i) # < or &
  108. if match:
  109. j = match.start()
  110. else:
  111. j = n
  112. if i < j: self.handle_data(rawdata[i:j])
  113. i = self.updatepos(i, j)
  114. if i == n: break
  115. startswith = rawdata.startswith
  116. if startswith('<', i):
  117. if starttagopen.match(rawdata, i): # < + letter
  118. k = self.parse_starttag(i)
  119. elif startswith("</", i):
  120. k = self.parse_endtag(i)
  121. elif startswith("<!--", i):
  122. k = self.parse_comment(i)
  123. elif startswith("<?", i):
  124. k = self.parse_pi(i)
  125. elif startswith("<!", i):
  126. k = self.parse_declaration(i)
  127. elif (i + 1) < n:
  128. self.handle_data("<")
  129. k = i + 1
  130. else:
  131. break
  132. if k < 0:
  133. if end:
  134. self.error("EOF in middle of construct")
  135. break
  136. i = self.updatepos(i, k)
  137. elif startswith("&#", i):
  138. match = charref.match(rawdata, i)
  139. if match:
  140. name = match.group()[2:-1]
  141. self.handle_charref(name)
  142. k = match.end()
  143. if not startswith(';', k-1):
  144. k = k - 1
  145. i = self.updatepos(i, k)
  146. continue
  147. else:
  148. if ";" in rawdata[i:]: #bail by consuming &#
  149. self.handle_data(rawdata[0:2])
  150. i = self.updatepos(i, 2)
  151. break
  152. elif startswith('&', i):
  153. match = entityref.match(rawdata, i)
  154. if match:
  155. name = match.group(1)
  156. self.handle_entityref(name)
  157. k = match.end()
  158. if not startswith(';', k-1):
  159. k = k - 1
  160. i = self.updatepos(i, k)
  161. continue
  162. match = incomplete.match(rawdata, i)
  163. if match:
  164. # match.group() will contain at least 2 chars
  165. if end and match.group() == rawdata[i:]:
  166. self.error("EOF in middle of entity or char ref")
  167. # incomplete
  168. break
  169. elif (i + 1) < n:
  170. # not the end of the buffer, and can't be confused
  171. # with some other construct
  172. self.handle_data("&")
  173. i = self.updatepos(i, i + 1)
  174. else:
  175. break
  176. else:
  177. assert 0, "interesting.search() lied"
  178. # end while
  179. if end and i < n:
  180. self.handle_data(rawdata[i:n])
  181. i = self.updatepos(i, n)
  182. self.rawdata = rawdata[i:]
  183. # Internal -- parse processing instr, return end or -1 if not terminated
  184. def parse_pi(self, i):
  185. rawdata = self.rawdata
  186. assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
  187. match = piclose.search(rawdata, i+2) # >
  188. if not match:
  189. return -1
  190. j = match.start()
  191. self.handle_pi(rawdata[i+2: j])
  192. j = match.end()
  193. return j
  194. # Internal -- handle starttag, return end or -1 if not terminated
  195. def parse_starttag(self, i):
  196. self.__starttag_text = None
  197. endpos = self.check_for_whole_start_tag(i)
  198. if endpos < 0:
  199. return endpos
  200. rawdata = self.rawdata
  201. self.__starttag_text = rawdata[i:endpos]
  202. # Now parse the data between i+1 and j into a tag and attrs
  203. attrs = []
  204. match = tagfind.match(rawdata, i+1)
  205. assert match, 'unexpected call to parse_starttag()'
  206. k = match.end()
  207. self.lasttag = tag = rawdata[i+1:k].lower()
  208. while k < endpos:
  209. m = attrfind.match(rawdata, k)
  210. if not m:
  211. break
  212. attrname, rest, attrvalue = m.group(1, 2, 3)
  213. if not rest:
  214. attrvalue = None
  215. elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
  216. attrvalue[:1] == '"' == attrvalue[-1:]:
  217. attrvalue = attrvalue[1:-1]
  218. attrvalue = self.unescape(attrvalue)
  219. attrs.append((attrname.lower(), attrvalue))
  220. k = m.end()
  221. end = rawdata[k:endpos].strip()
  222. if end not in (">", "/>"):
  223. lineno, offset = self.getpos()
  224. if "\n" in self.__starttag_text:
  225. lineno = lineno + self.__starttag_text.count("\n")
  226. offset = len(self.__starttag_text) \
  227. - self.__starttag_text.rfind("\n")
  228. else:
  229. offset = offset + len(self.__starttag_text)
  230. self.error("junk characters in start tag: %r"
  231. % (rawdata[k:endpos][:20],))
  232. if end.endswith('/>'):
  233. # XHTML-style empty tag: <span attr="value" />
  234. self.handle_startendtag(tag, attrs)
  235. else:
  236. self.handle_starttag(tag, attrs)
  237. if tag in self.CDATA_CONTENT_ELEMENTS:
  238. self.set_cdata_mode()
  239. return endpos
  240. # Internal -- check to see if we have a complete starttag; return end
  241. # or -1 if incomplete.
  242. def check_for_whole_start_tag(self, i):
  243. rawdata = self.rawdata
  244. m = locatestarttagend.match(rawdata, i)
  245. if m:
  246. j = m.end()
  247. next = rawdata[j:j+1]
  248. if next == ">":
  249. return j + 1
  250. if next == "/":
  251. if rawdata.startswith("/>", j):
  252. return j + 2
  253. if rawdata.startswith("/", j):
  254. # buffer boundary
  255. return -1
  256. # else bogus input
  257. self.updatepos(i, j + 1)
  258. self.error("malformed empty start tag")
  259. if next == "":
  260. # end of input
  261. return -1
  262. if next in ("abcdefghijklmnopqrstuvwxyz=/"
  263. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
  264. # end of input in or before attribute value, or we have the
  265. # '/' from a '/>' ending
  266. return -1
  267. self.updatepos(i, j)
  268. self.error("malformed start tag")
  269. raise AssertionError("we should not get here!")
  270. # Internal -- parse endtag, return end or -1 if incomplete
  271. def parse_endtag(self, i):
  272. rawdata = self.rawdata
  273. assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
  274. match = endendtag.search(rawdata, i+1) # >
  275. if not match:
  276. return -1
  277. j = match.end()
  278. match = endtagfind.match(rawdata, i) # </ + tag + >
  279. if not match:
  280. self.error("bad end tag: %r" % (rawdata[i:j],))
  281. tag = match.group(1)
  282. self.handle_endtag(tag.lower())
  283. self.clear_cdata_mode()
  284. return j
  285. # Overridable -- finish processing of start+end tag: <tag.../>
  286. def handle_startendtag(self, tag, attrs):
  287. self.handle_starttag(tag, attrs)
  288. self.handle_endtag(tag)
  289. # Overridable -- handle start tag
  290. def handle_starttag(self, tag, attrs):
  291. pass
  292. # Overridable -- handle end tag
  293. def handle_endtag(self, tag):
  294. pass
  295. # Overridable -- handle character reference
  296. def handle_charref(self, name):
  297. pass
  298. # Overridable -- handle entity reference
  299. def handle_entityref(self, name):
  300. pass
  301. # Overridable -- handle data
  302. def handle_data(self, data):
  303. pass
  304. # Overridable -- handle comment
  305. def handle_comment(self, data):
  306. pass
  307. # Overridable -- handle declaration
  308. def handle_decl(self, decl):
  309. pass
  310. # Overridable -- handle processing instruction
  311. def handle_pi(self, data):
  312. pass
  313. def unknown_decl(self, data):
  314. self.error("unknown declaration: %r" % (data,))
  315. # Internal -- helper to remove special character quoting
  316. entitydefs = None
  317. def unescape(self, s):
  318. if '&' not in s:
  319. return s
  320. def replaceEntities(s):
  321. s = s.groups()[0]
  322. try:
  323. if s[0] == "#":
  324. s = s[1:]
  325. if s[0] in ['x','X']:
  326. c = int(s[1:], 16)
  327. else:
  328. c = int(s)
  329. return unichr(c)
  330. except ValueError:
  331. return '&#'+s+';'
  332. else:
  333. # Cannot use name2codepoint directly, because HTMLParser supports apos,
  334. # which is not part of HTML 4
  335. import htmlentitydefs
  336. if HTMLParser.entitydefs is None:
  337. entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
  338. for k, v in htmlentitydefs.name2codepoint.iteritems():
  339. entitydefs[k] = unichr(v)
  340. try:
  341. return self.entitydefs[s]
  342. except KeyError:
  343. return '&'+s+';'
  344. return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)