PageRenderTime 58ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/yarss2/include/beautifulsoup/py3k/bs4/builder/_htmlparser.py

https://bitbucket.org/bendikro/deluge-yarss-plugin
Python | 350 lines | 345 code | 1 blank | 4 comment | 3 complexity | fe7aab696ecb1869e655428667ca0b44 MD5 | raw file
Possible License(s): GPL-3.0, MIT, MPL-2.0, Apache-2.0, BSD-3-Clause
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# Names exported by ``from <this module> import *``: only the tree
# builder class is listed; BeautifulSoupHTMLParser is not.
__all__ = [
    'HTMLParserTreeBuilder',
]
  8. from html.parser import HTMLParser
  9. try:
  10. from html.parser import HTMLParseError
  11. except ImportError as e:
  12. # HTMLParseError is removed in Python 3.5. Since it can never be
  13. # thrown in 3.5, we can just define our own class as a placeholder.
  14. class HTMLParseError(Exception):
  15. pass
  16. import sys
  17. import warnings
  18. # Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
  19. # argument, which we'd like to set to False. Unfortunately,
  20. # http://bugs.python.org/issue13273 makes strict=True a better bet
  21. # before Python 3.2.3.
  22. #
  23. # At the end of this file, we monkeypatch HTMLParser so that
  24. # strict=True works well on Python 3.2.2.
  25. major, minor, release = sys.version_info[:3]
  26. CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
  27. CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
  28. CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
  29. from bs4.element import (
  30. CData,
  31. Comment,
  32. Declaration,
  33. Doctype,
  34. ProcessingInstruction,
  35. )
  36. from bs4.dammit import EntitySubstitution, UnicodeDammit
  37. from bs4.builder import (
  38. HTML,
  39. HTMLTreeBuilder,
  40. STRICT,
  41. )
# Feature string for this tree builder; used below as
# HTMLParserTreeBuilder.NAME and as the first entry of its `features`.
HTMLPARSER = 'html.parser'
  43. class BeautifulSoupHTMLParser(HTMLParser):
  44. def __init__(self, *args, **kwargs):
  45. HTMLParser.__init__(self, *args, **kwargs)
  46. # Keep a list of empty-element tags that were encountered
  47. # without an explicit closing tag. If we encounter a closing tag
  48. # of this type, we'll associate it with one of those entries.
  49. #
  50. # This isn't a stack because we don't care about the
  51. # order. It's a list of closing tags we've already handled and
  52. # will ignore, assuming they ever show up.
  53. self.already_closed_empty_element = []
  54. def error(self, msg):
  55. """In Python 3, HTMLParser subclasses must implement error(), although this
  56. requirement doesn't appear to be documented.
  57. In Python 2, HTMLParser implements error() as raising an exception.
  58. In any event, this method is called only on very strange markup and our best strategy
  59. is to pretend it didn't happen and keep going.
  60. """
  61. warnings.warn(msg)
  62. def handle_startendtag(self, name, attrs):
  63. # This is only called when the markup looks like
  64. # <tag/>.
  65. # is_startend() tells handle_starttag not to close the tag
  66. # just because its name matches a known empty-element tag. We
  67. # know that this is an empty-element tag and we want to call
  68. # handle_endtag ourselves.
  69. tag = self.handle_starttag(name, attrs, handle_empty_element=False)
  70. self.handle_endtag(name)
  71. def handle_starttag(self, name, attrs, handle_empty_element=True):
  72. # XXX namespace
  73. attr_dict = {}
  74. for key, value in attrs:
  75. # Change None attribute values to the empty string
  76. # for consistency with the other tree builders.
  77. if value is None:
  78. value = ''
  79. attr_dict[key] = value
  80. attrvalue = '""'
  81. #print "START", name
  82. tag = self.soup.handle_starttag(name, None, None, attr_dict)
  83. if tag and tag.is_empty_element and handle_empty_element:
  84. # Unlike other parsers, html.parser doesn't send separate end tag
  85. # events for empty-element tags. (It's handled in
  86. # handle_startendtag, but only if the original markup looked like
  87. # <tag/>.)
  88. #
  89. # So we need to call handle_endtag() ourselves. Since we
  90. # know the start event is identical to the end event, we
  91. # don't want handle_endtag() to cross off any previous end
  92. # events for tags of this name.
  93. self.handle_endtag(name, check_already_closed=False)
  94. # But we might encounter an explicit closing tag for this tag
  95. # later on. If so, we want to ignore it.
  96. self.already_closed_empty_element.append(name)
  97. def handle_endtag(self, name, check_already_closed=True):
  98. #print "END", name
  99. if check_already_closed and name in self.already_closed_empty_element:
  100. # This is a redundant end tag for an empty-element tag.
  101. # We've already called handle_endtag() for it, so just
  102. # check it off the list.
  103. # print "ALREADY CLOSED", name
  104. self.already_closed_empty_element.remove(name)
  105. else:
  106. self.soup.handle_endtag(name)
  107. def handle_data(self, data):
  108. self.soup.handle_data(data)
  109. def handle_charref(self, name):
  110. # XXX workaround for a bug in HTMLParser. Remove this once
  111. # it's fixed in all supported versions.
  112. # http://bugs.python.org/issue13633
  113. if name.startswith('x'):
  114. real_name = int(name.lstrip('x'), 16)
  115. elif name.startswith('X'):
  116. real_name = int(name.lstrip('X'), 16)
  117. else:
  118. real_name = int(name)
  119. data = None
  120. if real_name < 256:
  121. # HTML numeric entities are supposed to reference Unicode
  122. # code points, but sometimes they reference code points in
  123. # some other encoding (ahem, Windows-1252). E.g. &#147;
  124. # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
  125. # code tries to detect this situation and compensate.
  126. for encoding in (self.soup.original_encoding, 'windows-1252'):
  127. if not encoding:
  128. continue
  129. try:
  130. data = bytearray([real_name]).decode(encoding)
  131. except UnicodeDecodeError as e:
  132. pass
  133. if not data:
  134. try:
  135. data = chr(real_name)
  136. except (ValueError, OverflowError) as e:
  137. pass
  138. data = data or "\N{REPLACEMENT CHARACTER}"
  139. self.handle_data(data)
  140. def handle_entityref(self, name):
  141. character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
  142. if character is not None:
  143. data = character
  144. else:
  145. # If this were XML, it would be ambiguous whether "&foo"
  146. # was an character entity reference with a missing
  147. # semicolon or the literal string "&foo". Since this is
  148. # HTML, we have a complete list of all character entity references,
  149. # and this one wasn't found, so assume it's the literal string "&foo".
  150. data = "&%s" % name
  151. self.handle_data(data)
  152. def handle_comment(self, data):
  153. self.soup.endData()
  154. self.soup.handle_data(data)
  155. self.soup.endData(Comment)
  156. def handle_decl(self, data):
  157. self.soup.endData()
  158. if data.startswith("DOCTYPE "):
  159. data = data[len("DOCTYPE "):]
  160. elif data == 'DOCTYPE':
  161. # i.e. "<!DOCTYPE>"
  162. data = ''
  163. self.soup.handle_data(data)
  164. self.soup.endData(Doctype)
  165. def unknown_decl(self, data):
  166. if data.upper().startswith('CDATA['):
  167. cls = CData
  168. data = data[len('CDATA['):]
  169. else:
  170. cls = Declaration
  171. self.soup.endData()
  172. self.soup.handle_data(data)
  173. self.soup.endData(cls)
  174. def handle_pi(self, data):
  175. self.soup.endData()
  176. self.soup.handle_data(data)
  177. self.soup.endData(ProcessingInstruction)
  178. class HTMLParserTreeBuilder(HTMLTreeBuilder):
  179. is_xml = False
  180. picklable = True
  181. NAME = HTMLPARSER
  182. features = [NAME, HTML, STRICT]
  183. def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
  184. super(HTMLParserTreeBuilder, self).__init__(**kwargs)
  185. parser_args = parser_args or []
  186. parser_kwargs = parser_kwargs or {}
  187. if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
  188. parser_kwargs['strict'] = False
  189. if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
  190. parser_kwargs['convert_charrefs'] = False
  191. self.parser_args = (parser_args, parser_kwargs)
  192. def prepare_markup(self, markup, user_specified_encoding=None,
  193. document_declared_encoding=None, exclude_encodings=None):
  194. """
  195. :return: A 4-tuple (markup, original encoding, encoding
  196. declared within markup, whether any characters had to be
  197. replaced with REPLACEMENT CHARACTER).
  198. """
  199. if isinstance(markup, str):
  200. yield (markup, None, None, False)
  201. return
  202. try_encodings = [user_specified_encoding, document_declared_encoding]
  203. dammit = UnicodeDammit(markup, try_encodings, is_html=True,
  204. exclude_encodings=exclude_encodings)
  205. yield (dammit.markup, dammit.original_encoding,
  206. dammit.declared_html_encoding,
  207. dammit.contains_replacement_characters)
  208. def feed(self, markup):
  209. args, kwargs = self.parser_args
  210. parser = BeautifulSoupHTMLParser(*args, **kwargs)
  211. parser.soup = self.soup
  212. try:
  213. parser.feed(markup)
  214. parser.close()
  215. except HTMLParseError as e:
  216. warnings.warn(RuntimeWarning(
  217. "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
  218. raise e
  219. parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re

    # Lenient attribute pattern; parse_starttag() below uses it whenever
    # the parser is not in strict mode.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this attaches the pattern to the tree builder class,
    # but parse_starttag() reads the module-level name directly --
    # confirm whether anything relies on this attribute.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Pattern for a whole start tag up to (not including) its closing
    # bracket; installed on the parser class in place of its own.
    locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at index i of self.rawdata.

        Dispatches to handle_startendtag() or handle_starttag(), or to
        handle_data() when the tag ends in something other than ">" or
        "/>".

        :param i: Index in self.rawdata at which the start tag begins.
        :return: The value returned by check_for_whole_start_tag(i); a
            negative value is passed straight back to the caller and
            nothing is dispatched.
        """
        # NOTE(review): this function is defined outside a class body,
        # so `self.__starttag_text` is not name-mangled and is stored
        # under that literal attribute name -- confirm this is the
        # attribute the rest of HTMLParser expects.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                    attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip one pair of matching surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # lineno/offset are computed here but not used afterwards
            # in this function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                    - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Pass the malformed tag through as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Record elem (lowercased) as self.cdata_elem and set
        self.interesting to a case-insensitive pattern matching that
        element's closing tag.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    # Install the replacements on the parser class.
    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # Rebinds the module-level flag after the patch has been applied.
    CONSTRUCTOR_TAKES_STRICT = True