Page render time: 51ms (CodeModel.GetById 22ms, RepoModel.GetById 0ms, app.codeStats 1ms)

Source file: /yarss2/include/beautifulsoup/py3k/bs4/builder/_lxml.py

Repository: https://bitbucket.org/bendikro/deluge-yarss-plugin
Python | 296 lines | 188 code | 47 blank | 61 comment | complexity 40 | MD5 32bd057c93c180932bf84b0197703858 | raw file
Possible license(s): GPL-3.0, MIT, MPL-2.0, Apache-2.0, BSD-3-Clause
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# Public API of this module: the two lxml-backed tree builders.
__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

try:
    # Callable moved to collections.abc; importing from `collections`
    # directly is deprecated on modern Pythons.
    from collections.abc import Callable # Python 3.6
except ImportError as e:
    from collections import Callable

from io import BytesIO
from io import StringIO

from lxml import etree

from bs4.element import (
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
    XMLProcessingInstruction,
)
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    ParserRejectedMarkup,
    TreeBuilder,
    XML)
from bs4.dammit import EncodingDetector

# Feature name by which callers can select the lxml-based builders.
LXML = 'lxml'
  31. def _invert(d):
  32. "Invert a dictionary."
  33. return dict((v,k) for k, v in list(d.items()))
  34. class LXMLTreeBuilderForXML(TreeBuilder):
  35. DEFAULT_PARSER_CLASS = etree.XMLParser
  36. is_xml = True
  37. processing_instruction_class = XMLProcessingInstruction
  38. NAME = "lxml-xml"
  39. ALTERNATE_NAMES = ["xml"]
  40. # Well, it's permissive by XML parser standards.
  41. features = [NAME, LXML, XML, FAST, PERMISSIVE]
  42. CHUNK_SIZE = 512
  43. # This namespace mapping is specified in the XML Namespace
  44. # standard.
  45. DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
  46. DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
  47. def initialize_soup(self, soup):
  48. """Let the BeautifulSoup object know about the standard namespace
  49. mapping.
  50. """
  51. super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
  52. self._register_namespaces(self.DEFAULT_NSMAPS)
  53. def _register_namespaces(self, mapping):
  54. """Let the BeautifulSoup object know about namespaces encountered
  55. while parsing the document.
  56. This might be useful later on when creating CSS selectors.
  57. """
  58. for key, value in list(mapping.items()):
  59. if key and key not in self.soup._namespaces:
  60. # Let the BeautifulSoup object know about a new namespace.
  61. # If there are multiple namespaces defined with the same
  62. # prefix, the first one in the document takes precedence.
  63. self.soup._namespaces[key] = value
  64. def default_parser(self, encoding):
  65. # This can either return a parser object or a class, which
  66. # will be instantiated with default arguments.
  67. if self._default_parser is not None:
  68. return self._default_parser
  69. return etree.XMLParser(
  70. target=self, strip_cdata=False, recover=True, encoding=encoding)
  71. def parser_for(self, encoding):
  72. # Use the default parser.
  73. parser = self.default_parser(encoding)
  74. if isinstance(parser, Callable):
  75. # Instantiate the parser with default arguments
  76. parser = parser(target=self, strip_cdata=False, encoding=encoding)
  77. return parser
  78. def __init__(self, parser=None, empty_element_tags=None, **kwargs):
  79. # TODO: Issue a warning if parser is present but not a
  80. # callable, since that means there's no way to create new
  81. # parsers for different encodings.
  82. self._default_parser = parser
  83. if empty_element_tags is not None:
  84. self.empty_element_tags = set(empty_element_tags)
  85. self.soup = None
  86. self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
  87. super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
  88. def _getNsTag(self, tag):
  89. # Split the namespace URL out of a fully-qualified lxml tag
  90. # name. Copied from lxml's src/lxml/sax.py.
  91. if tag[0] == '{':
  92. return tuple(tag[1:].split('}', 1))
  93. else:
  94. return (None, tag)
  95. def prepare_markup(self, markup, user_specified_encoding=None,
  96. exclude_encodings=None,
  97. document_declared_encoding=None):
  98. """
  99. :yield: A series of 4-tuples.
  100. (markup, encoding, declared encoding,
  101. has undergone character replacement)
  102. Each 4-tuple represents a strategy for parsing the document.
  103. """
  104. # Instead of using UnicodeDammit to convert the bytestring to
  105. # Unicode using different encodings, use EncodingDetector to
  106. # iterate over the encodings, and tell lxml to try to parse
  107. # the document as each one in turn.
  108. is_html = not self.is_xml
  109. if is_html:
  110. self.processing_instruction_class = ProcessingInstruction
  111. else:
  112. self.processing_instruction_class = XMLProcessingInstruction
  113. if isinstance(markup, str):
  114. # We were given Unicode. Maybe lxml can parse Unicode on
  115. # this system?
  116. yield markup, None, document_declared_encoding, False
  117. if isinstance(markup, str):
  118. # No, apparently not. Convert the Unicode to UTF-8 and
  119. # tell lxml to parse it as UTF-8.
  120. yield (markup.encode("utf8"), "utf8",
  121. document_declared_encoding, False)
  122. try_encodings = [user_specified_encoding, document_declared_encoding]
  123. detector = EncodingDetector(
  124. markup, try_encodings, is_html, exclude_encodings)
  125. for encoding in detector.encodings:
  126. yield (detector.markup, encoding, document_declared_encoding, False)
  127. def feed(self, markup):
  128. if isinstance(markup, bytes):
  129. markup = BytesIO(markup)
  130. elif isinstance(markup, str):
  131. markup = StringIO(markup)
  132. # Call feed() at least once, even if the markup is empty,
  133. # or the parser won't be initialized.
  134. data = markup.read(self.CHUNK_SIZE)
  135. try:
  136. self.parser = self.parser_for(self.soup.original_encoding)
  137. self.parser.feed(data)
  138. while len(data) != 0:
  139. # Now call feed() on the rest of the data, chunk by chunk.
  140. data = markup.read(self.CHUNK_SIZE)
  141. if len(data) != 0:
  142. self.parser.feed(data)
  143. self.parser.close()
  144. except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
  145. raise ParserRejectedMarkup(str(e))
  146. def close(self):
  147. self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
  148. def start(self, name, attrs, nsmap={}):
  149. # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
  150. attrs = dict(attrs)
  151. nsprefix = None
  152. # Invert each namespace map as it comes in.
  153. if len(nsmap) == 0 and len(self.nsmaps) > 1:
  154. # There are no new namespaces for this tag, but
  155. # non-default namespaces are in play, so we need a
  156. # separate tag stack to know when they end.
  157. self.nsmaps.append(None)
  158. elif len(nsmap) > 0:
  159. # A new namespace mapping has come into play.
  160. # First, Let the BeautifulSoup object know about it.
  161. self._register_namespaces(nsmap)
  162. # Then, add it to our running list of inverted namespace
  163. # mappings.
  164. self.nsmaps.append(_invert(nsmap))
  165. # Also treat the namespace mapping as a set of attributes on the
  166. # tag, so we can recreate it later.
  167. attrs = attrs.copy()
  168. for prefix, namespace in list(nsmap.items()):
  169. attribute = NamespacedAttribute(
  170. "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
  171. attrs[attribute] = namespace
  172. # Namespaces are in play. Find any attributes that came in
  173. # from lxml with namespaces attached to their names, and
  174. # turn then into NamespacedAttribute objects.
  175. new_attrs = {}
  176. for attr, value in list(attrs.items()):
  177. namespace, attr = self._getNsTag(attr)
  178. if namespace is None:
  179. new_attrs[attr] = value
  180. else:
  181. nsprefix = self._prefix_for_namespace(namespace)
  182. attr = NamespacedAttribute(nsprefix, attr, namespace)
  183. new_attrs[attr] = value
  184. attrs = new_attrs
  185. namespace, name = self._getNsTag(name)
  186. nsprefix = self._prefix_for_namespace(namespace)
  187. self.soup.handle_starttag(name, namespace, nsprefix, attrs)
  188. def _prefix_for_namespace(self, namespace):
  189. """Find the currently active prefix for the given namespace."""
  190. if namespace is None:
  191. return None
  192. for inverted_nsmap in reversed(self.nsmaps):
  193. if inverted_nsmap is not None and namespace in inverted_nsmap:
  194. return inverted_nsmap[namespace]
  195. return None
  196. def end(self, name):
  197. self.soup.endData()
  198. completed_tag = self.soup.tagStack[-1]
  199. namespace, name = self._getNsTag(name)
  200. nsprefix = None
  201. if namespace is not None:
  202. for inverted_nsmap in reversed(self.nsmaps):
  203. if inverted_nsmap is not None and namespace in inverted_nsmap:
  204. nsprefix = inverted_nsmap[namespace]
  205. break
  206. self.soup.handle_endtag(name, nsprefix)
  207. if len(self.nsmaps) > 1:
  208. # This tag, or one of its parents, introduced a namespace
  209. # mapping, so pop it off the stack.
  210. self.nsmaps.pop()
  211. def pi(self, target, data):
  212. self.soup.endData()
  213. self.soup.handle_data(target + ' ' + data)
  214. self.soup.endData(self.processing_instruction_class)
  215. def data(self, content):
  216. self.soup.handle_data(content)
  217. def doctype(self, name, pubid, system):
  218. self.soup.endData()
  219. doctype = Doctype.for_name_and_ids(name, pubid, system)
  220. self.soup.object_was_parsed(doctype)
  221. def comment(self, content):
  222. "Handle comments as Comment objects."
  223. self.soup.endData()
  224. self.soup.handle_data(content)
  225. self.soup.endData(Comment)
  226. def test_fragment_to_document(self, fragment):
  227. """See `TreeBuilder`."""
  228. return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
  229. class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
  230. NAME = LXML
  231. ALTERNATE_NAMES = ["lxml-html"]
  232. features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
  233. is_xml = False
  234. processing_instruction_class = ProcessingInstruction
  235. def default_parser(self, encoding):
  236. return etree.HTMLParser
  237. def feed(self, markup):
  238. encoding = self.soup.original_encoding
  239. try:
  240. self.parser = self.parser_for(encoding)
  241. self.parser.feed(markup)
  242. self.parser.close()
  243. except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
  244. raise ParserRejectedMarkup(str(e))
  245. def test_fragment_to_document(self, fragment):
  246. """See `TreeBuilder`."""
  247. return '<html><body>%s</body></html>' % fragment