/lxmlselector.py

https://github.com/steeve/scrapy-lxmlselector · Python · 154 lines · 129 code · 12 blank · 13 comment · 15 complexity · 666aa5821cdb076e855e8e0f0923cb5f MD5 · raw file

  1. """
  2. Lxml selector
  3. Provides both XPath and CSS Selection.
  4. Can use html5lib and BeautifulSoup.
  5. Provided by Steeve Morin <steeve.morin@gmail.com>
  6. See documentation in docs/topics/selectors.rst
  7. """
  8. from lxml import etree
  9. from lxml.cssselect import CSSSelector as lxml_CSSSelector
  10. from scrapy.http import TextResponse
  11. from scrapy.utils.python import flatten, unicode_to_str
  12. from scrapy.utils.misc import extract_regex
  13. from scrapy.utils.trackref import object_ref
  14. __all__ = ['HtmlXPathSelector', 'XmlXPathSelector', 'LxmlSelector',
  15. 'LxmlSelectorList']
  16. class LxmlSelector(object_ref):
  17. __slots__ = [ 'doc', 'xmlNode', 'expr', '__weakref__', "namespaces" ]
  18. def __init__(self, response=None, text=None, node=None, parent=None, expr=None,
  19. use_html5lib=False, use_BeautifulSoup=False, namespaces=None):
  20. if parent:
  21. self.doc = parent.doc
  22. self.xmlNode = node
  23. elif response:
  24. self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,
  25. use_BeautifulSoup)
  26. self.doc = self.xmlNode.getroottree()
  27. elif text:
  28. response = TextResponse(url='about:blank', body=unicode_to_str(text),
  29. encoding='utf-8')
  30. self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,
  31. use_BeautifulSoup)
  32. self.doc = self.xmlNode.getroottree()
  33. self.expr = expr
  34. self.namespaces = namespaces or {}
  35. def _lxml_parse_document(self, body, use_html5lib=False,
  36. use_BeautifulSoup=False):
  37. if use_html5lib:
  38. from lxml.html import html5parser
  39. return html5parser.fromstring(body)
  40. elif use_BeautifulSoup:
  41. from lxml.html import soupparser
  42. return soupparser.fromstring(body)
  43. else:
  44. for parser in [ etree.XML, etree.HTML ]:
  45. try:
  46. return (parser(body))
  47. except:
  48. pass
  49. def xpath(self, xpath):
  50. """Perform the given XPath query on the current XPathSelector and
  51. return a XPathSelectorList of the result"""
  52. return self._make_select_results(self.xmlNode.xpath(xpath, namespaces=self.namespaces), expr=xpath, namespaces=self.namespaces)
  53. def css(self, css):
  54. return self._make_select_results(lxml_CSSSelector(css)(self.xmlNode), expr=css, namespaces=self.namespaces)
  55. def _make_select_results(self, result, expr, namespaces):
  56. if hasattr(result, '__iter__'):
  57. return LxmlSelectorList([ type(self)(node=node, parent=self,
  58. expr=expr, namespaces=self.namespaces) for node in result ])
  59. else:
  60. return LxmlSelectorList([ type(self)(node=result,
  61. parent=self, expr=expr, namespaces=self.namespaces) ])
  62. def re(self, regex):
  63. """Return a list of unicode strings by applying the regex over all
  64. current XPath selections, and flattening the results"""
  65. return extract_regex(regex, self.extract(), 'utf-8')
  66. def extract(self):
  67. """Return a unicode string of the content referenced by the XPathSelector"""
  68. if isinstance(self.xmlNode, etree._Element):
  69. return unicode(etree.tostring(self.xmlNode, encoding='utf-8'), 'utf-8', errors='ignore')
  70. try:
  71. text = unicode(self.xmlNode, 'utf-8', errors='ignore')
  72. except TypeError: # catched when self.xmlNode is a float - see tests
  73. text = unicode(self.xmlNode)
  74. return text
  75. def extract_unquoted(self):
  76. """Get unescaped contents from the text node (no entities, no CDATA)"""
  77. if self.select('self::text()'):
  78. return unicode(self.xmlNode.getContent(), 'utf-8', errors='ignore')
  79. else:
  80. return u''
  81. def register_namespace(self, prefix, uri):
  82. """Register namespace so that it can be used in XPath queries"""
  83. self.doc.xpathContext.xpathRegisterNs(prefix, uri)
  84. def attrib(self, name):
  85. return self._make_select_results([self.xmlNode.attrib[name]], expr="%s/@%s()" % (self.expr, name), namespaces=self.namespaces)[0]
  86. def text(self):
  87. return self._make_select_results(self.xmlNode.xpath("text()"), expr="%s/text()" % self.expr, namespaces=self.namespaces)
  88. def __nonzero__(self):
  89. return bool(self.extract())
  90. def __str__(self):
  91. return "<%s (%s) xpath=%s>" % (type(self).__name__, getattr(self.xmlNode,
  92. 'name', type(self.xmlNode).__name__), self.expr)
  93. __repr__ = __str__
  94. class LxmlSelectorList(list):
  95. """List of XPathSelector objects"""
  96. def __getslice__(self, i, j):
  97. return LxmlSelectorList(list.__getslice__(self, i, j))
  98. def xpath(self, expr):
  99. """Perform the given XPath query on each XPathSelector of the list and
  100. return a new (flattened) XPathSelectorList of the results"""
  101. return LxmlSelectorList(flatten([ x.xpath(expr) for x in self ]))
  102. def css(self, expr):
  103. """Perform the given XPath query on each XPathSelector of the list and
  104. return a new (flattened) XPathSelectorList of the results"""
  105. return LxmlSelectorList(flatten([ x.css(expr) for x in self ]))
  106. def re(self, regex):
  107. """Perform the re() method on each XPathSelector of the list, and
  108. return the result as a flattened list of unicode strings"""
  109. return flatten([ x.re(regex) for x in self ])
  110. def attrib(self, name):
  111. """Return a list of unicode strings with the attributes referenced by each
  112. XPathSelector of the list"""
  113. return LxmlSelectorList([ x.attrib(name) if isinstance(x, LxmlSelector) else x for x in self])
  114. def text(self):
  115. """Return a list of unicode strings with the content text referenced by each
  116. XPathSelector of the list"""
  117. return LxmlSelectorList(flatten([ x.text() if isinstance(x, LxmlSelector) else x for x in self]))
  118. def extract(self):
  119. """Return a list of unicode strings with the content referenced by each
  120. XPathSelector of the list"""
  121. return [ x.extract() if isinstance(x, LxmlSelector) else x for x in self]
  122. def extract_unquoted(self):
  123. return [ x.extract_unquoted() if isinstance(x, LxmlSelector) else x for x in self]