PageRenderTime 67ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/src/lxml/html/__init__.py

https://gitlab.com/tlevine/lxml
Python | 1451 lines | 1214 code | 66 blank | 171 comment | 125 complexity | a17751397ef7291d9d3c0669b98738c4 MD5 | raw file
  1. # Copyright (c) 2004 Ian Bicking. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are
  5. # met:
  6. #
  7. # 1. Redistributions of source code must retain the above copyright
  8. # notice, this list of conditions and the following disclaimer.
  9. #
  10. # 2. Redistributions in binary form must reproduce the above copyright
  11. # notice, this list of conditions and the following disclaimer in
  12. # the documentation and/or other materials provided with the
  13. # distribution.
  14. #
  15. # 3. Neither the name of Ian Bicking nor the names of its contributors may
  16. # be used to endorse or promote products derived from this software
  17. # without specific prior written permission.
  18. #
  19. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20. # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21. # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22. # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
  23. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  24. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  25. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  26. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  27. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  28. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  29. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. """The ``lxml.html`` tool set for HTML handling.
  31. """
  32. from __future__ import absolute_import
  33. __all__ = [
  34. 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
  35. 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
  36. 'find_rel_links', 'find_class', 'make_links_absolute',
  37. 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse']
  38. import copy
  39. import sys
  40. import re
  41. from functools import partial
  42. try:
  43. # while unnecessary, importing from 'collections.abc' is the right way to do it
  44. from collections.abc import MutableMapping, MutableSet
  45. except ImportError:
  46. from collections import MutableMapping, MutableSet
  47. from .. import etree
  48. from . import defs
  49. from ._setmixin import SetMixin
  50. try:
  51. from urlparse import urljoin
  52. except ImportError:
  53. # Python 3
  54. from urllib.parse import urljoin
# Python 2/3 compatibility: make sure the names ``unicode`` and
# ``basestring`` are defined, whichever interpreter runs this module.
try:
    unicode
except NameError:
    # Python 3: the name does not exist, so alias it to ``str``
    unicode = str
try:
    basestring
except NameError:
    # Python 3: no common string base class; a tuple of both string
    # types is accepted by isinstance() in its place
    basestring = (str, bytes)
  65. def __fix_docstring(s):
  66. if not s:
  67. return s
  68. if sys.version_info[0] >= 3:
  69. sub = re.compile(r"^(\s*)u'", re.M).sub
  70. else:
  71. sub = re.compile(r"^(\s*)b'", re.M).sub
  72. return sub(r"\1'", s)
# Namespace URI of XHTML.  The XPath expressions below bind it to the
# prefix ``x`` so that they match both plain and XHTML-namespaced tags.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements that carry a ``rel`` attribute (context node included).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements (context node included).
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements (context node included).
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-normalised ``class`` attribute contains
# $class_name as a complete, space-delimited word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterator over ``url(...)`` occurrences in CSS text (case-insensitive).
# Group 1 is the argument: a double-quoted string, a single-quoted
# string, or anything up to the closing parenthesis.  Quotes are part
# of the group.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterator over ``@import "..."`` occurrences in CSS text; only the
# double-quoted form is matched and group 1 excludes the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements anywhere in the document whose ``for`` equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One run of non-space characters (one entry of a space separated list).
_archive_re = re.compile(r'[^ ]+')
# Searches a <meta http-equiv="refresh"> content value: everything after
# the first ';' (and an optional ``url=``) is captured as group 'url'.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
  91. def _unquote_match(s, pos):
  92. if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
  93. return s[1:-1], pos+1
  94. else:
  95. return s,pos
  96. def _transform_result(typ, result):
  97. """Convert the result back into the input type.
  98. """
  99. if issubclass(typ, bytes):
  100. return tostring(result, encoding='utf-8')
  101. elif issubclass(typ, unicode):
  102. return tostring(result, encoding='unicode')
  103. else:
  104. return result
  105. def _nons(tag):
  106. if isinstance(tag, basestring):
  107. if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
  108. return tag.split('}')[-1]
  109. return tag
  110. class Classes(MutableSet):
  111. """Provides access to an element's class attribute as a set-like collection.
  112. Usage::
  113. >>> el = fromstring('<p class="hidden large">Text</p>')
  114. >>> classes = el.classes # or: classes = Classes(el.attrib)
  115. >>> classes |= ['block', 'paragraph']
  116. >>> el.get('class')
  117. 'hidden large block paragraph'
  118. >>> classes.toggle('hidden')
  119. False
  120. >>> el.get('class')
  121. 'large block paragraph'
  122. >>> classes -= ('some', 'classes', 'block')
  123. >>> el.get('class')
  124. 'large paragraph'
  125. """
  126. def __init__(self, attributes):
  127. self._attributes = attributes
  128. self._get_class_value = partial(attributes.get, 'class', '')
  129. def add(self, value):
  130. """
  131. Add a class.
  132. This has no effect if the class is already present.
  133. """
  134. if not value or re.search(r'\s', value):
  135. raise ValueError("Invalid class name: %r" % value)
  136. classes = self._get_class_value().split()
  137. if value in classes:
  138. return
  139. classes.append(value)
  140. self._attributes['class'] = ' '.join(classes)
  141. def discard(self, value):
  142. """
  143. Remove a class if it is currently present.
  144. If the class is not present, do nothing.
  145. """
  146. if not value or re.search(r'\s', value):
  147. raise ValueError("Invalid class name: %r" % value)
  148. classes = [name for name in self._get_class_value().split()
  149. if name != value]
  150. if classes:
  151. self._attributes['class'] = ' '.join(classes)
  152. elif 'class' in self._attributes:
  153. del self._attributes['class']
  154. def remove(self, value):
  155. """
  156. Remove a class; it must currently be present.
  157. If the class is not present, raise a KeyError.
  158. """
  159. if not value or re.search(r'\s', value):
  160. raise ValueError("Invalid class name: %r" % value)
  161. super(Classes, self).remove(value)
  162. def __contains__(self, name):
  163. classes = self._get_class_value()
  164. return name in classes and name in classes.split()
  165. def __iter__(self):
  166. return iter(self._get_class_value().split())
  167. def __len__(self):
  168. return len(self._get_class_value().split())
  169. # non-standard methods
  170. def update(self, values):
  171. """
  172. Add all names from 'values'.
  173. """
  174. classes = self._get_class_value().split()
  175. extended = False
  176. for value in values:
  177. if value not in classes:
  178. classes.append(value)
  179. extended = True
  180. if extended:
  181. self._attributes['class'] = ' '.join(classes)
  182. def toggle(self, value):
  183. """
  184. Add a class name if it isn't there yet, or remove it if it exists.
  185. Returns true if the class was added (and is now enabled) and
  186. false if it was removed (and is now disabled).
  187. """
  188. if not value or re.search(r'\s', value):
  189. raise ValueError("Invalid class name: %r" % value)
  190. classes = self._get_class_value().split()
  191. try:
  192. classes.remove(value)
  193. enabled = False
  194. except ValueError:
  195. classes.append(value)
  196. enabled = True
  197. if classes:
  198. self._attributes['class'] = ' '.join(classes)
  199. else:
  200. del self._attributes['class']
  201. return enabled
  202. class HtmlMixin(object):
  203. @property
  204. def classes(self):
  205. """
  206. A set-like wrapper around the 'class' attribute.
  207. """
  208. return Classes(self.attrib)
  209. @classes.setter
  210. def classes(self, classes):
  211. assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc.
  212. value = classes._get_class_value()
  213. if value:
  214. self.set('class', value)
  215. elif self.get('class') is not None:
  216. del self.attrib['class']
  217. @property
  218. def base_url(self):
  219. """
  220. Returns the base URL, given when the page was parsed.
  221. Use with ``urlparse.urljoin(el.base_url, href)`` to get
  222. absolute URLs.
  223. """
  224. return self.getroottree().docinfo.URL
  225. @property
  226. def forms(self):
  227. """
  228. Return a list of all the forms
  229. """
  230. return _forms_xpath(self)
  231. @property
  232. def body(self):
  233. """
  234. Return the <body> element. Can be called from a child element
  235. to get the document's head.
  236. """
  237. return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
  238. @property
  239. def head(self):
  240. """
  241. Returns the <head> element. Can be called from a child
  242. element to get the document's head.
  243. """
  244. return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
  245. @property
  246. def label(self):
  247. """
  248. Get or set any <label> element associated with this element.
  249. """
  250. id = self.get('id')
  251. if not id:
  252. return None
  253. result = _label_xpath(self, id=id)
  254. if not result:
  255. return None
  256. else:
  257. return result[0]
  258. @label.setter
  259. def label(self, label):
  260. id = self.get('id')
  261. if not id:
  262. raise TypeError(
  263. "You cannot set a label for an element (%r) that has no id"
  264. % self)
  265. if _nons(label.tag) != 'label':
  266. raise TypeError(
  267. "You can only assign label to a label element (not %r)"
  268. % label)
  269. label.set('for', id)
  270. @label.deleter
  271. def label(self):
  272. label = self.label
  273. if label is not None:
  274. del label.attrib['for']
  275. def drop_tree(self):
  276. """
  277. Removes this element from the tree, including its children and
  278. text. The tail text is joined to the previous element or
  279. parent.
  280. """
  281. parent = self.getparent()
  282. assert parent is not None
  283. if self.tail:
  284. previous = self.getprevious()
  285. if previous is None:
  286. parent.text = (parent.text or '') + self.tail
  287. else:
  288. previous.tail = (previous.tail or '') + self.tail
  289. parent.remove(self)
  290. def drop_tag(self):
  291. """
  292. Remove the tag, but not its children or text. The children and text
  293. are merged into the parent.
  294. Example::
  295. >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
  296. >>> h.find('.//b').drop_tag()
  297. >>> print(tostring(h, encoding='unicode'))
  298. <div>Hello World!</div>
  299. """
  300. parent = self.getparent()
  301. assert parent is not None
  302. previous = self.getprevious()
  303. if self.text and isinstance(self.tag, basestring):
  304. # not a Comment, etc.
  305. if previous is None:
  306. parent.text = (parent.text or '') + self.text
  307. else:
  308. previous.tail = (previous.tail or '') + self.text
  309. if self.tail:
  310. if len(self):
  311. last = self[-1]
  312. last.tail = (last.tail or '') + self.tail
  313. elif previous is None:
  314. parent.text = (parent.text or '') + self.tail
  315. else:
  316. previous.tail = (previous.tail or '') + self.tail
  317. index = parent.index(self)
  318. parent[index:index+1] = self[:]
  319. def find_rel_links(self, rel):
  320. """
  321. Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
  322. """
  323. rel = rel.lower()
  324. return [el for el in _rel_links_xpath(self)
  325. if el.get('rel').lower() == rel]
  326. def find_class(self, class_name):
  327. """
  328. Find any elements with the given class name.
  329. """
  330. return _class_xpath(self, class_name=class_name)
  331. def get_element_by_id(self, id, *default):
  332. """
  333. Get the first element in a document with the given id. If none is
  334. found, return the default argument if provided or raise KeyError
  335. otherwise.
  336. Note that there can be more than one element with the same id,
  337. and this isn't uncommon in HTML documents found in the wild.
  338. Browsers return only the first match, and this function does
  339. the same.
  340. """
  341. try:
  342. # FIXME: should this check for multiple matches?
  343. # browsers just return the first one
  344. return _id_xpath(self, id=id)[0]
  345. except IndexError:
  346. if default:
  347. return default[0]
  348. else:
  349. raise KeyError(id)
  350. def text_content(self):
  351. """
  352. Return the text content of the tag (and the text in any children).
  353. """
  354. return _collect_string_content(self)
  355. def cssselect(self, expr, translator='html'):
  356. """
  357. Run the CSS expression on this element and its children,
  358. returning a list of the results.
  359. Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self)
  360. -- note that pre-compiling the expression can provide a substantial
  361. speedup.
  362. """
  363. # Do the import here to make the dependency optional.
  364. from lxml.cssselect import CSSSelector
  365. return CSSSelector(expr, translator=translator)(self)
  366. ########################################
  367. ## Link functions
  368. ########################################
  369. def make_links_absolute(self, base_url=None, resolve_base_href=True,
  370. handle_failures=None):
  371. """
  372. Make all links in the document absolute, given the
  373. ``base_url`` for the document (the full URL where the document
  374. came from), or if no ``base_url`` is given, then the ``.base_url``
  375. of the document.
  376. If ``resolve_base_href`` is true, then any ``<base href>``
  377. tags in the document are used *and* removed from the document.
  378. If it is false then any such tag is ignored.
  379. If ``handle_failures`` is None (default), a failure to process
  380. a URL will abort the processing. If set to 'ignore', errors
  381. are ignored. If set to 'discard', failing URLs will be removed.
  382. """
  383. if base_url is None:
  384. base_url = self.base_url
  385. if base_url is None:
  386. raise TypeError(
  387. "No base_url given, and the document has no base_url")
  388. if resolve_base_href:
  389. self.resolve_base_href()
  390. if handle_failures == 'ignore':
  391. def link_repl(href):
  392. try:
  393. return urljoin(base_url, href)
  394. except ValueError:
  395. return href
  396. elif handle_failures == 'discard':
  397. def link_repl(href):
  398. try:
  399. return urljoin(base_url, href)
  400. except ValueError:
  401. return None
  402. elif handle_failures is None:
  403. def link_repl(href):
  404. return urljoin(base_url, href)
  405. else:
  406. raise ValueError(
  407. "unexpected value for handle_failures: %r" % handle_failures)
  408. self.rewrite_links(link_repl)
  409. def resolve_base_href(self, handle_failures=None):
  410. """
  411. Find any ``<base href>`` tag in the document, and apply its
  412. values to all links found in the document. Also remove the
  413. tag once it has been applied.
  414. If ``handle_failures`` is None (default), a failure to process
  415. a URL will abort the processing. If set to 'ignore', errors
  416. are ignored. If set to 'discard', failing URLs will be removed.
  417. """
  418. base_href = None
  419. basetags = self.xpath('//base[@href]|//x:base[@href]',
  420. namespaces={'x': XHTML_NAMESPACE})
  421. for b in basetags:
  422. base_href = b.get('href')
  423. b.drop_tree()
  424. if not base_href:
  425. return
  426. self.make_links_absolute(base_href, resolve_base_href=False,
  427. handle_failures=handle_failures)
    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        # the yielded link is the value joined with the
                        # codebase, not the raw attribute text
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # 'archive' holds several space separated entries;
                    # each is reported with its offset in the attribute
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                # every other element: report each link attribute as-is
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        # strip surrounding quotes and shift the position
                        # past the opening quote accordingly
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                # Collect (start_pos, url) pairs from url(...) and
                # @import "..." occurrences in the stylesheet text.
                urls = [
                    # (start_pos, url); [::-1] swaps the (url, pos)
                    # tuple returned by _unquote_match
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            # an inline style="..." attribute is checked on every element
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)
  515. def rewrite_links(self, link_repl_func, resolve_base_href=True,
  516. base_href=None):
  517. """
  518. Rewrite all the links in the document. For each link
  519. ``link_repl_func(link)`` will be called, and the return value
  520. will replace the old link.
  521. Note that links may not be absolute (unless you first called
  522. ``make_links_absolute()``), and may be internal (e.g.,
  523. ``'#anchor'``). They can also be values like
  524. ``'mailto:email'`` or ``'javascript:expr'``.
  525. If you give ``base_href`` then all links passed to
  526. ``link_repl_func()`` will take that into account.
  527. If the ``link_repl_func`` returns None, the attribute or
  528. tag text will be removed completely.
  529. """
  530. if base_href is not None:
  531. # FIXME: this can be done in one pass with a wrapper
  532. # around link_repl_func
  533. self.make_links_absolute(
  534. base_href, resolve_base_href=resolve_base_href)
  535. elif resolve_base_href:
  536. self.resolve_base_href()
  537. for el, attrib, link, pos in self.iterlinks():
  538. new_link = link_repl_func(link.strip())
  539. if new_link == link:
  540. continue
  541. if new_link is None:
  542. # Remove the attribute or element content
  543. if attrib is None:
  544. el.text = ''
  545. else:
  546. del el.attrib[attrib]
  547. continue
  548. if attrib is None:
  549. new = el.text[:pos] + new_link + el.text[pos+len(link):]
  550. el.text = new
  551. else:
  552. cur = el.get(attrib)
  553. if not pos and len(cur) == len(link):
  554. new = new_link # most common case
  555. else:
  556. new = cur[:pos] + new_link + cur[pos+len(link):]
  557. el.set(attrib, new)
  558. class _MethodFunc(object):
  559. """
  560. An object that represents a method on an element as a function;
  561. the function takes either an element or an HTML string. It
  562. returns whatever the function normally returns, or if the function
  563. works in-place (and so returns None) it returns a serialized form
  564. of the resulting document.
  565. """
  566. def __init__(self, name, copy=False, source_class=HtmlMixin):
  567. self.name = name
  568. self.copy = copy
  569. self.__doc__ = getattr(source_class, self.name).__doc__
  570. def __call__(self, doc, *args, **kw):
  571. result_type = type(doc)
  572. if isinstance(doc, basestring):
  573. if 'copy' in kw:
  574. raise TypeError(
  575. "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
  576. doc = fromstring(doc, **kw)
  577. else:
  578. if 'copy' in kw:
  579. make_a_copy = kw.pop('copy')
  580. else:
  581. make_a_copy = self.copy
  582. if make_a_copy:
  583. doc = copy.deepcopy(doc)
  584. meth = getattr(doc, self.name)
  585. result = meth(*args, **kw)
  586. # FIXME: this None test is a bit sloppy
  587. if result is None:
  588. # Then return what we got in
  589. return _transform_result(result_type, doc)
  590. else:
  591. return result
# Function versions of the HtmlMixin methods, built with _MethodFunc.
# ``copy`` is the wrapper's default for deep-copying an element input
# before the method runs; it is True for the methods that rewrite links.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also provides the HtmlMixin methods."""
    pass
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also provides the HtmlMixin methods."""
    # Override etree.ElementBase.cssselect, despite the MRO
    cssselect = HtmlMixin.cssselect
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class that also provides the HtmlMixin methods."""
    pass
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also provides the HtmlMixin methods."""
    pass
  607. class HtmlElementClassLookup(etree.CustomElementClassLookup):
  608. """A lookup scheme for HTML Element classes.
  609. To create a lookup instance with different Element classes, pass a tag
  610. name mapping of Element classes in the ``classes`` keyword argument and/or
  611. a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
  612. The special key '*' denotes a Mixin class that should be mixed into all
  613. Element classes.
  614. """
  615. _default_element_classes = {}
  616. def __init__(self, classes=None, mixins=None):
  617. etree.CustomElementClassLookup.__init__(self)
  618. if classes is None:
  619. classes = self._default_element_classes.copy()
  620. if mixins:
  621. mixers = {}
  622. for name, value in mixins:
  623. if name == '*':
  624. for n in classes.keys():
  625. mixers.setdefault(n, []).append(value)
  626. else:
  627. mixers.setdefault(name, []).append(value)
  628. for name, mix_bases in mixers.items():
  629. cur = classes.get(name, HtmlElement)
  630. bases = tuple(mix_bases + [cur])
  631. classes[name] = type(cur.__name__, bases, {})
  632. self._element_classes = classes
  633. def lookup(self, node_type, document, namespace, name):
  634. if node_type == 'element':
  635. return self._element_classes.get(name.lower(), HtmlElement)
  636. elif node_type == 'comment':
  637. return HtmlComment
  638. elif node_type == 'PI':
  639. return HtmlProcessingInstruction
  640. elif node_type == 'entity':
  641. return HtmlEntity
  642. # Otherwise normal lookup
  643. return None
  644. ################################################################################
  645. # parsing
  646. ################################################################################
# Matchers that succeed if the input starts (after optional whitespace,
# case-insensitively) with ``<html`` or ``<!doctype``.  There is one for
# text input and one for bytes input, since a compiled pattern only
# matches its own string type.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
  651. def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
  652. if parser is None:
  653. parser = html_parser
  654. value = etree.fromstring(html, parser, **kw)
  655. if value is None:
  656. raise etree.ParserError(
  657. "Document is empty")
  658. if ensure_head_body and value.find('head') is None:
  659. value.insert(0, Element('head'))
  660. if ensure_head_body and value.find('body') is None:
  661. value.append(Element('body'))
  662. return value
  663. def fragments_fromstring(html, no_leading_text=False, base_url=None,
  664. parser=None, **kw):
  665. """
  666. Parses several HTML elements, returning a list of elements.
  667. The first item in the list may be a string (though leading
  668. whitespace is removed). If no_leading_text is true, then it will
  669. be an error if there is leading text, and it will always be a list
  670. of only elements.
  671. base_url will set the document's base_url attribute (and the tree's docinfo.URL)
  672. """
  673. if parser is None:
  674. parser = html_parser
  675. # FIXME: check what happens when you give html with a body, head, etc.
  676. if isinstance(html, bytes):
  677. if not _looks_like_full_html_bytes(html):
  678. # can't use %-formatting in early Py3 versions
  679. html = ('<html><body>'.encode('ascii') + html +
  680. '</body></html>'.encode('ascii'))
  681. else:
  682. if not _looks_like_full_html_unicode(html):
  683. html = '<html><body>%s</body></html>' % html
  684. doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  685. assert _nons(doc.tag) == 'html'
  686. bodies = [e for e in doc if _nons(e.tag) == 'body']
  687. assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
  688. body = bodies[0]
  689. elements = []
  690. if no_leading_text and body.text and body.text.strip():
  691. raise etree.ParserError(
  692. "There is leading text: %r" % body.text)
  693. if body.text and body.text.strip():
  694. elements.append(body.text)
  695. elements.extend(body)
  696. # FIXME: removing the reference to the parent artificial document
  697. # would be nice
  698. return elements
  699. def fragment_fromstring(html, create_parent=False, base_url=None,
  700. parser=None, **kw):
  701. """
  702. Parses a single HTML element; it is an error if there is more than
  703. one element, or if anything but whitespace precedes or follows the
  704. element.
  705. If ``create_parent`` is true (or is a tag name) then a parent node
  706. will be created to encapsulate the HTML in a single element. In this
  707. case, leading or trailing text is also allowed, as are multiple elements
  708. as result of the parsing.
  709. Passing a ``base_url`` will set the document's ``base_url`` attribute
  710. (and the tree's docinfo.URL).
  711. """
  712. if parser is None:
  713. parser = html_parser
  714. accept_leading_text = bool(create_parent)
  715. elements = fragments_fromstring(
  716. html, parser=parser, no_leading_text=not accept_leading_text,
  717. base_url=base_url, **kw)
  718. if create_parent:
  719. if not isinstance(create_parent, basestring):
  720. create_parent = 'div'
  721. new_root = Element(create_parent)
  722. if elements:
  723. if isinstance(elements[0], basestring):
  724. new_root.text = elements[0]
  725. del elements[0]
  726. new_root.extend(elements)
  727. return new_root
  728. if not elements:
  729. raise etree.ParserError('No elements found')
  730. if len(elements) > 1:
  731. raise etree.ParserError(
  732. "Multiple elements found (%s)"
  733. % ', '.join([_element_name(e) for e in elements]))
  734. el = elements[0]
  735. if el.tail and el.tail.strip():
  736. raise etree.ParserError(
  737. "Element followed by text: %r" % el.tail)
  738. el.tail = None
  739. return el
  740. def fromstring(html, base_url=None, parser=None, **kw):
  741. """
  742. Parse the html, returning a single element/document.
  743. This tries to minimally parse the chunk of text, without knowing if it
  744. is a fragment or a document.
  745. base_url will set the document's base_url attribute (and the tree's docinfo.URL)
  746. """
  747. if parser is None:
  748. parser = html_parser
  749. if isinstance(html, bytes):
  750. is_full_html = _looks_like_full_html_bytes(html)
  751. else:
  752. is_full_html = _looks_like_full_html_unicode(html)
  753. doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  754. if is_full_html:
  755. return doc
  756. # otherwise, lets parse it out...
  757. bodies = doc.findall('body')
  758. if not bodies:
  759. bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
  760. if bodies:
  761. body = bodies[0]
  762. if len(bodies) > 1:
  763. # Somehow there are multiple bodies, which is bad, but just
  764. # smash them into one body
  765. for other_body in bodies[1:]:
  766. if other_body.text:
  767. if len(body):
  768. body[-1].tail = (body[-1].tail or '') + other_body.text
  769. else:
  770. body.text = (body.text or '') + other_body.text
  771. body.extend(other_body)
  772. # We'll ignore tail
  773. # I guess we are ignoring attributes too
  774. other_body.drop_tree()
  775. else:
  776. body = None
  777. heads = doc.findall('head')
  778. if not heads:
  779. heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
  780. if heads:
  781. # Well, we have some sort of structure, so lets keep it all
  782. head = heads[0]
  783. if len(heads) > 1:
  784. for other_head in heads[1:]:
  785. head.extend(other_head)
  786. # We don't care about text or tail in a head
  787. other_head.drop_tree()
  788. return doc
  789. if body is None:
  790. return doc
  791. if (len(body) == 1 and (not body.text or not body.text.strip())
  792. and (not body[-1].tail or not body[-1].tail.strip())):
  793. # The body has just one element, so it was probably a single
  794. # element passed in
  795. return body[0]
  796. # Now we have a body which represents a bunch of tags which have the
  797. # content that was passed in. We will create a fake container, which
  798. # is the body tag, except <body> implies too much structure.
  799. if _contains_block_level_tag(body):
  800. body.tag = 'div'
  801. else:
  802. body.tag = 'span'
  803. return body
  804. def parse(filename_or_url, parser=None, base_url=None, **kw):
  805. """
  806. Parse a filename, URL, or file-like object into an HTML document
  807. tree. Note: this returns a tree, not an element. Use
  808. ``parse(...).getroot()`` to get the document root.
  809. You can override the base URL with the ``base_url`` keyword. This
  810. is most useful when parsing from a file-like object.
  811. """
  812. if parser is None:
  813. parser = html_parser
  814. return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
  815. def _contains_block_level_tag(el):
  816. # FIXME: I could do this with XPath, but would that just be
  817. # unnecessarily slow?
  818. for el in el.iter(etree.Element):
  819. if _nons(el.tag) in defs.block_tags:
  820. return True
  821. return False
  822. def _element_name(el):
  823. if isinstance(el, etree.CommentBase):
  824. return 'comment'
  825. elif isinstance(el, basestring):
  826. return 'string'
  827. else:
  828. return _nons(el.tag)
  829. ################################################################################
  830. # form handling
  831. ################################################################################
  832. class FormElement(HtmlElement):
  833. """
  834. Represents a <form> element.
  835. """
  836. @property
  837. def inputs(self):
  838. """
  839. Returns an accessor for all the input elements in the form.
  840. See `InputGetter` for more information about the object.
  841. """
  842. return InputGetter(self)
  843. @property
  844. def fields(self):
  845. """
  846. Dictionary-like object that represents all the fields in this
  847. form. You can set values in this dictionary to effect the
  848. form.
  849. """
  850. return FieldsDict(self.inputs)
  851. @fields.setter
  852. def fields(self, value):
  853. fields = self.fields
  854. prev_keys = fields.keys()
  855. for key, value in value.items():
  856. if key in prev_keys:
  857. prev_keys.remove(key)
  858. fields[key] = value
  859. for key in prev_keys:
  860. if key is None:
  861. # Case of an unnamed input; these aren't really
  862. # expressed in form_values() anyway.
  863. continue
  864. fields[key] = None
  865. def _name(self):
  866. if self.get('name'):
  867. return self.get('name')
  868. elif self.get('id'):
  869. return '#' + self.get('id')
  870. iter_tags = self.body.iter
  871. forms = list(iter_tags('form'))
  872. if not forms:
  873. forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
  874. return str(forms.index(self))
  875. def form_values(self):
  876. """
  877. Return a list of tuples of the field values for the form.
  878. This is suitable to be passed to ``urllib.urlencode()``.
  879. """
  880. results = []
  881. for el in self.inputs:
  882. name = el.name
  883. if not name:
  884. continue
  885. tag = _nons(el.tag)
  886. if tag == 'textarea':
  887. results.append((name, el.value))
  888. elif tag == 'select':
  889. value = el.value
  890. if el.multiple:
  891. for v in value:
  892. results.append((name, v))
  893. elif value is not None:
  894. results.append((name, el.value))
  895. else:
  896. assert tag == 'input', (
  897. "Unexpected tag: %r" % el)
  898. if el.checkable and not el.checked:
  899. continue
  900. if el.type in ('submit', 'image', 'reset'):
  901. continue
  902. value = el.value
  903. if value is not None:
  904. results.append((name, el.value))
  905. return results
  906. @property
  907. def action(self):
  908. """
  909. Get/set the form's ``action`` attribute.
  910. """
  911. base_url = self.base_url
  912. action = self.get('action')
  913. if base_url and action is not None:
  914. return urljoin(base_url, action)
  915. else:
  916. return action
  917. @action.setter
  918. def action(self, value):
  919. self.set('action', value)
  920. @action.deleter
  921. def action(self):
  922. attrib = self.attrib
  923. if 'action' in attrib:
  924. del attrib['action']
  925. @property
  926. def method(self):
  927. """
  928. Get/set the form's method. Always returns a capitalized
  929. string, and defaults to ``'GET'``
  930. """
  931. return self.get('method', 'GET').upper()
  932. @method.setter
  933. def method(self, value):
  934. self.set('method', value.upper())
  935. HtmlElementClassLookup._default_element_classes['form'] = FormElement
  936. def submit_form(form, extra_values=None, open_http=None):
  937. """
  938. Helper function to submit a form. Returns a file-like object, as from
  939. ``urllib.urlopen()``. This object also has a ``.geturl()`` function,
  940. which shows the URL if there were any redirects.
  941. You can use this like::
  942. form = doc.forms[0]
  943. form.inputs['foo'].value = 'bar' # etc
  944. response = form.submit()
  945. doc = parse(response)
  946. doc.make_links_absolute(response.geturl())
  947. To change the HTTP requester, pass a function as ``open_http`` keyword
  948. argument that opens the URL for you. The function must have the following
  949. signature::
  950. open_http(method, URL, values)
  951. The action is one of 'GET' or 'POST', the URL is the target URL as a
  952. string, and the values are a sequence of ``(name, value)`` tuples with the
  953. form data.
  954. """
  955. values = form.form_values()
  956. if extra_values:
  957. if hasattr(extra_values, 'items'):
  958. extra_values = extra_values.items()
  959. values.extend(extra_values)
  960. if open_http is None:
  961. open_http = open_http_urllib
  962. if form.action:
  963. url = form.action
  964. else:
  965. url = form.base_url
  966. return open_http(form.method, url, values)
  967. def open_http_urllib(method, url, values):
  968. if not url:
  969. raise ValueError("cannot submit, no URL provided")
  970. ## FIXME: should test that it's not a relative URL or something
  971. try:
  972. from urllib import urlencode, urlopen
  973. except ImportError: # Python 3
  974. from urllib.request import urlopen
  975. from urllib.parse import urlencode
  976. if method == 'GET':
  977. if '?' in url:
  978. url += '&'
  979. else:
  980. url += '?'
  981. url += urlencode(values)
  982. data = None
  983. else:
  984. data = urlencode(values)
  985. return urlopen(url, data)
  986. class FieldsDict(MutableMapping):
  987. def __init__(self, inputs):
  988. self.inputs = inputs
  989. def __getitem__(self, item):
  990. return self.inputs[item].value
  991. def __setitem__(self, item, value):
  992. self.inputs[item].value = value
  993. def __delitem__(self, item):
  994. raise KeyError(
  995. "You cannot remove keys from ElementDict")
  996. def keys(self):
  997. return self.inputs.keys()
  998. def __contains__(self, item):
  999. return item in self.inputs
  1000. def __iter__(self):
  1001. return iter(self.inputs.keys())
  1002. def __len__(self):
  1003. return len(self.inputs)
  1004. def __repr__(self):
  1005. return '<%s for form %s>' % (
  1006. self.__class__.__name__,
  1007. self.inputs.form._name())
  1008. class InputGetter(object):
  1009. """
  1010. An accessor that represents all the input fields in a form.
  1011. You can get fields by name from this, with
  1012. ``form.inputs['field_name']``. If there are a set of checkboxes
  1013. with the same name, they are returned as a list (a `CheckboxGroup`
  1014. which also allows value setting). Radio inputs are handled
  1015. similarly.
  1016. You can also iterate over this to get all input elements. This
  1017. won't return the same thing as if you get all the names, as
  1018. checkboxes and radio elements are returned individually.
  1019. """
  1020. _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
  1021. _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")
  1022. def __init__(self, form):
  1023. self.form = form
  1024. def __repr__(self):
  1025. return '<%s for form %s>' % (
  1026. self.__class__.__name__,
  1027. self.form._name())
  1028. ## FIXME: there should be more methods, and it's unclear if this is
  1029. ## a dictionary-like object or list-like object
  1030. def __getitem__(self, name):
  1031. results = self._name_xpath(self.form, name=name)
  1032. if results:
  1033. type = results[0].get('type')
  1034. if type == 'radio' and len(results) > 1:
  1035. group = RadioGroup(results)
  1036. group.name = name
  1037. return group
  1038. elif type == 'checkbox' and len(results) > 1:
  1039. group = CheckboxGroup(results)
  1040. group.name = name
  1041. return group
  1042. else:
  1043. # I don't like throwing away elements like this
  1044. return results[0]
  1045. else:
  1046. raise KeyError(
  1047. "No input element with the name %r" % name)
  1048. def __contains__(self, name):
  1049. results = self._name_xpath(self.form, name=name)
  1050. return bool(results)
  1051. def keys(self):
  1052. names = set()
  1053. for el in self:
  1054. names.add(el.name)
  1055. if None in names:
  1056. names.remove(None)
  1057. return list(names)
  1058. def __iter__(self):
  1059. ## FIXME: kind of dumb to turn a list into an iterator, only
  1060. ## to have it likely turned back into a list again :(
  1061. return iter(self._all_xpath(self.form))
  1062. class InputMixin(object):
  1063. """
  1064. Mix-in for all input elements (input, select, and textarea)
  1065. """
  1066. @property
  1067. def name(self):
  1068. """
  1069. Get/set the name of the element
  1070. """
  1071. return self.get('name')
  1072. @name.setter
  1073. def name(self, value):
  1074. self.set('name', value)
  1075. @name.deleter
  1076. def name(self):
  1077. attrib = self.attrib
  1078. if 'name' in attrib:
  1079. del attrib['name']
  1080. def __repr__(self):
  1081. type_name = getattr(self, 'type', None)
  1082. if type_name:
  1083. type_name = ' type=%r' % type_name
  1084. else:
  1085. type_name = ''
  1086. return '<%s %x name=%r%s>' % (
  1087. self.__class__.__name__, id(self), self.name, type_name)
  1088. class TextareaElement(InputMixin, HtmlElement):
  1089. """
  1090. ``<textarea>`` element. You can get the name with ``.name`` and
  1091. get/set the value with ``.value``
  1092. """
  1093. @property
  1094. def value(self):
  1095. """
  1096. Get/set the value (which is the contents of this element)
  1097. """
  1098. content = self.text or ''
  1099. if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
  1100. serialisation_method = 'xml'
  1101. else:
  1102. serialisation_method = 'html'
  1103. for el in self:
  1104. # it's rare that we actually get here, so let's not use ''.join()
  1105. content += etree.tostring(
  1106. el, method=serialisation_method, encoding='unicode')
  1107. return content
  1108. @value.setter
  1109. def value(self, value):
  1110. del self[:]
  1111. self.text = value
  1112. @value.deleter
  1113. def value(self):
  1114. self.text = ''
  1115. del self[:]
  1116. HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
  1117. class SelectElement(InputMixin, HtmlElement):
  1118. """
  1119. ``<select>`` element. You can get the name with ``.name``.
  1120. ``.value`` will be the value of the selected option, unless this
  1121. is a multi-select element (``<select multiple>``), in which case
  1122. it will be a set-like object. In either case ``.value_options``
  1123. gives the possible values.
  1124. The boolean attribute ``.multiple`` shows if this is a
  1125. multi-select.
  1126. """
  1127. @property
  1128. def value(self):
  1129. """
  1130. Get/set the value of this select (the selected option).
  1131. If this is a multi-select, this is a set-like object that
  1132. represents all the selected options.
  1133. """
  1134. if self.multiple:
  1135. return MultipleSelectOptions(self)
  1136. for el in _options_xpath(self):
  1137. if el.get('selected') is not None:
  1138. value = el.get('value')
  1139. if value is None:
  1140. value = el.text or ''
  1141. if value:
  1142. value = value.strip()
  1143. return value
  1144. return None
  1145. @value.setter
  1146. def value(self, value):
  1147. if self.multiple:
  1148. if isinstance(value, basestring):
  1149. raise TypeError("You must pass in a sequence")
  1150. values = self.value
  1151. values.clear()
  1152. values.update(value)
  1153. return
  1154. checked_option = None
  1155. if value is not None:
  1156. value = value.strip()
  1157. for el in _options_xpath(self):
  1158. opt_value = el.get('value')
  1159. if opt_value is None:
  1160. opt_value = el.text or ''
  1161. if opt_value:
  1162. opt_value = opt_value.strip()
  1163. if opt_value == value:
  1164. checked_option = el
  1165. break
  1166. else:
  1167. raise ValueError(
  1168. "There is no option with the value of %r" % value)
  1169. for el in _options_xpath(self):
  1170. if 'selected' in el.attrib:
  1171. del el.attrib['selected']
  1172. if checked_option is not None:
  1173. checked_option.set('selected', '')
  1174. @value.deleter
  1175. def value(self):
  1176. # FIXME: should del be allowed at all?
  1177. if self.multiple:
  1178. self.value.clear()
  1179. else:
  1180. self.value = None
  1181. @property
  1182. def value_options(self):
  1183. """
  1184. All the possible values this select can have (the ``value``
  1185. attribute of all the ``<option>`` elements.
  1186. """
  1187. options = []
  1188. for el in _options_xpath(self):
  1189. value = el.get('value')
  1190. if value is None:
  1191. value = el.text or ''
  1192. if value:
  1193. value = value.strip()
  1194. options.append(value)
  1195. return options
  1196. @property
  1197. def multiple(self):
  1198. """
  1199. Boolean attribute: is there a ``multiple`` attribute on this element.
  1200. """
  1201. return 'multiple' in self.attrib
  1202. @multiple.setter
  1203. def multiple(self, value):
  1204. if value:
  1205. self.set('multiple', '')
  1206. elif 'multiple' in self.attrib:
  1207. del self.attrib['multiple']
  1208. HtmlElementClassLookup._default_element_classes['select'] = SelectElement
  1209. class MultipleSelectOptions(SetMixin):
  1210. """
  1211. Represents all the selected options in a ``<select multiple>`` element.
  1212. You can add to this set-like option to select an option, or remove
  1213. to unselect the option.
  1214. """
  1215. def __init__(self, select):
  1216. self.select = select
  1217. @property
  1218. def options(self):
  1219. """
  1220. Iterator of all the ``<option>`` elements.
  1221. """
  1222. return iter(_options_xpath(self.select))
  1223. def __iter__(self):
  1224. for option in self.options:
  1225. if 'selected' in option.attrib:
  1226. opt_value = option.get('value')
  1227. if opt_value is None:
  1228. opt_value = option.text or ''
  1229. if opt_value:
  1230. opt_value = opt_value.strip()
  1231. yield opt_value
  1232. def add(self, item):
  1233. for option in self.options:
  1234. opt_value = option.get('value')
  1235. if opt_value is None:
  1236. opt_value = option.text or ''
  1237. if opt_value:
  1238. opt_value = opt_value.strip()
  1239. if opt_value == item:
  1240. option.set('selected', '')
  1241. break
  1242. else:
  1243. raise ValueError(
  1244. "There is no option with the value %r" % item)
  1245. def remove(self, item):
  1246. for option in self.options:
  1247. opt_value = option.get('value')
  1248. if opt_value is None:
  1249. opt_value = option.text or ''
  1250. if opt_value:
  1251. opt_value = opt_value.strip()
  1252. if opt_value == item:
  1253. if 'selected' in option.attrib:
  1254. del option.attrib['selected']
  1255. else:
  1256. raise ValueError(