/src/lxml/html/__init__.py
Python | 1451 lines | 1214 code | 66 blank | 171 comment | 125 complexity | a17751397ef7291d9d3c0669b98738c4 MD5 | raw file
- # Copyright (c) 2004 Ian Bicking. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are
- # met:
- #
- # 1. Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in
- # the documentation and/or other materials provided with the
- # distribution.
- #
- # 3. Neither the name of Ian Bicking nor the names of its contributors may
- # be used to endorse or promote products derived from this software
- # without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- """The ``lxml.html`` tool set for HTML handling.
- """
- from __future__ import absolute_import
- __all__ = [
- 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
- 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
- 'find_rel_links', 'find_class', 'make_links_absolute',
- 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse']
- import copy
- import sys
- import re
- from functools import partial
- try:
- # while unnecessary, importing from 'collections.abc' is the right way to do it
- from collections.abc import MutableMapping, MutableSet
- except ImportError:
- from collections import MutableMapping, MutableSet
- from .. import etree
- from . import defs
- from ._setmixin import SetMixin
- try:
- from urlparse import urljoin
- except ImportError:
- # Python 3
- from urllib.parse import urljoin
- try:
- unicode
- except NameError:
- # Python 3
- unicode = str
- try:
- basestring
- except NameError:
- # Python 3
- basestring = (str, bytes)
- def __fix_docstring(s):
- if not s:
- return s
- if sys.version_info[0] >= 3:
- sub = re.compile(r"^(\s*)u'", re.M).sub
- else:
- sub = re.compile(r"^(\s*)b'", re.M).sub
- return sub(r"\1'", s)
# Namespace URI used to recognise XHTML-namespaced elements.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions. Each one matches both the plain tag and the
# XHTML-namespaced tag (bound to the 'x' prefix).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches elements whose space-normalised 'class' attribute contains
# $class_name as a whole, space-delimited word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Matches elements whose 'id' attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# Evaluates to the XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over ``url(...)`` occurrences (case-insensitive); group 1 is the
# argument, still including any surrounding single or double quotes.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over ``@import "..."`` occurrences; only the double-quoted form
# is matched. Group 1 is the text between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# Matches <label> elements anywhere in the document whose 'for' equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches each run of non-space characters (one entry of a space-separated list).
_archive_re = re.compile(r'[^ ]+')
# Searches a meta-refresh 'content' value: skips everything up to the first
# ';', then an optional ``url=``, and captures the remainder as group 'url'.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
- def _unquote_match(s, pos):
- if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
- return s[1:-1], pos+1
- else:
- return s,pos
def _transform_result(typ, result):
    """Convert ``result`` back into the caller's input type ``typ``.

    For a ``bytes`` subclass the result is serialised with ``tostring()`` as
    UTF-8, for a ``unicode`` subclass it is serialised as a unicode string,
    and for any other type ``result`` is returned untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace qualifier.

    Non-string tags and tags that do not start with ``{`` followed by the
    XHTML namespace URI are returned unchanged. Otherwise the text after
    the last ``}`` is returned.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.

    The wrapped ``attributes`` object must support ``get()``, item assignment,
    item deletion and ``in`` tests (a plain dict works).

    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        self._attributes = attributes
        # Reads the current raw 'class' value ('' when the attribute is missing).
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Raise ValueError if ``value`` is empty or contains whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing. The 'class' attribute is
        deleted when no class names remain.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # Cheap substring test first; only split when it might match.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Every name is validated like in ``add()``; on an invalid name a
        ValueError is raised and the attribute is left untouched.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            self._check_name(value)
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            # Written once at the end, so a failed validation changes nothing.
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        current = self._get_class_value().split()
        # Drop *every* occurrence so a duplicated name cannot survive a
        # toggle that reports the class as disabled.
        remaining = [name for name in current if name != value]
        enabled = len(remaining) == len(current)
        if enabled:
            remaining.append(value)
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        else:
            del self._attributes['class']
        return enabled
class HtmlMixin(object):
    """Mixin with the HTML-specific helpers shared by the node classes.

    It expects to be combined with an element-like base class that provides
    ``attrib``, ``get()``/``set()``, ``xpath()``, ``iter()``, ``getparent()``,
    ``getprevious()`` and ``getroottree()``.
    """

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The association is made through this element's ``id`` and the
        label's ``for`` attribute; None is returned when this element has
        no id or no label refers to it.
        """
        el_id = self.get('id')
        if not el_id:
            return None
        result = _label_xpath(self, id=el_id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        el_id = self.get('id')
        if not el_id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', el_id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the children are hoisted.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element by its children, in place.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        The same policy is applied while resolving ``<base href>``.

        Raises TypeError if no base URL is available and ValueError for an
        unknown ``handle_failures`` value; in both cases the document has
        not been modified.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Reject a bad policy up front, before <base> tags get dropped.
        if handle_failures not in (None, 'ignore', 'discard'):
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            self.resolve_base_href(handle_failures=handle_failures)

        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                if handle_failures is None:
                    raise
                # 'ignore' keeps the link as it is, 'discard' removes it.
                return href if handle_failures == 'ignore' else None

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # Every <base href> tag is removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            # The callback sees the link with surrounding whitespace removed.
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # The link lives in the element text: splice it in at pos.
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
class _MethodFunc(object):
    """
    Function-style wrapper around a method of an element.

    The resulting callable accepts either an element or an HTML string as
    its first argument.  If the wrapped method returns a value, that value
    is returned; if it works in-place (and so returns None), the resulting
    document is returned instead, converted back to the input type.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Expose the wrapped method's documentation on the wrapper.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            # String input is always parsed into a fresh tree.
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # An explicit ``copy=`` keyword overrides the configured default.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
# Module-level function versions of the HtmlMixin link/search methods.
# The wrappers created with copy=True deep-copy an element argument by
# default before calling the method; the copy=False ones use it directly.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that adds the HtmlMixin API to etree.CommentBase."""
    pass
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that adds the HtmlMixin API to etree.ElementBase."""
    # Override etree.ElementBase.cssselect, despite the MRO
    cssselect = HtmlMixin.cssselect
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class that adds the HtmlMixin API to etree.PIBase."""
    pass
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that adds the HtmlMixin API to etree.EntityBase."""
    pass
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.
    """
    # Tag name -> Element class; filled in by the modules that define
    # specialised element classes.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so
            # normalise to (name, mixin) pairs first.
            if hasattr(mixins, 'items'):
                mixin_items = mixins.items()
            else:
                mixin_items = mixins
            mixers = {}
            for name, value in mixin_items:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins go first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the default lookup.

        Elements are looked up by lower-cased tag name and fall back to
        HtmlElement; comments, processing instructions and entities get
        their dedicated Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
- ################################################################################
- # parsing
- ################################################################################
# Matchers that tell whether the input starts (after optional whitespace)
# with an <html> tag or a doctype declaration, compared case-insensitively.
# One variant for unicode strings and one for byte strings.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """
    Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, an empty ``<head>`` is inserted as first
    child and/or an empty ``<body>`` is appended when the root has none.

    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    root = etree.fromstring(
        html, html_parser if parser is None else parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (leading text that consists
    only of whitespace is dropped).  If no_leading_text is true, then it
    will be an error if there is leading text, and it will always be a
    list of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document gets wrapped in one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            opening = '<html><body>'.encode('ascii')
            closing = '</body></html>'.encode('ascii')
            html = opening + html + closing
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading = body.text
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % body.text)
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.  A true value that is not a string selects
    ``'div'`` as the parent tag.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a wrapper element can hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not bool(create_parent),
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements and isinstance(elements[0], basestring):
            new_root.text = elements.pop(0)
        new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % names)
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % result.tail)
    result.tail = None
    return result
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The result is one of:

    * the whole document, if the input starts with ``<html`` or a doctype,
      if a ``<head>`` was found, or if there is no ``<body>``;
    * the single child of the body, if the body holds exactly one element
      and no non-whitespace text around it;
    * otherwise the body itself, renamed to ``div`` (if it contains a
      block-level tag) or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # Sniff the raw input before parsing.
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    # Append the extra body's leading text after the content
                    # collected so far.
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.  The module's
    ``html_parser`` is used unless ``parser`` is given.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it has a block-level tag.

    Tags are compared without their XHTML namespace against
    ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
def _element_name(el):
    """Return a short display name for a parsed fragment item.

    Comments are reported as ``'comment'``, plain strings as ``'string'``
    and everything else by its tag name without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
- ################################################################################
- # form handling
- ################################################################################
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.

        Assigning a mapping sets every field named in it and resets all
        other named fields to None.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        fields = self.fields
        # Take a private list copy: keys() may return a view object, which
        # has no remove() method.
        prev_keys = list(fields.keys())
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        # Whatever is left was not mentioned in the new mapping: clear it.
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    def _name(self):
        """Return a name for this form.

        This is the 'name' attribute, or '#' plus the 'id' attribute, or
        else the index of this form among the forms below the document
        body, as a string.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        iter_tags = self.body.iter
        forms = list(iter_tags('form'))
        if not forms:
            forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs, inputs of type
        submit/image/reset and inputs whose value is None are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per selected option.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        On reading, the value is joined with the document's base URL when
        both are available.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attrib = self.attrib
        if 'action' in attrib:
            del attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())
# Register FormElement as the default element class for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended after the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    payload = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            # Mapping-like input: flatten it to (name, value) pairs.
            extras = extra_values.items()
        else:
            extras = extra_values
        payload.extend(extras)
    opener = open_http_urllib if open_http is None else open_http
    # A form without a usable action submits to the document's own URL.
    target = form.action or form.base_url
    return opener(form.method, target, payload)
def open_http_urllib(method, url, values):
    """
    Open ``url`` with urllib, sending ``values`` as form data.

    For ``method == 'GET'`` the url-encoded values are appended to the
    URL as a query string (joined with ``&`` if the URL already contains
    a ``?``).  For any other method they are sent as the request body.

    :param method: the HTTP method name, compared against ``'GET'``
    :param url: the target URL as a string
    :param values: a sequence of ``(name, value)`` tuples (or a mapping)
    :returns: the object returned by ``urlopen()``
    :raises ValueError: if ``url`` is empty or ``None``
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        separator = '&' if '?' in url else '?'
        url += separator + encoded
        data = None
    else:
        data = encoded
        # On Python 3 urlencode() returns text, but urlopen() only accepts
        # bytes as a request body.  The encoded form is percent-escaped,
        # so it is always pure ASCII.
        if not isinstance(data, bytes):
            data = data.encode('ascii')
    return urlopen(url, data)
class FieldsDict(MutableMapping):
    """
    Mapping view over a form's inputs that exposes field *values*
    instead of the input elements themselves.

    ``inputs`` is the wrapped accessor: it must support lookup by field
    name (returning an object with a ``value`` attribute), ``in`` tests,
    a ``keys()`` method and a ``form`` attribute.  Reading and assigning
    items reads and assigns the ``value`` of the matching input; items
    cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # Fields belong to the underlying form and cannot be removed
        # through this view.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count exactly the names that __iter__ yields.  This keeps the
        # mapping self-consistent and does not require the wrapped
        # accessor to implement __len__ itself.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
class InputGetter(object):
    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    ``len()`` counts the same individual elements that iteration yields.
    """

    # Both expressions match on local-name() so that namespaced
    # (e.g. XHTML) elements are found as well.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the input named ``name``.

        Several radio inputs (or several checkboxes) sharing the name
        come back as a ``RadioGroup`` (or ``CheckboxGroup``); the kind is
        decided by the ``type`` of the first match.  In every other case
        the first matching element is returned.

        Raises ``KeyError`` if no input has that name.
        """
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        input_type = matches[0].get('type')
        group_class = None
        if len(matches) > 1:
            if input_type == 'radio':
                group_class = RadioGroup
            elif input_type == 'checkbox':
                group_class = CheckboxGroup
        if group_class is None:
            # I don't like throwing away elements like this
            return matches[0]
        group = group_class(matches)
        group.name = name
        return group

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """
        Return the unique field names as a list, in the order in which
        they are first met while iterating over the inputs.  Inputs
        without a ``name`` attribute are ignored.
        """
        names = []
        seen = set([None])  # pre-seeding None filters out unnamed inputs
        for el in self:
            name = el.name
            if name not in seen:
                seen.add(name)
                names.append(name)
        return names

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))

    def __len__(self):
        # Number of elements that __iter__ yields.
        return len(self._all_xpath(self.form))
class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea).

    Provides the ``name`` property, backed by the element's ``name``
    attribute, and a ``repr()`` that shows the name and, when the
    element has a truthy ``type`` attribute, the type as well.
    """

    @property
    def name(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # Deleting a name that is not set is a no-op.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Not every user of this mix-in defines ``type``.
        kind = getattr(self, 'type', None)
        suffix = ' type=%r' % kind if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name``; the
    content can be read and written through ``.value``.
    """

    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        Reading returns the element's text followed by the serialisation
        of any child elements.  Assigning replaces the whole content
        with the given text; deleting empties the element.
        """
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if in_xhtml else 'html'
        pieces = [self.text or '']
        # Child elements inside a textarea are rare; when present they
        # are serialised and appended after the leading text.
        for child in self:
            pieces.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop any child elements, then store the new text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]
# Store TextareaElement under the 'textarea' key of HtmlElementClassLookup's
# default element-class table (presumably consulted when parsed <textarea>
# elements are instantiated -- confirm against HtmlElementClassLookup).
- HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    @staticmethod
    def _option_value(option):
        """
        Return the value an ``<option>`` stands for: its ``value``
        attribute, or its text when it has no such attribute, with
        surrounding whitespace stripped.
        """
        value = option.get('value')
        if value is None:
            value = option.text or ''
        if value:
            value = value.strip()
        return value

    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.  Otherwise it is the value
        of the first option carrying a ``selected`` attribute, or
        ``None`` when no option is selected.

        Assigning to a single select marks the matching option as the
        only selected one (``None`` deselects everything) and raises
        ``ValueError`` if no option has that value.  Assigning to a
        multi-select requires a non-string sequence of values and raises
        ``TypeError`` for a string.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is not None:
                return self._option_value(option)
        return None

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        options = _options_xpath(self)
        chosen = None
        if value is not None:
            value = value.strip()
            for option in options:
                if self._option_value(option) == value:
                    chosen = option
                    break
            else:
                # Fail before touching any option so that a rejected
                # assignment leaves the current selection intact.
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in options:
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the option text
        where that attribute is missing).
        """
        return [self._option_value(option)
                for option in _options_xpath(self)]

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']
# Store SelectElement under the 'select' key of HtmlElementClassLookup's
# default element-class table (presumably consulted when parsed <select>
# elements are instantiated -- confirm against HtmlElementClassLookup).
- HtmlElementClassLookup._default_element_classes['select'] = SelectElement
- class MultipleSelectOptions(SetMixin):
- """
- Represents all the selected options in a ``<select multiple>`` element.
- You can add to this set-like option to select an option, or remove
- to unselect the option.
- """
- def __init__(self, select):
- self.select = select
- @property
- def options(self):
- """
- Iterator of all the ``<option>`` elements.
- """
- return iter(_options_xpath(self.select))
- def __iter__(self):
- for option in self.options:
- if 'selected' in option.attrib:
- opt_value = option.get('value')
- if opt_value is None:
- opt_value = option.text or ''
- if opt_value:
- opt_value = opt_value.strip()
- yield opt_value
- def add(self, item):
- for option in self.options:
- opt_value = option.get('value')
- if opt_value is None:
- opt_value = option.text or ''
- if opt_value:
- opt_value = opt_value.strip()
- if opt_value == item:
- option.set('selected', '')
- break
- else:
- raise ValueError(
- "There is no option with the value %r" % item)
- def remove(self, item):
- for option in self.options:
- opt_value = option.get('value')
- if opt_value is None:
- opt_value = option.text or ''
- if opt_value:
- opt_value = opt_value.strip()
- if opt_value == item:
- if 'selected' in option.attrib:
- del option.attrib['selected']
- else:
- raise ValueError(
-