/mechanize/_html.py

https://github.com/tmiyamon/mechanize-python

"""HTML handling.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import codecs
import copy
import htmlentitydefs
import re

import _sgmllib_copy as sgmllib

import _beautifulsoup
import _form
from _headersutil import split_header_words, is_html as _is_html
import _request
import _rfc3986

DEFAULT_ENCODING = "latin-1"

COMPRESS_RE = re.compile(r"\s+")


class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable."""

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        cache = self._cache
        for item in cache:
            yield item
        for item in self._iterator:
            cache.append(item)
            yield item
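

# Illustrative sketch, not part of the original module: repeated calls to a
# CachingGeneratorFunction replay already-cached items before consuming more
# of the wrapped iterator, so a one-shot generator can be iterated over
# several times with consistent results.
def _example_caching_generator_function():
    source = iter(["a", "b", "c"])            # non-restartable iterator
    cached = CachingGeneratorFunction(source)
    first = list(cached())                    # consumes the iterator
    second = list(cached())                   # replayed from the cache
    return first == second                    # True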


class EncodingFinder:
    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    encoding = v
                    try:
                        codecs.lookup(v)
                    except LookupError:
                        continue
                    else:
                        return encoding
        return self._default_encoding
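

# Illustrative sketch, not part of the original module: split_header_words
# breaks a Content-Type header value into (key, value) pairs, which is how
# EncodingFinder locates a declared charset; the charset is only used if
# codecs.lookup() recognises it, otherwise the default encoding wins.
def _example_encoding_finder():
    # For "text/html; charset=utf-8" the pairs are expected to include
    # ("charset", "utf-8"), so EncodingFinder would return "utf-8".
    pairs = split_header_words(["text/html; charset=utf-8"])[0]
    return dict(pairs).get("charset")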


class ResponseTypeFinder:
    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, self._allow_xhtml)


class Args(object):

    # idea for this argument-processing trick is from Peter Otten

    def __init__(self, args_map):
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        if key == "dictionary":
            raise AttributeError()
        self.dictionary[key] = value


def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    return Args(locals())
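

# Illustrative sketch, not part of the original module: form_parser_args()
# captures its keyword arguments via locals() and wraps them in an Args
# object, so callers can read and update them attribute-style and then pass
# the whole set along with **args.dictionary (as RobustFormsFactory does
# below).
def _example_form_parser_args():
    args = form_parser_args(select_default=True)
    assert args.select_default is True   # read through __getattr__
    args.backwards_compat = True         # updated through __setattr__
    return args.dictionary               # plain dict of all four settings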


class Link:
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs

    def __cmp__(self, other):
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
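

# Illustrative sketch, not part of the original module: a Link resolves its
# (possibly relative) URL against the document's base URL at construction
# time, so .absolute_url is immediately usable by navigation code.
def _example_link():
    link = Link("http://example.com/dir/", "page.html", "click here", "a",
                [("href", "page.html")])
    return link.absolute_url   # "http://example.com/dir/page.html"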


class LinksFactory:

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url, encoding):
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)
        try:
            for token in p.tags(*(self.urltags.keys()+["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for e.g.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)
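

# Illustrative sketch, not part of the original module: LinksFactory is fed a
# response object plus a base URL and encoding, after which .links() yields
# Link objects lazily.  Here `response` is assumed to implement the same
# interface as objects returned by mechanize.urlopen().
def _example_links_factory(response):
    factory = LinksFactory()
    factory.set_response(response, response.geturl(), "utf-8")
    return [link.absolute_url for link in factory.links()]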


class FormsFactory:

    """Makes a sequence of objects satisfying HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ParseResponse argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = _form.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        encoding = self.encoding
        forms = _form.ParseResponseEx(
            self._response,
            select_default=self.select_default,
            form_parser_class=self.form_parser_class,
            request_class=self.request_class,
            encoding=encoding,
            _urljoin=_rfc3986.urljoin,
            _urlparse=_rfc3986.urlsplit,
            _urlunparse=_rfc3986.urlunsplit,
            )
        self.global_form = forms[0]
        return forms[1:]
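

# Illustrative sketch, not part of the original module: ParseResponseEx
# returns the "global" form (controls outside any FORM element) first,
# followed by the real forms, which is why .forms() stores forms[0] on
# .global_form and returns the rest.  As above, `response` is assumed to
# implement the mechanize response interface.
def _example_forms_factory(response):
    factory = FormsFactory()
    factory.set_response(response, "utf-8")
    forms = factory.forms()                  # HTMLForm-like objects
    return len(forms), factory.global_form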


class TitleFactory:
    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)


def unescape(data, entities, encoding):
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            repl = ent

        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)


def unescape_charref(data, encoding):
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            repl = "&#%s;" % data
        return repl
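

# Illustrative sketch, not part of the original module: entity and character
# references are decoded via the name2codepoint table and re-encoded into the
# document's encoding; references that cannot be decoded are left untouched.
def _example_unescape():
    text = unescape("Tom &amp; Jerry &#174;",
                    htmlentitydefs.name2codepoint, "latin-1")
    # text == "Tom & Jerry \xae"  (registered-sign byte in latin-1)
    char = unescape_charref("x41", "ascii")
    # char == "A"  (hexadecimal character reference)
    return text, char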


class MechanizeBs(_beautifulsoup.BeautifulSoup):
    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda(x): x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda(x): '<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        t = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def handle_entityref(self, ref):
        t = unescape("&%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def unescape_attrs(self, attrs):
        escaped_attrs = []
        for key, val in attrs:
            val = unescape(val, self._entitydefs, self._encoding)
            escaped_attrs.append((key, val))
        return escaped_attrs


class RobustLinksFactory:

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)


class RobustFormsFactory(FormsFactory):
    def __init__(self, *args, **kwds):
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = _form.RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding


class RobustTitleFactory:
    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            return None
        else:
            inner_html = "".join([str(node) for node in title.contents])
            return COMPRESS_RE.sub(" ", inner_html.strip())


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this right
         without resorting to this default.  The default value of this
         parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set request class (mechanize.Request by default).

        HTMLForm instances returned by .forms() will return instances of this
        class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by mechanize.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
        return self._links_genf()
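

# Illustrative sketch, not part of the original module: the LAZY_ATTRS
# machinery means that reading factory.encoding, .is_html, .title or
# .global_form computes the value on first access (via __getattr__), caches
# it as an instance attribute, and discards it again on set_response().
def _example_factory_lazy_attrs(factory, response):
    factory.set_response(response)
    encoding = factory.encoding        # computed now, then cached
    is_html = factory.is_html          # reuses the cached encoding
    factory.set_response(response)     # clears the cached lazy attributes
    return encoding, is_html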


class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_response(
                copy.copy(response), response.geturl(), self.encoding)
            self._title_factory.set_response(
                copy.copy(response), self.encoding)


class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)
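

# Illustrative sketch, not part of the original module: in normal use these
# factories are not driven directly; an instance is passed to
# mechanize.Browser, which calls set_response() for each page it fetches and
# exposes the results through Browser.forms(), .links() and .title().
# Assumes the usual top-level mechanize API.
#
#     import mechanize
#     browser = mechanize.Browser(factory=mechanize.RobustFactory())
#     browser.open("http://example.com/")
#     page_title = browser.title()
#     link_urls = [link.absolute_url for link in browser.links()]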