_html.py | searchcode

/other/FetchData/mechanize/_html.py

http://github.com/jbeezley/wrf-fire
Python | 631 lines | 543 code | 46 blank | 42 comment | 52 complexity | 069652310b4beeec4d36a7e4a949046a MD5 | raw file
Possible License(s): AGPL-1.0

"""HTML handling.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import re, copy, htmlentitydefs
import sgmllib, ClientForm

import _request
from _headersutil import split_header_words, is_html as _is_html
import _rfc3986

DEFAULT_ENCODING = "latin-1"

COMPRESS_RE = re.compile(r"\s+")


# the base classe is purely for backwards compatibility
class ParseError(ClientForm.ParseError): pass


class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable."""

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        cache = self._cache
        for item in cache:
            yield item
        for item in self._iterator:
            cache.append(item)
            yield item


class EncodingFinder:
    def __init__(self, default_encoding):
        self._default_encoding = default_encoding
    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    return v
        return self._default_encoding

class ResponseTypeFinder:
    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml
    def is_html(self, response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, self._allow_xhtml)


# idea for this argument-processing trick is from Peter Otten
class Args:
    def __init__(self, args_map):
        self.dictionary = dict(args_map)
    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    return Args(locals())


class Link:
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
    def __cmp__(self, other):
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0
    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)


class LinksFactory:

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url, encoding):
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            for token in p.tags(*(self.urltags.keys()+["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                name = attrs.get("name")
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for eg.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            raise ParseError(exc)

class FormsFactory:

    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        import ClientForm
        encoding = self.encoding
        try:
            forms = ClientForm.ParseResponseEx(
                self._response,
                select_default=self.select_default,
                form_parser_class=self.form_parser_class,
                request_class=self.request_class,
                encoding=encoding,
                _urljoin=_rfc3986.urljoin,
                _urlparse=_rfc3986.urlsplit,
                _urlunparse=_rfc3986.urlunsplit,
                )
        except ClientForm.ParseError, exc:
            raise ParseError(exc)
        self.global_form = forms[0]
        return forms[1:]

class TitleFactory:
    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise ParseError(exc)


def unescape(data, entities, encoding):
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)

def unescape_charref(data, encoding):
    name, base = data, 10
    if name.startswith("x"):
        name, base= name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            repl = "&#%s;" % data
        return repl


# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")

class MechanizeBs(_beautifulsoup.BeautifulSoup):
    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda(x):x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda(x):'<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def handle_entityref(self, ref):
        t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def unescape_attrs(self, attrs):
        escaped_attrs = []
        for key, val in attrs:
            val = unescape(val, self._entitydefs, self._encoding)
            escaped_attrs.append((key, val))
        return escaped_attrs

class RobustLinksFactory:

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        gen = bs.recursiveChildGenerator()
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)


class RobustFormsFactory(FormsFactory):
    def __init__(self, *args, **kwds):
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding


class RobustTitleFactory:
    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        import _beautifulsoup
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            return None
        else:
            inner_html = "".join([str(node) for node in title.contents])
            return COMPRESS_RE.sub(" ", inner_html.strip())


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this right
         without resorting to this default.  The default value of this
         parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
        return self._links_genf()

class DefaultFactory(Factory):
    """Based on sgmllib."""
    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_response(
                copy.copy(response), response.geturl(), self.encoding)
            self._title_factory.set_response(
                copy.copy(response), self.encoding)

class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """
    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)