/other/FetchData/ClientForm.py
Python | 3401 lines | 3209 code | 69 blank | 123 comment | 84 complexity | e39b95cd355883183b9cc33194d1a082 MD5 | raw file
Possible License(s): AGPL-1.0
Large files are truncated, but you can click here to view the full file
- """HTML form handling for web clients.
- ClientForm is a Python module for handling HTML forms on the client
- side, useful for parsing HTML forms, filling them in and returning the
- completed forms to the server. It has developed from a port of Gisle
- Aas' Perl module HTML::Form, from the libwww-perl library, but the
- interface is not the same.
- The most useful docstring is the one for HTMLForm.
- RFC 1866: HTML 2.0
- RFC 1867: Form-based File Upload in HTML
- RFC 2388: Returning Values from Forms: multipart/form-data
- HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
- HTML 4.01 Specification, W3C Recommendation 24 December 1999
- Copyright 2002-2007 John J. Lee <jjl@pobox.com>
- Copyright 2005 Gary Poster
- Copyright 2005 Zope Corporation
- Copyright 1998-2000 Gisle Aas.
- This code is free software; you can redistribute it and/or modify it
- under the terms of the BSD or ZPL 2.1 licenses (see the file
- COPYING.txt included with the distribution).
- """
- # XXX
- # Remove parser testing hack
- # safeUrl()-ize action
- # Switch to unicode throughout (would be 0.3.x)
- # See Wichert Akkerman's 2004-01-22 message to c.l.py.
- # Add charset parameter to Content-type headers? How to find value??
- # Add some more functional tests
- # Especially single and multiple file upload on the internet.
- # Does file upload work when name is missing? Sourceforge tracker form
- # doesn't like it. Check standards, and test with Apache. Test
- # binary upload with Apache.
- # mailto submission & enctype text/plain
- # I'm not going to fix this unless somebody tells me what real servers
- # that want this encoding actually expect: If enctype is
- # application/x-www-form-urlencoded and there's a FILE control present.
- # Strictly, it should be 'name=data' (see HTML 4.01 spec., section
- # 17.13.2), but I send "name=" ATM. What about multiple file upload??
- # Would be nice, but I'm not going to do it myself:
- # -------------------------------------------------
- # Maybe a 0.4.x?
- # Replace by_label etc. with moniker / selector concept. Allows, eg.,
- # a choice between selection by value / id / label / element
- # contents. Or choice between matching labels exactly or by
- # substring. Etc.
- # Remove deprecated methods.
- # ...what else?
- # Work on DOMForm.
- # XForms? Don't know if there's a need here.
- __all__ = ['AmbiguityError', 'CheckboxControl', 'Control',
- 'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',
- 'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',
- 'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',
- 'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile',
- 'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl',
- 'RadioControl', 'ScalarControl', 'SelectControl',
- 'SubmitButtonControl', 'SubmitControl', 'TextControl',
- 'TextareaControl', 'XHTMLCompatibleFormParser']
- try: True
- except NameError:
- True = 1
- False = 0
- try: bool
- except NameError:
- def bool(expr):
- if expr: return True
- else: return False
try:
    import logging
    import inspect
except ImportError:
    def debug(msg, *args, **kwds):
        """No-op stand-in used when logging or inspect cannot be imported."""
        pass
else:
    _logger = logging.getLogger("ClientForm")
    # While this flag is true, debug() returns before doing any work (in
    # particular before the inspect.stack() call).
    OPTIMIZATION_HACK = True

    def debug(msg, *args, **kwds):
        """Log msg on the "ClientForm" logger, prefixed by the caller's name.

        msg is a %-style format string and args are its arguments; kwds are
        passed through to Logger.debug.  Does nothing while OPTIMIZATION_HACK
        is true.  Always returns None.
        """
        if OPTIMIZATION_HACK:
            return
        caller_name = inspect.stack()[1][3]
        # "%%s" survives the interpolation below as "%s", which the logger
        # then fills in with caller_name.
        extended_msg = '%%s %s' % msg
        extended_args = (caller_name,) + args
        # The result used to be bound to a local called "debug", which only
        # shadowed this function's own name; just make the call.
        _logger.debug(extended_msg, *extended_args, **kwds)

    def _show_debug_messages():
        """Switch debug() on and send DEBUG output to sys.stdout."""
        global OPTIMIZATION_HACK
        OPTIMIZATION_HACK = False
        _logger.setLevel(logging.DEBUG)
        # sys is imported at module level further down the file, which has
        # happened by the time this function can be called.
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        _logger.addHandler(handler)
- import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
- htmlentitydefs, re, random
- from cStringIO import StringIO
- import sgmllib
- # monkeypatch to fix http://www.python.org/sf/803422 :-(
- sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
- # HTMLParser.HTMLParser is recent, so live without it if it's not available
- # (also, sgmllib.SGMLParser is much more tolerant of bad HTML)
- try:
- import HTMLParser
- except ImportError:
- HAVE_MODULE_HTMLPARSER = False
- else:
- HAVE_MODULE_HTMLPARSER = True
try:
    import warnings
except ImportError:
    def deprecation(message, stack_offset=0):
        """Do nothing: the warnings module is unavailable."""
        pass
else:
    def deprecation(message, stack_offset=0):
        """Issue a DeprecationWarning carrying message.

        The warning is attributed three frames up from warnings.warn, plus
        stack_offset additional frames.
        """
        depth = 3 + stack_offset
        warnings.warn(message, DeprecationWarning, stacklevel=depth)
- VERSION = "0.2.10"
- CHUNK = 1024 # size of chunks fed to parser, in bytes
- DEFAULT_ENCODING = "latin-1"
class Missing:
    """Empty marker class exported in __all__.

    NOTE(review): presumably used as a "no value supplied" sentinel; its
    users are outside this part of the file -- confirm.
    """
_compress_re = re.compile(r"\s+")


def compress_text(text):
    """Return text stripped, with each whitespace run replaced by one space."""
    trimmed = text.strip()
    return _compress_re.sub(" ", trimmed)
def normalize_line_endings(text):
    """Return text with every lone "\\n" or lone "\\r" turned into "\\r\\n".

    Existing "\\r\\n" pairs are left untouched.
    """
    # A "\n" not preceded by "\r", or a "\r" not followed by "\n".
    lone_break = r"(?:(?<!\r)\n)|(?:\r(?!\n))"
    return re.sub(lone_break, "\r\n", text)
- # This version of urlencode is from my Python 1.5.2 back-port of the
- # Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
- # of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query
    string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple (a non-empty string, for example).
    """
    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items fail in len(); non-empty strings fail the
            # tuple test.  Zero-length sequences of all types get through,
            # which matches the treatment of empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError()
        except TypeError:
            # The message is the exception's only argument; the traceback
            # object used to be passed as a second argument and ended up in
            # the exception's args.
            raise TypeError("not a valid non-string sequence or mapping "
                            "object")

    quote_plus = urllib.quote_plus
    pairs = []
    if not doseq:
        # preserve old behavior: every value is stringified as a whole
        for k, v in query:
            pairs.append(quote_plus(str(k)) + '=' + quote_plus(str(v)))
    else:
        for k, v in query:
            k = quote_plus(str(k))
            # isinstance rather than an exact type comparison, so that a
            # string subclass is not mistaken for a sequence and split into
            # one parameter per character.
            if isinstance(v, str):
                pairs.append(k + '=' + quote_plus(v))
            elif isinstance(v, type(u"")):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                pairs.append(k + '=' + quote_plus(v.encode("ASCII", "replace")))
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    pairs.append(k + '=' + quote_plus(str(v)))
                else:
                    # loop over the sequence
                    for elt in v:
                        pairs.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(pairs)
def unescape(data, entities, encoding=DEFAULT_ENCODING):
    """Replace entity and character references in data.

    entities maps complete references such as "&amp;" to their replacement.
    Numeric references are handed to unescape_charref.  A reference that is
    not in entities, or whose replacement cannot be encoded with encoding,
    is left as it was.  data is returned unchanged when it is None or
    contains no "&".
    """
    if data is None or "&" not in data:
        return data

    # entities/encoding are bound as defaults so the nested function does
    # not depend on nested scopes.
    def replace_entities(match, entities=entities, encoding=encoding):
        token = match.group()
        if token[1] == "#":
            return unescape_charref(token[2:-1], encoding)
        replacement = entities.get(token)
        if replacement is None:
            return token
        if type(replacement) == type(""):
            return replacement
        try:
            return replacement.encode(encoding)
        except UnicodeError:
            return token

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
    """Decode the body of a numeric character reference.

    data is the text between "&#" and ";": decimal digits, or "x"/"X"
    followed by hexadecimal digits.  With encoding None the decoded
    character is returned as is; otherwise it is encoded with encoding.

    The literal reference "&#<data>;" is returned when the character cannot
    be encoded, or when data is not a valid reference (bad digits, or a code
    point outside the supported range).
    """
    name, base = data, 10
    # Accept both hex markers: only the lowercase "x" used to be recognised,
    # so "X41" reached int(..., 10) and raised ValueError.
    if name[:1] in ("x", "X"):
        name, base = name[1:], 16
    try:
        uc = unichr(int(name, base))
    except (ValueError, OverflowError):
        # Malformed or out-of-range reference: leave the text untouched, as
        # is done for unencodable characters below.
        return "&#%s;" % data
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
def get_entitydefs():
    """Return a dict mapping "&name;" to the character it stands for.

    Built from htmlentitydefs.name2codepoint when that attribute exists,
    otherwise from the latin-1 strings in htmlentitydefs.entitydefs.
    """
    import htmlentitydefs
    from codecs import latin_1_decode

    table = {}
    if hasattr(htmlentitydefs, "name2codepoint"):
        for name, codepoint in htmlentitydefs.name2codepoint.items():
            table["&%s;" % name] = unichr(codepoint)
        return table

    # Fallback: entitydefs values are either a latin-1 character or a
    # "&#NNN;" reference that still has to be decoded.
    for name, char in htmlentitydefs.entitydefs.items():
        uc = latin_1_decode(char)[0]
        if uc.startswith("&#") and uc.endswith(";"):
            uc = unescape_charref(uc[2:-1], None)
        table["&%s;" % name] = uc
    return table
def issequence(x):
    """Return True if x can be indexed with an integer.

    An IndexError from x[0] still counts (an empty sequence); a TypeError or
    KeyError does not.
    """
    try:
        x[0]
    except IndexError:
        return True
    except (TypeError, KeyError):
        return False
    return True
def isstringlike(x):
    """Return True if x supports concatenation with a string (x + "")."""
    try:
        x + ""
    except Exception:
        # Any ordinary failure means "not string-like".  A bare except would
        # also have swallowed KeyboardInterrupt and SystemExit.
        return False
    return True
def choose_boundary():
    """Return a string usable as a multipart boundary.

    The result is 27 dashes followed by three random non-negative integers
    written in decimal.
    """
    # follow IE and firefox
    try:
        upper = sys.maxint - 1
    except AttributeError:
        # interpreters without sys.maxint
        upper = sys.maxsize - 1
    nonce = "".join([str(random.randint(0, upper)) for _ in (0, 1, 2)])
    return "-" * 27 + nonce
- # This cut-n-pasted MimeWriter from standard library is here so can add
- # to HTTP headers rather than message body when appropriate. It also uses
- # \r\n in place of \n. This is a bit nasty.
class MimeWriter:
    """Generic MIME writer.

    Methods:

    __init__()
    addheader()
    flushheaders()
    startbody()
    startmultipartbody()
    nextpart()
    lastpart()

    A MIME writer is much more primitive than a MIME parser.  It
    doesn't seek around on the output file, and it doesn't use large
    amounts of buffer space, so you have to write the parts in the
    order they should occur on the output file.  It does buffer the
    headers you add, allowing you to rearrange their order.

    General usage is:

    f = <open the output file>
    w = MimeWriter(f)
    ...call w.addheader(key, value) 0 or more times...

    followed by either:

    f = w.startbody(content_type)
    ...call f.write(data) for body data...

    or:

    w.startmultipartbody(subtype)
    for each part:
        subwriter = w.nextpart()
        ...use the subwriter's methods to create the subpart...
    w.lastpart()

    The subwriter is another MimeWriter instance, and should be
    treated in the same way as the toplevel MimeWriter.  This way,
    writing recursive body parts is easy.

    Warning: don't forget to call lastpart()!

    XXX There should be more state so calls made in the wrong order
    are detected.

    Some special cases:

    - startbody() just returns the file passed to the constructor;
      but don't use this knowledge, as it may be changed.

    - startmultipartbody() actually returns a file as well;
      this can be used to write the initial 'if you can read this your
      mailer is not MIME-aware' message.

    - If you call flushheaders(), the headers accumulated so far are
      written out (and forgotten); this is useful if you don't need a
      body part at all, e.g. for a subpart of type message/rfc822
      that's (mis)used to store some header-like information.

    - Passing a keyword argument 'prefix=<flag>' to addheader(),
      start*body() affects where the header is inserted; 0 means
      append at the end, 1 means insert at the start; default is
      append for addheader(), but insert for start*body(), which use
      it to determine where the Content-type header goes.

    Line endings written by this class are always "\\r\\n".
    """

    def __init__(self, fp, http_hdrs=None):
        """fp is the output file; http_hdrs, if given, is a list that
        receives (name, value) pairs for headers added with
        add_to_http_hdrs true."""
        self._http_hdrs = http_hdrs
        self._fp = fp
        self._headers = []
        self._boundary = []
        self._first_part = True

    def addheader(self, key, value, prefix=0,
                  add_to_http_hdrs=0):
        """Buffer a header, or append it to the http_hdrs list.

        prefix is ignored if add_to_http_hdrs is true.
        """
        lines = value.split("\r\n")
        # drop empty leading and trailing lines
        while lines and not lines[-1]:
            del lines[-1]
        while lines and not lines[0]:
            del lines[0]
        if add_to_http_hdrs:
            # HTTP headers are unfolded onto a single line
            value = "".join(lines)
            # 2.2 urllib2 doesn't normalize header case
            self._http_hdrs.append((key.capitalize(), value))
        else:
            # continuation lines are folded with one leading space
            for i in range(1, len(lines)):
                lines[i] = " " + lines[i].strip()
            value = "\r\n".join(lines) + "\r\n"
            line = key.title() + ": " + value
            if prefix:
                self._headers.insert(0, line)
            else:
                self._headers.append(line)

    def flushheaders(self):
        """Write out, and forget, the headers buffered so far."""
        self._fp.writelines(self._headers)
        self._headers = []

    def startbody(self, ctype=None, plist=None, prefix=1,
                  add_to_http_hdrs=0, content_type=1):
        """Emit the Content-Type header (if any) and return the body file.

        plist is an optional iterable of (name, value) content-type
        parameters.  prefix is ignored if add_to_http_hdrs is true.
        """
        if content_type and ctype:
            # plist defaults to None instead of a shared mutable list
            for name, value in plist or ():
                ctype = ctype + ';\r\n %s=%s' % (name, value)
            self.addheader("Content-Type", ctype, prefix=prefix,
                           add_to_http_hdrs=add_to_http_hdrs)
        self.flushheaders()
        # the blank line ending the header block belongs in the body only
        # when the header itself was written to the body
        if not add_to_http_hdrs:
            self._fp.write("\r\n")
        self._first_part = True
        return self._fp

    def startmultipartbody(self, subtype, boundary=None, plist=None, prefix=1,
                           add_to_http_hdrs=0, content_type=1):
        """Start a multipart/<subtype> body and return the body file.

        A boundary is generated with choose_boundary() unless one is given.
        """
        boundary = boundary or choose_boundary()
        self._boundary.append(boundary)
        params = [("boundary", boundary)] + list(plist or ())
        return self.startbody("multipart/" + subtype,
                              params,
                              prefix=prefix,
                              add_to_http_hdrs=add_to_http_hdrs,
                              content_type=content_type)

    def nextpart(self):
        """Write the next part delimiter and return a writer for the part."""
        boundary = self._boundary[-1]
        if self._first_part:
            self._first_part = False
        else:
            self._fp.write("\r\n")
        self._fp.write("--" + boundary + "\r\n")
        return self.__class__(self._fp)

    def lastpart(self):
        """Write the closing delimiter of the current multipart body."""
        if self._first_part:
            # no part was ever started: emit one opening delimiter first
            self.nextpart()
        boundary = self._boundary.pop()
        self._fp.write("\r\n--" + boundary + "--\r\n")
class LocateError(ValueError):
    """Base class for the lookup errors defined below."""


class AmbiguityError(LocateError):
    """A lookup matched more than one candidate."""


class ControlNotFoundError(LocateError):
    """LocateError subclass; by its name, no matching control was found."""


class ItemNotFoundError(LocateError):
    """LocateError subclass; by its name, no matching item was found."""


class ItemCountError(ValueError):
    """ValueError subclass (not a LocateError) concerning item counts."""
- # for backwards compatibility, ParseError derives from exceptions that were
- # raised by versions of ClientForm <= 0.2.5
# ParseError inherits from whichever parse-error classes are available.
# SGMLLIB_PARSEERROR names the exception the sgmllib-based parsers catch.
if HAVE_MODULE_HTMLPARSER:
    SGMLLIB_PARSEERROR = sgmllib.SGMLParseError

    class ParseError(sgmllib.SGMLParseError, HTMLParser.HTMLParseError):
        """Raised for any HTML parse failure."""
        pass
elif hasattr(sgmllib, "SGMLParseError"):
    SGMLLIB_PARSEERROR = sgmllib.SGMLParseError

    class ParseError(sgmllib.SGMLParseError):
        """Raised for any HTML parse failure."""
        pass
else:
    SGMLLIB_PARSEERROR = RuntimeError

    class ParseError(RuntimeError):
        """Raised for any HTML parse failure."""
        pass
class _AbstractFormParser:
    """Collect form, control and label data from parser callbacks.

    This is a mix-in: a concrete parser class supplies the tag dispatch and
    the unescape_attr_if_required method used throughout.

    On completion:

    forms: list of ((name, action, method, enctype), attrs, controls)
      tuples, where controls is a list of (type, name, attrs) tuples.
      forms[0] holds the controls found outside any FORM element.
    labels: list of attribute dicts of the LABEL elements that carry a
      non-empty "for" attribute.
    base: value of the last BASE href seen, or None.
    """
    # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        if entitydefs is None:
            entitydefs = get_entitydefs()
        self._entitydefs = entitydefs
        self._encoding = encoding

        self.base = None
        self.forms = []
        self.labels = []
        self._current_label = None
        self._current_form = None
        self._select = None
        self._optgroup = None
        self._option = None
        self._textarea = None

        # forms[0] will contain all controls that are outside of any form
        # self._global_form is an alias for self.forms[0]
        self._global_form = None
        self.start_form([])
        self.end_form()
        self._current_form = self._global_form = self.forms[0]

    def _unescaped_attr_dict(self, attrs, defaults=None):
        """Return a dict of the (key, value) pairs in attrs, values unescaped.

        defaults, if given, seeds the dict before attrs are applied.
        """
        d = {}
        if defaults is not None:
            d.update(defaults)
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        return d

    def do_base(self, attrs):
        """Record the href of a BASE element."""
        debug("%s", attrs)
        for key, value in attrs:
            if key == "href":
                self.base = self.unescape_attr_if_required(value)

    def end_body(self):
        """Close a label and a form left open at the end of the document."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is not self._global_form:
            self.end_form()

    def start_form(self, attrs):
        """Begin a new form; raises ParseError if one is already open."""
        debug("%s", attrs)
        if self._current_form is not self._global_form:
            raise ParseError("nested FORMs")
        name = None
        action = None
        enctype = "application/x-www-form-urlencoded"
        method = "GET"
        d = {}
        for key, value in attrs:
            if key == "name":
                name = self.unescape_attr_if_required(value)
            elif key == "action":
                action = self.unescape_attr_if_required(value)
            elif key == "method":
                method = self.unescape_attr_if_required(value.upper())
            elif key == "enctype":
                enctype = self.unescape_attr_if_required(value.lower())
            d[key] = self.unescape_attr_if_required(value)
        controls = []
        self._current_form = (name, action, method, enctype), d, controls

    def end_form(self):
        """Store the open form; raises ParseError if none is open."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is self._global_form:
            raise ParseError("end of FORM before start")
        self.forms.append(self._current_form)
        self._current_form = self._global_form

    def start_select(self, attrs):
        """Begin a SELECT; raises ParseError on invalid nesting."""
        debug("%s", attrs)
        if self._select is not None:
            raise ParseError("nested SELECTs")
        if self._textarea is not None:
            raise ParseError("SELECT inside TEXTAREA")
        d = self._unescaped_attr_dict(attrs)

        self._select = d
        self._add_label(d)

        self._append_select_control({"__select": d})

    def end_select(self):
        """End the open SELECT, closing a pending OPTION first."""
        debug("")
        if self._select is None:
            raise ParseError("end of SELECT before start")

        if self._option is not None:
            self._end_option()

        self._select = None

    def start_optgroup(self, attrs):
        """Begin an OPTGROUP; raises ParseError outside a SELECT."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTGROUP outside of SELECT")
        self._optgroup = self._unescaped_attr_dict(attrs)

    def end_optgroup(self):
        """End the open OPTGROUP; raises ParseError if none is open."""
        debug("")
        if self._optgroup is None:
            raise ParseError("end of OPTGROUP before start")
        self._optgroup = None

    def _start_option(self, attrs):
        """Begin an OPTION, implicitly ending a previous unclosed one."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTION outside of SELECT")
        if self._option is not None:
            self._end_option()

        self._option = self._unescaped_attr_dict(attrs)
        # an option inside a disabled OPTGROUP is itself disabled
        if (self._optgroup and "disabled" in self._optgroup and
            "disabled" not in self._option):
            self._option["disabled"] = None

    def _end_option(self):
        """Finish the open OPTION and append it to the current form."""
        debug("")
        if self._option is None:
            raise ParseError("end of OPTION before start")

        contents = self._option.get("contents", "").strip()
        self._option["contents"] = contents
        # value and label both default to the element's text
        if "value" not in self._option:
            self._option["value"] = contents
        if "label" not in self._option:
            self._option["label"] = contents
        # stuff dict of SELECT HTML attrs into a special private key
        #  (gets deleted again later)
        self._option["__select"] = self._select
        self._append_select_control(self._option)
        self._option = None

    def _append_select_control(self, attrs):
        """Append a ("select", name, attrs) entry to the current form."""
        debug("%s", attrs)
        controls = self._current_form[2]
        name = self._select.get("name")
        controls.append(("select", name, attrs))

    def start_textarea(self, attrs):
        """Begin a TEXTAREA; raises ParseError on invalid nesting."""
        debug("%s", attrs)
        if self._textarea is not None:
            raise ParseError("nested TEXTAREAs")
        if self._select is not None:
            raise ParseError("TEXTAREA inside SELECT")
        d = self._unescaped_attr_dict(attrs)
        self._add_label(d)

        self._textarea = d

    def end_textarea(self):
        """Append the open TEXTAREA to the current form."""
        debug("")
        if self._textarea is None:
            raise ParseError("end of TEXTAREA before start")
        controls = self._current_form[2]
        name = self._textarea.get("name")
        controls.append(("textarea", name, self._textarea))
        self._textarea = None

    def start_label(self, attrs):
        """Begin a LABEL, implicitly ending a previous unclosed one."""
        debug("%s", attrs)
        if self._current_label:
            self.end_label()
        d = self._unescaped_attr_dict(attrs)
        taken = bool(d.get("for"))  # empty id is invalid
        d["__text"] = ""
        d["__taken"] = taken
        if taken:
            self.labels.append(d)
        self._current_label = d

    def end_label(self):
        """End the open LABEL, if any."""
        debug("")
        label = self._current_label
        if label is None:
            # something is ugly in the HTML, but we're ignoring it
            return
        self._current_label = None
        # if it is staying around, it is True in all cases
        del label["__taken"]

    def _add_label(self, d):
        """Attach the open, not yet taken LABEL to control attrs d."""
        #debug("%s", d)
        if self._current_label is not None:
            if not self._current_label["__taken"]:
                self._current_label["__taken"] = True
                d["__label"] = self._current_label

    def handle_data(self, data):
        """Accumulate text for the open OPTION, TEXTAREA or LABEL."""
        debug("%s", data)

        if self._option is not None:
            # self._option is a dictionary of the OPTION element's HTML
            # attributes, but it has two special keys, one of which is the
            # special "contents" key contains text between OPTION tags (the
            # other is the "__select" key: see the end_option method)
            target = self._option
            key = "contents"
        elif self._textarea is not None:
            target = self._textarea
            key = "value"
            data = normalize_line_endings(data)
        # not if within option or textarea
        elif self._current_label is not None:
            target = self._current_label
            key = "__text"
        else:
            return

        if data and key not in target:
            # according to
            # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break
            # immediately after start tags or immediately before end tags must
            # be ignored, but real browsers only ignore a line break after a
            # start tag, so we'll do that.
            if data[0:2] == "\r\n":
                data = data[2:]
            elif data[0:1] in ["\n", "\r"]:
                data = data[1:]
            target[key] = data
        else:
            # .get() covers empty data arriving before any text has been
            # stored, which used to raise KeyError.
            target[key] = target.get(key, "") + data

    def do_button(self, attrs):
        """Append a BUTTON element to the current form."""
        debug("%s", attrs)
        d = self._unescaped_attr_dict(attrs, {"type": "submit"})  # default
        controls = self._current_form[2]

        name = d.get("name")
        # we don't want to lose information, so use a type string that
        # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
        # e.g. type for BUTTON/RESET is "resetbutton"
        #     (type for INPUT/RESET is "reset")
        control_type = d["type"] + "button"
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_input(self, attrs):
        """Append an INPUT element to the current form."""
        debug("%s", attrs)
        d = self._unescaped_attr_dict(attrs, {"type": "text"})  # default
        controls = self._current_form[2]

        control_type = d["type"]
        name = d.get("name")
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_isindex(self, attrs):
        """Append an ISINDEX element to the current form."""
        debug("%s", attrs)
        d = self._unescaped_attr_dict(attrs)
        controls = self._current_form[2]

        self._add_label(d)
        # isindex doesn't have type or name HTML attributes
        controls.append(("isindex", None, d))

    def handle_entityref(self, name):
        """Feed the replacement for entity reference name to handle_data."""
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        """Feed the decoded character reference to handle_data."""
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        """Return name with entity and character references replaced."""
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        """Return a copy of dict attrs with every value unescaped.

        Values that are themselves mappings are processed recursively.
        """
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            try:
                val.items
            except AttributeError:
                escaped_attrs[key] = self.unescape_attr(val)
            else:
                # e.g. "__select" -- yuck!
                escaped_attrs[key] = self.unescape_attrs(val)
        return escaped_attrs

    def unknown_entityref(self, ref):
        """Pass an unrecognised entity reference through as literal text."""
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        """Pass an unrecognised character reference through as literal text."""
        self.handle_data("&#%s;" % ref)
if HAVE_MODULE_HTMLPARSER:
    class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
        """Good for XHTML, bad for tolerance of incorrect HTML."""
        # thanks to Michael Howitz for this!
        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            HTMLParser.HTMLParser.__init__(self)
            _AbstractFormParser.__init__(self, entitydefs, encoding)

        def feed(self, data):
            """Feed data to HTMLParser, re-raising its errors as ParseError."""
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError:
                raise ParseError(sys.exc_info()[1])

        def start_option(self, attrs):
            _AbstractFormParser._start_option(self, attrs)

        def end_option(self):
            _AbstractFormParser._end_option(self)

        def handle_starttag(self, tag, attrs):
            """Dispatch to start_<tag>, else do_<tag>; ignore unknown tags."""
            for prefix in ("start_", "do_"):
                try:
                    method = getattr(self, prefix + tag)
                except AttributeError:
                    continue
                # called outside the try so that an AttributeError raised
                # by the handler itself is not swallowed
                method(attrs)
                return

        def handle_endtag(self, tag):
            """Dispatch to end_<tag>; ignore unknown tags."""
            try:
                method = getattr(self, "end_" + tag)
            except AttributeError:
                return  # unknown tag
            method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

        def unescape_attrs_if_required(self, attrs):
            return attrs  # ditto

        def close(self):
            HTMLParser.HTMLParser.close(self)
            self.end_body()
else:
    class XHTMLCompatibleFormParser:
        """Placeholder that cannot be instantiated without HTMLParser."""
        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            raise ValueError("HTMLParser could not be imported")
class _AbstractSgmllibParser(_AbstractFormParser):
    """Form-parser behaviour shared by the sgmllib-based parsers."""

    def do_option(self, attrs):
        _AbstractFormParser._start_option(self, attrs)

    if sys.version_info[:2] < (2, 5):
        # attribute values arrive still escaped, so unescape them here
        def unescape_attr_if_required(self, name):
            return self.unescape_attr(name)

        def unescape_attrs_if_required(self, attrs):
            return self.unescape_attrs(attrs)
    else:
        # we override this attr to decode hex charrefs
        entity_or_charref = re.compile(
            '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')

        def convert_entityref(self, name):
            reference = "&%s;" % name
            return unescape(reference, self._entitydefs, self._encoding)

        def convert_charref(self, name):
            return unescape_charref("%s" % name, self._encoding)

        def unescape_attr_if_required(self, name):
            return name  # sgmllib already did it

        def unescape_attrs_if_required(self, attrs):
            return attrs  # ditto
class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
    """Good for tolerance of incorrect HTML, bad for XHTML."""

    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        sgmllib.SGMLParser.__init__(self)
        _AbstractFormParser.__init__(self, entitydefs, encoding)

    def feed(self, data):
        """Feed data to SGMLParser, re-raising its errors as ParseError."""
        try:
            sgmllib.SGMLParser.feed(self, data)
        except SGMLLIB_PARSEERROR:
            raise ParseError(sys.exc_info()[1])

    def close(self):
        """Finish parsing, then close any form or label left open."""
        sgmllib.SGMLParser.close(self)
        self.end_body()
- # sigh, must support mechanize by allowing dynamic creation of classes based on
- # its bundled copy of BeautifulSoup (which was necessary because of dependency
- # problems)
def _create_bs_classes(bs,
                       icbinbs,
                       ):
    """Build and return (RobustFormParser, NestingRobustFormParser).

    bs and icbinbs are the two BeautifulSoup parser classes to mix in; each
    returned class drives its parsing through the class it was given.
    """
    class _AbstractBSFormParser(_AbstractSgmllibParser):
        # set by each concrete subclass below
        bs_base_class = None

        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            _AbstractFormParser.__init__(self, entitydefs, encoding)
            self.bs_base_class.__init__(self)

        def handle_data(self, data):
            _AbstractFormParser.handle_data(self, data)
            self.bs_base_class.handle_data(self, data)

        def feed(self, data):
            try:
                self.bs_base_class.feed(self, data)
            except SGMLLIB_PARSEERROR:
                raise ParseError(sys.exc_info()[1])

        def close(self):
            self.bs_base_class.close(self)
            self.end_body()

    class RobustFormParser(_AbstractBSFormParser, bs):
        """Tries to be highly tolerant of incorrect HTML."""
        bs_base_class = bs

    class NestingRobustFormParser(_AbstractBSFormParser, icbinbs):
        """Tries to be highly tolerant of incorrect HTML.

        Different from RobustFormParser in that it more often guesses nesting
        above missing end tags (see BeautifulSoup docs).
        """
        bs_base_class = icbinbs

    return RobustFormParser, NestingRobustFormParser
- try:
- if sys.version_info[:2] < (2, 2):
- raise ImportError # BeautifulSoup uses generators
- import BeautifulSoup
- except ImportError:
- pass
- else:
- RobustFormParser, NestingRobustFormParser = _create_bs_classes(
- BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup
- )
- __all__ += ['RobustFormParser', 'NestingRobustFormParser']
- #FormParser = XHTMLCompatibleFormParser # testing hack
- #FormParser = RobustFormParser # testing hack
def ParseResponseEx(response,
                    select_default=False,
                    form_parser_class=FormParser,
                    request_class=urllib2.Request,
                    entitydefs=None,
                    encoding=DEFAULT_ENCODING,

                    # private
                    _urljoin=urlparse.urljoin,
                    _urlparse=urlparse.urlparse,
                    _urlunparse=urlparse.urlunparse,
                    ):
    """Identical to ParseResponse, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    base_uri = response.geturl()
    return _ParseFileEx(
        response, base_uri,
        select_default=select_default,
        ignore_errors=False,
        form_parser_class=form_parser_class,
        request_class=request_class,
        entitydefs=entitydefs,
        backwards_compat=False,
        encoding=encoding,
        _urljoin=_urljoin,
        _urlparse=_urlparse,
        _urlunparse=_urlunparse,
        )
def ParseFileEx(file, base_uri,
                select_default=False,
                form_parser_class=FormParser,
                request_class=urllib2.Request,
                entitydefs=None,
                encoding=DEFAULT_ENCODING,

                # private
                _urljoin=urlparse.urljoin,
                _urlparse=urlparse.urlparse,
                _urlunparse=urlparse.urlunparse,
                ):
    """Identical to ParseFile, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    return _ParseFileEx(
        file, base_uri,
        select_default=select_default,
        ignore_errors=False,
        form_parser_class=form_parser_class,
        request_class=request_class,
        entitydefs=entitydefs,
        backwards_compat=False,
        encoding=encoding,
        _urljoin=_urljoin,
        _urlparse=_urlparse,
        _urlunparse=_urlunparse,
        )
def ParseResponse(response, *args, **kwds):
    """Parse HTTP response and return a list of HTMLForm instances.

    The return value of urllib2.urlopen can be conveniently passed to this
    function as the response parameter.

    ClientForm.ParseError is raised on parse errors.

    response: file-like object (supporting read() method) with a method
     geturl(), returning the URI of the HTTP response
    select_default: for multiple-selection SELECT controls and RADIO controls,
     pick the first item as the default if none are selected in the HTML
    form_parser_class: class to instantiate and use to parse
    request_class: class to return from .click() method (default is
     urllib2.Request)
    entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
     definitions (a sensible default is used)
    encoding: character encoding used for encoding numeric character references
     when matching link text.  ClientForm does not attempt to find the encoding
     in a META HTTP-EQUIV attribute in the document itself (mechanize, for
     example, does do that and will pass the correct value to ClientForm using
     this parameter).

    backwards_compat: boolean that determines whether the returned HTMLForm
     objects are backwards-compatible with old code.  If backwards_compat is
     true:

     - ClientForm 0.1 code will continue to work as before.

     - Label searches that do not specify a nr (number or count) will always
       get the first match, even if other controls match.  If
       backwards_compat is False, label searches that have ambiguous results
       will raise an AmbiguityError.

     - Item label matching is done by strict string comparison rather than
       substring matching.

     - De-selecting individual list items is allowed even if the Item is
       disabled.

    The backwards_compat argument will be deprecated in a future release.

    Pass a true value for select_default if you want the behaviour specified by
    RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
    RADIO or multiple-selection SELECT control if none were selected in the
    HTML.  Most browsers (including Microsoft Internet Explorer (IE) and
    Netscape Navigator) instead leave all items unselected in these cases.  The
    W3C HTML 4.0 standard leaves this behaviour undefined in the case of
    multiple-selection SELECT controls, but insists that at least one RADIO
    button should be checked at all times, in contradiction to browser
    behaviour.

    There is a choice of parsers.  ClientForm.XHTMLCompatibleFormParser (uses
    HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
    sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
    Note that HTMLParser is only available in Python 2.2 and later.  You can
    pass your own class in here as a hack to work around bad HTML, but at your
    own risk: there is no well-defined interface.
    """
    all_forms = _ParseFileEx(response, response.geturl(), *args, **kwds)
    # the first entry collects controls outside any FORM; it is not returned
    return all_forms[1:]
def ParseFile(file, base_uri, *args, **kwds):
    """Parse HTML and return a list of HTMLForm instances.

    ClientForm.ParseError is raised on parse errors.

    file: file-like object (supporting read() method) containing HTML with zero
     or more forms to be parsed
    base_uri: the URI of the document (note that the base URI used to submit
     the form will be that given in the BASE element if present, not that of
     the document)

    For the other arguments and further details, see ParseResponse.__doc__.
    """
    all_forms = _ParseFileEx(file, base_uri, *args, **kwds)
    # the first entry collects controls outside any FORM; it is not returned
    return all_forms[1:]
- def _ParseFileEx(file, base_uri,
- select_default=False,
- ignore_errors=False,
- form_parser_class=FormParser,
- request_class=urllib2.Request,
- entitydefs=None,
- backwards_compat=True,
- encoding=DEFAULT_ENCODING,
- _urljoin=urlparse.urljoin,
- _urlparse=urlparse.urlparse,
- _urlunparse=urlparse.urlunparse,
- ):
- if backwards_compat:
- deprecation("operating in backwards-compatibility mode", 1)
- fp = form_parser_class(entitydefs, encoding)
- while 1:
- data = file.read(CHUNK)
- try:
- fp.feed(data)
- except ParseError, e:
- e.base_uri = base_uri
- raise
- if len(data) != CHUNK: break
- fp.close()
- if fp.base is not None:
- # HTML BASE element takes precedence over document URI
- base_uri = fp.base
- labels = [] # Label(label) for label in fp.labels]
- id_to_labels = {}
- for l in fp.labels:
- label = Label(l)
- labels.append(label)
- for_id = l["for"]
- coll = id_to_labels.get(for_id)
- if coll is None:
- id_to_labels[for_id] = [label]
- else:
- coll.append(label)
- forms = []
- for (name, action, method, enctype), attrs, controls in fp.forms:
- if action is None:
- action = base_uri
- else:
- action = _urljoin(base_uri, action)
- # would be nice to make HTMLForm class (form builder) pluggable
- form = HTMLForm(
- action, method, enctype, name, attrs, request_class,
- forms, labels, id_to_labels, backwards_compat)
- form._urlparse = _urlparse
- form._urlunparse = _urlunparse
- for ii in range(len(controls)):
- type, name, attrs = controls[ii]
- # index=ii*10 allows ImageControl to return multiple ordered pairs
- form.new_control(
- type, name, attrs, select_default=select_default, index=ii*10)
- forms.append(form)
- for form in forms:
- form.fixup()
- return forms
class Label:
    """Label text associated with a form control.

    attrs is the mapping of the label's attributes; its "for" entry (if
    any) becomes the id attribute and its "__text" entry supplies the text.

    The text attribute is read-only.  It yields the stripped label text when
    _backwards_compat is true, and the compress_text() form of that text
    otherwise.

    """
    def __init__(self, attrs):
        self.attrs = attrs
        self.id = attrs.get("for")
        stripped = attrs.get("__text").strip()
        self._text = stripped
        self._ctext = compress_text(stripped)
        self._backwards_compat = False  # maintained by HTMLForm

    def __getattr__(self, name):
        # Only reached for names missing from the instance dictionary.
        if name != "text":
            return getattr(Label, name)
        if self._backwards_compat:
            return self._text
        return self._ctext

    def __setattr__(self, name, value):
        if name == "text":
            # don't see any need for this, so make it read-only
            raise AttributeError("text attribute is read-only")
        self.__dict__[name] = value

    def __str__(self):
        details = (self.id, self.text)
        return "<Label(id=%r, text=%r)>" % details
- def _get_label(attrs):
- text = attrs.get("__label")
- if text is not None:
- return Label(text)
- else:
- return None
class Control:
    """Base class for the controls that make up an HTML form.

    An HTMLForm holds a sequence of Controls.  Reach them through the
    HTMLForm.find_control method or the HTMLForm.controls attribute.

    Controls are normally created for you by the ParseFile / ParseResponse
    functions, in which case the rest of this paragraph does not matter.
    Otherwise, note that a Control is only properly initialised once its
    fixup method has been called.  Strictly speaking that only matters for
    ListControl instances: those are assembled from ListControls that each
    hold a single item, so their initial value(s) are not known until the
    whole sequence has been seen.

    Which types and values may be assigned to the value attribute is
    defined by each subclass.

    A true disabled attribute models the state browsers usually show by
    'greying out' a control.  While disabled, the Control raises
    AttributeError on any attempt to change its value, and it is not
    'successful' as defined by the W3C HTML 4 standard -- ie. it adds no
    data to the return value of the HTMLForm.click* methods.  Set the
    disabled attribute to a false value to enable the control.

    A true readonly attribute likewise makes the Control raise
    AttributeError when an attempt is made to change its value.  Set the
    readonly attribute to a false value to make the control writable.

    Every control has the disabled and readonly attributes, including
    those whose HTML element cannot carry attributes of those names.

    Assigning to the value attribute can raise TypeError, AttributeError
    (when the value should not be assigned to, for example because the
    control is disabled) and ValueError.

    A control is not successful if its name or value is None, if its value
    is an empty list, or if it is disabled.

    Public attributes:

    type: string describing type of control (the allowable values are the
     keys of the HTMLForm.type2class dictionary) (readonly)
    name: name of control (readonly)
    value: current value of control (depending on the subclass: a single
     value, a sequence of values, or either)
    disabled: disabled state
    readonly: readonly state
    id: value of id HTML attribute

    """
    def __init__(self, type, name, attrs, index=None):
        """
        type: string describing type of control (the allowable values are
         the keys of the HTMLForm.type2class dictionary)
        name: control name
        attrs: HTML attributes of control's HTML element

        """
        raise NotImplementedError()

    def add_to_form(self, form):
        """Remember form as this control's form and append self to it."""
        self._form = form
        form.controls.append(self)

    def fixup(self):
        """Hook called once all controls are known; does nothing here."""
        pass

    def is_of_kind(self, kind):
        raise NotImplementedError()

    def clear(self):
        raise NotImplementedError()

    def __getattr__(self, name):
        raise NotImplementedError()

    def __setattr__(self, name, value):
        raise NotImplementedError()

    def pairs(self):
        """Return list of (key, value) pairs suitable for passing to urlencode.
        """
        result = []
        for index, key, value in self._totally_ordered_pairs():
            result.append((key, value))
        return result

    def _totally_ordered_pairs(self):
        """Return list of (index, key, value) tuples.

        Like pairs, but the leading index allows correct ordering to be
        preserved even where several controls are involved.

        """
        raise NotImplementedError()

    def _write_mime_data(self, mw, name, value):
        """Write data for a subitem of this control to a MimeWriter."""
        # called by HTMLForm
        part = mw.nextpart()
        disposition = 'form-data; name="%s"' % name
        part.addheader("Content-Disposition", disposition, 1)
        body = part.startbody(prefix=0)
        body.write(value)

    def __str__(self):
        raise NotImplementedError()

    def get_labels(self):
        """Return all labels (Label instances) for this control.

        A label held directly by the control (from a surrounding <label>
        tag) comes first.  It is followed by the labels the form has
        registered under this control's id, in the order the form stores
        them.

        """
        found = []
        if self._label:
            found.append(self._label)
        if self.id:
            by_id = self._form._id_to_labels
            found.extend(by_id.get(self.id, ()))
        return found
- #---------------------------------------------------
class ScalarControl(Control):
    """Control whose value is not restricted to one of a prescribed set.

    Some ScalarControls accept no value attribute at all.  The others take
    a single value, which must be string-like.

    Additional read-only public attribute:

    attrs: dictionary mapping the names of original HTML attributes of the
     control to their values

    """
    def __init__(self, type, name, attrs, index=None):
        self._index = index
        self._label = _get_label(attrs)
        # type and name are rejected by __setattr__, so they are written
        # straight into the instance dictionary.
        self.__dict__["name"] = name
        self.__dict__["type"] = type.lower()
        self.attrs = attrs.copy()
        self.id = attrs.get("id")
        self._value = attrs.get("value")
        # Presence of the HTML attribute is what counts, not its value.
        self.readonly = attrs.has_key("readonly")
        self.disabled = attrs.has_key("disabled")
        self._clicked = False
        self._urlparse = urlparse.urlparse
        self._urlunparse = urlparse.urlunparse

    def __getattr__(self, name):
        # Only reached for names missing from the instance dictionary;
        # "value" is the one computed attribute.
        if name != "value":
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))
        return self.__dict__["_value"]

    def __setattr__(self, name, value):
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        if name != "value":
            self.__dict__[name] = value
            return
        # Assigning the value: validate first, then store.
        if not isstringlike(value):
            raise TypeError("must assign a string")
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        if self.disabled:
            raise AttributeError("control '%s' is disabled" % self.name)
        self.__dict__["_value"] = value

    def _totally_ordered_pairs(self):
        """Return [(index, name, value)], or [] if there is nothing to send.

        Nothing is returned when the name or value is None or when the
        control is disabled.

        """
        key = self.name
        val = self.value
        successful = not (key is None or val is None or self.disabled)
        if successful:
            return [(self._index, key, val)]
        return []

    def clear(self):
        """Reset the value to None; raises AttributeError if readonly."""
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self.__dict__["_value"] = None

    def __str__(self):
        shown_name = self.name
        if shown_name is None:
            shown_name = "<None>"
        shown_value = self.value
        if shown_value is None:
            shown_value = "<None>"

        flags = []
        for flag in ("disabled", "readonly"):
            if getattr(self, flag):
                flags.append(flag)
        suffix = ", ".join(flags)
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s=%s)%s>" % (
            self.__class__.__name__, shown_name, shown_value, suffix)
- #---------------------------------------------------
class TextControl(ScalarControl):
    """Textual input control.

    Covers:

    INPUT/TEXT
    INPUT/PASSWORD
    INPUT/HIDDEN
    TEXTAREA

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # A text control without a value attribute starts out empty.
        if self._value is None:
            self._value = ""
        # Hidden fields are always made read-only.
        if self.type == "hidden":
            self.readonly = True

    def is_of_kind(self, kind):
        return kind == "text"
- #---------------------------------------------------
- class FileControl(ScalarControl):
- """File upload with INPUT TYPE=FILE.
- The value attribute of a FileControl is always None. Use add_file instead.
- Additional public method: add_file
- """
- def __init__(self, type, name, attrs, index=None):
- ScalarControl.__init__(self, type, name, attrs, index)
- self._value = None
- self._upload_data = []
- def is_of_kind(self, kind): return kind == "file"
- def clear(self):
- if self.readonly:
- raise AttributeError("control '%s' is readonly" % self.name)
- self._upload_data = []
- def __setattr__(self, name, value):
- if name in ("value", "name", "type"):
- raise AttributeError("%s attribute is readonly" % name)
- else:
- self.__dict__[name] = value
- def add_file(self, file_object, content_type=None, filename=None):
- if not hasattr(file_object, "read"):
- raise TypeError("file-like object must have read method")
- if content_type is not None and not isstringlike(content_type):
- raise TypeError("content type must be None or string-like")
- if filename is not None and not isstringlike(filename):
- raise TypeError("filename must be None or string-like")
- if content_type is None:
- content_type …
Large files files are truncated, but you can click here to view the full file