PageRenderTime 199ms CodeModel.GetById 41ms app.highlight 125ms RepoModel.GetById 15ms app.codeStats 1ms

/Lib/urllib2.py

http://unladen-swallow.googlecode.com/
Python | 1384 lines | 1275 code | 17 blank | 92 comment | 9 complexity | 5240841504a50c106cc228fe2b8e4e59 MD5 | raw file
   1"""An extensible library for opening URLs using a variety of protocols
   2
   3The simplest way to use this module is to call the urlopen function,
   4which accepts a string containing a URL or a Request object (described
   5below).  It opens the URL and returns the results as file-like
   6object; the returned object has some extra methods described below.
   7
   8The OpenerDirector manages a collection of Handler objects that do
   9all the actual work.  Each Handler implements a particular protocol or
  10option.  The OpenerDirector is a composite object that invokes the
  11Handlers needed to open the requested URL.  For example, the
  12HTTPHandler performs HTTP GET and POST requests and deals with
  13non-error returns.  The HTTPRedirectHandler automatically deals with
  14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
  15deals with digest authentication.
  16
  17urlopen(url, data=None) -- Basic usage is the same as original
  18urllib.  pass the url and optionally data to post to an HTTP URL, and
  19get a file-like object back.  One difference is that you can also pass
  20a Request instance instead of URL.  Raises a URLError (subclass of
  21IOError); for HTTP errors, raises an HTTPError, which can also be
  22treated as a valid response.
  23
  24build_opener -- Function that creates a new OpenerDirector instance.
  25Will install the default handlers.  Accepts one or more Handlers as
  26arguments, either instances or Handler classes that it will
  27instantiate.  If one of the arguments is a subclass of a default
  28handler, it will be used in place of that default.
  29
  30install_opener -- Installs a new opener as the default opener.
  31
  32objects of interest:
  33OpenerDirector --
  34
  35Request -- An object that encapsulates the state of a request.  The
  36state can be as simple as the URL.  It can also include extra HTTP
  37headers, e.g. a User-Agent.
  38
  39BaseHandler --
  40
  41exceptions:
  42URLError -- A subclass of IOError, individual protocols have their own
  43specific subclass.
  44
  45HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
  46as an exceptional event or valid response.
  47
  48internals:
  49BaseHandler and parent
  50_call_chain conventions
  51
  52Example usage:
  53
  54import urllib2
  55
  56# set up authentication info
  57authinfo = urllib2.HTTPBasicAuthHandler()
  58authinfo.add_password(realm='PDQ Application',
  59                      uri='https://mahler:8092/site-updates.py',
  60                      user='klem',
  61                      passwd='geheim$parole')
  62
  63proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
  64
  65# build a new opener that adds authentication and caching FTP handlers
  66opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
  67
  68# install it
  69urllib2.install_opener(opener)
  70
  71f = urllib2.urlopen('http://www.python.org/')
  72
  73
  74"""
  75
  76# XXX issues:
  77# If an authentication error handler that tries to perform
  78# authentication for some reason but fails, how should the error be
  79# signalled?  The client needs to know the HTTP error code.  But if
   80# the handler knows what the problem was, e.g., that it didn't know
   81# the hash algorithm requested in the challenge, it would be good to
   82# pass that information along to the client, too.
  83# ftp errors aren't handled cleanly
  84# check digest against correct (i.e. non-apache) implementation
  85
  86# Possible extensions:
  87# complex proxies  XXX not sure what exactly was meant by this
  88# abstract factory for opener
  89
  90import base64
  91import hashlib
  92import httplib
  93import mimetools
  94import os
  95import posixpath
  96import random
  97import re
  98import socket
  99import sys
 100import time
 101import urlparse
 102import bisect
 103
 104try:
 105    from cStringIO import StringIO
 106except ImportError:
 107    from StringIO import StringIO
 108
 109from urllib import (unwrap, unquote, splittype, splithost, quote,
 110     addinfourl, splitport,
 111     splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
 112
 113# support for FileHandler, proxies via environment variables
 114from urllib import localhost, url2pathname, getproxies, proxy_bypass
 115
# used in User-Agent header sent
__version__ = sys.version[:3]  # e.g. "2.6" -- first three chars of sys.version
 118
 119_opener = None
 120def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
 121    global _opener
 122    if _opener is None:
 123        _opener = build_opener()
 124    return _opener.open(url, data, timeout)
 125
 126def install_opener(opener):
 127    global _opener
 128    _opener = opener
 129
 130# do these error classes make sense?
 131# make sure all of the IOError stuff is overridden.  we just want to be
 132# subtypes.
 133
 134class URLError(IOError):
 135    # URLError is a sub-type of IOError, but it doesn't share any of
 136    # the implementation.  need to override __init__ and __str__.
 137    # It sets self.args for compatibility with other EnvironmentError
 138    # subclasses, but args doesn't have the typical format with errno in
 139    # slot 0 and strerror in slot 1.  This may be better than nothing.
 140    def __init__(self, reason):
 141        self.args = reason,
 142        self.reason = reason
 143
 144    def __str__(self):
 145        return '<urlopen error %s>' % self.reason
 146
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    # bound reference to the base-class initializer; name-mangled so a
    # subclass cannot accidentally shadow it
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code      # numeric HTTP status, e.g. 404
        self.msg = msg        # reason phrase, e.g. "Not Found"
        self.hdrs = hdrs      # response headers object
        self.fp = fp          # response body stream; may be None
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url, code)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)
 166
 167# copied from cookielib.py
 168_cut_port_re = re.compile(r":\d+$")
 169def request_host(request):
 170    """Return request-host, as defined by RFC 2965.
 171
 172    Variation from RFC: returned value is lowercased, for convenient
 173    comparison.
 174
 175    """
 176    url = request.get_full_url()
 177    host = urlparse.urlparse(url)[1]
 178    if host == "":
 179        host = request.get_header("Host", "")
 180
 181    # remove port, if present
 182    host = _cut_port_re.sub("", host, 1)
 183    return host.lower()
 184
 185class Request:
 186
 187    def __init__(self, url, data=None, headers={},
 188                 origin_req_host=None, unverifiable=False):
 189        # unwrap('<URL:type://host/path>') --> 'type://host/path'
 190        self.__original = unwrap(url)
 191        self.type = None
 192        # self.__r_type is what's left after doing the splittype
 193        self.host = None
 194        self.port = None
 195        self._tunnel_host = None
 196        self.data = data
 197        self.headers = {}
 198        for key, value in headers.items():
 199            self.add_header(key, value)
 200        self.unredirected_hdrs = {}
 201        if origin_req_host is None:
 202            origin_req_host = request_host(self)
 203        self.origin_req_host = origin_req_host
 204        self.unverifiable = unverifiable
 205
 206    def __getattr__(self, attr):
 207        # XXX this is a fallback mechanism to guard against these
 208        # methods getting called in a non-standard order.  this may be
 209        # too complicated and/or unnecessary.
 210        # XXX should the __r_XXX attributes be public?
 211        if attr[:12] == '_Request__r_':
 212            name = attr[12:]
 213            if hasattr(Request, 'get_' + name):
 214                getattr(self, 'get_' + name)()
 215                return getattr(self, attr)
 216        raise AttributeError, attr
 217
 218    def get_method(self):
 219        if self.has_data():
 220            return "POST"
 221        else:
 222            return "GET"
 223
 224    # XXX these helper methods are lame
 225
 226    def add_data(self, data):
 227        self.data = data
 228
 229    def has_data(self):
 230        return self.data is not None
 231
 232    def get_data(self):
 233        return self.data
 234
 235    def get_full_url(self):
 236        return self.__original
 237
 238    def get_type(self):
 239        if self.type is None:
 240            self.type, self.__r_type = splittype(self.__original)
 241            if self.type is None:
 242                raise ValueError, "unknown url type: %s" % self.__original
 243        return self.type
 244
 245    def get_host(self):
 246        if self.host is None:
 247            self.host, self.__r_host = splithost(self.__r_type)
 248            if self.host:
 249                self.host = unquote(self.host)
 250        return self.host
 251
 252    def get_selector(self):
 253        return self.__r_host
 254
 255    def set_proxy(self, host, type):
 256        if self.type == 'https' and not self._tunnel_host:
 257            self._tunnel_host = self.host
 258        else:
 259            self.type = type
 260            self.__r_host = self.__original
 261
 262        self.host = host
 263
 264    def has_proxy(self):
 265        return self.__r_host == self.__original
 266
 267    def get_origin_req_host(self):
 268        return self.origin_req_host
 269
 270    def is_unverifiable(self):
 271        return self.unverifiable
 272
 273    def add_header(self, key, val):
 274        # useful for something like authentication
 275        self.headers[key.capitalize()] = val
 276
 277    def add_unredirected_header(self, key, val):
 278        # will not be added to a redirected request
 279        self.unredirected_hdrs[key.capitalize()] = val
 280
 281    def has_header(self, header_name):
 282        return (header_name in self.headers or
 283                header_name in self.unredirected_hdrs)
 284
 285    def get_header(self, header_name, default=None):
 286        return self.headers.get(
 287            header_name,
 288            self.unredirected_hdrs.get(header_name, default))
 289
 290    def header_items(self):
 291        hdrs = self.unredirected_hdrs.copy()
 292        hdrs.update(self.headers)
 293        return hdrs.items()
 294
class OpenerDirector:
    """Manage a chain of handlers and route URL opens through them.

    Handlers register via add_handler(); their specially named methods
    (<proto>_open, <proto>_request, <proto>_response,
    <proto>_error_<code>) are discovered by name and recorded in the
    lookup tables built in __init__.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        # default headers added to every request
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []          # every handler, kept sorted (see __lt__)
        self.handle_open = {}       # scheme -> [handlers with <scheme>_open]
        self.handle_error = {}      # proto -> {code -> [error handlers]}
        self.process_response = {}  # scheme -> [response processors]
        self.process_request = {}   # scheme -> [request processors]

    def add_handler(self, handler):
        """Register *handler*, indexing its protocol methods by name."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # split "<protocol>_<condition>" on the first underscore
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # e.g. "http_error_404" -> protocol "http", kind 404;
                # a non-numeric suffix is kept as a string
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # keep each per-kind handler list sorted by handler_order
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open a URL string or a Request; return the response object."""
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # try default_open first, then the scheme-specific opener, and
        # finally unknown_open as a catch-all
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  args is (request, response, code, ...)
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # nothing handled the specific code; fall back to the
            # default HTTP error handler (which normally raises)
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
 434
 435# XXX probably also want an abstract factory that knows when it makes
 436# sense to skip a superclass in favor of a subclass and when it might
 437# make sense to include both
 438
 439def build_opener(*handlers):
 440    """Create an opener object from a list of handlers.
 441
 442    The opener will use several default handlers, including support
 443    for HTTP and FTP.
 444
 445    If any of the handlers passed as arguments are subclasses of the
 446    default handlers, the default handlers will not be used.
 447    """
 448    import types
 449    def isclass(obj):
 450        return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")
 451
 452    opener = OpenerDirector()
 453    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
 454                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
 455                       FTPHandler, FileHandler, HTTPErrorProcessor]
 456    if hasattr(httplib, 'HTTPS'):
 457        default_classes.append(HTTPSHandler)
 458    skip = set()
 459    for klass in default_classes:
 460        for check in handlers:
 461            if isclass(check):
 462                if issubclass(check, klass):
 463                    skip.add(klass)
 464            elif isinstance(check, klass):
 465                skip.add(klass)
 466    for klass in skip:
 467        default_classes.remove(klass)
 468
 469    for klass in default_classes:
 470        opener.add_handler(klass())
 471
 472    for h in handlers:
 473        if isclass(h):
 474            h = h()
 475        opener.add_handler(h)
 476    return opener
 477
 478class BaseHandler:
 479    handler_order = 500
 480
 481    def add_parent(self, parent):
 482        self.parent = parent
 483
 484    def close(self):
 485        # Only exists for backwards compatibility
 486        pass
 487
 488    def __lt__(self, other):
 489        if not hasattr(other, "handler_order"):
 490            # Try to preserve the old behavior of having custom classes
 491            # inserted after default ones (works only for custom user
 492            # classes which are not aware of handler_order).
 493            return True
 494        return self.handler_order < other.handler_order
 495
 496
 497class HTTPErrorProcessor(BaseHandler):
 498    """Process HTTP error responses."""
 499    handler_order = 1000  # after all other processing
 500
 501    def http_response(self, request, response):
 502        code, msg, hdrs = response.code, response.msg, response.info()
 503
 504        # According to RFC 2616, "2xx" code indicates that the client's
 505        # request was successfully received, understood, and accepted.
 506        if not (200 <= code < 300):
 507            response = self.parent.error(
 508                'http', request, response, code, msg, hdrs)
 509
 510        return response
 511
 512    https_response = http_response
 513
 514class HTTPDefaultErrorHandler(BaseHandler):
 515    def http_error_default(self, req, fp, code, msg, hdrs):
 516        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
 517
class HTTPRedirectHandler(BaseHandler):
    """Follow 301/302/303/307 redirects, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            # drop body-describing headers: the new Request carries no
            # data, so it will be issued as a GET
            newheaders = dict((k,v) for k,v in req.headers.items()
                              if k.lower() not in ("content-length", "content-type")
                             )
            return Request(newurl,
                           headers=newheaders,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect response by re-opening the target URL."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            # no redirect target given; let another handler (if any) try
            return

        # fix a possible malformed URL
        urlparts = urlparse.urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlparse.urlunparse(urlparts)

        # the Location value may be relative; resolve against the request
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    # message prefixed to the original 30x message when a loop is detected
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
 610
 611
 612def _parse_proxy(proxy):
 613    """Return (scheme, user, password, host/port) given a URL or an authority.
 614
 615    If a URL is supplied, it must have an authority (host:port) component.
 616    According to RFC 3986, having an authority component means the URL must
 617    have two slashes after the scheme:
 618
 619    >>> _parse_proxy('file:/ftp.example.com/')
 620    Traceback (most recent call last):
 621    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
 622
 623    The first three items of the returned tuple may be None.
 624
 625    Examples of authority parsing:
 626
 627    >>> _parse_proxy('proxy.example.com')
 628    (None, None, None, 'proxy.example.com')
 629    >>> _parse_proxy('proxy.example.com:3128')
 630    (None, None, None, 'proxy.example.com:3128')
 631
 632    The authority component may optionally include userinfo (assumed to be
 633    username:password):
 634
 635    >>> _parse_proxy('joe:password@proxy.example.com')
 636    (None, 'joe', 'password', 'proxy.example.com')
 637    >>> _parse_proxy('joe:password@proxy.example.com:3128')
 638    (None, 'joe', 'password', 'proxy.example.com:3128')
 639
 640    Same examples, but with URLs instead:
 641
 642    >>> _parse_proxy('http://proxy.example.com/')
 643    ('http', None, None, 'proxy.example.com')
 644    >>> _parse_proxy('http://proxy.example.com:3128/')
 645    ('http', None, None, 'proxy.example.com:3128')
 646    >>> _parse_proxy('http://joe:password@proxy.example.com/')
 647    ('http', 'joe', 'password', 'proxy.example.com')
 648    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
 649    ('http', 'joe', 'password', 'proxy.example.com:3128')
 650
 651    Everything after the authority is ignored:
 652
 653    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
 654    ('ftp', 'joe', 'password', 'proxy.example.com')
 655
 656    Test for no trailing '/' case:
 657
 658    >>> _parse_proxy('http://joe:password@proxy.example.com')
 659    ('http', 'joe', 'password', 'proxy.example.com')
 660
 661    """
 662    scheme, r_scheme = splittype(proxy)
 663    if not r_scheme.startswith("/"):
 664        # authority
 665        scheme = None
 666        authority = proxy
 667    else:
 668        # URL
 669        if not r_scheme.startswith("//"):
 670            raise ValueError("proxy URL with no authority: %r" % proxy)
 671        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
 672        # and 3.3.), path is empty or starts with '/'
 673        end = r_scheme.find("/", 2)
 674        if end == -1:
 675            end = None
 676        authority = r_scheme[2:end]
 677    userinfo, hostport = splituser(authority)
 678    if userinfo is not None:
 679        user, password = splitpasswd(userinfo)
 680    else:
 681        user = password = None
 682    return scheme, user, password, hostport
 683
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies in a {scheme: proxy-url} map."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()  # read from the environment by default
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # Install a <scheme>_open method for every configured scheme.
        # The lambda's default arguments freeze the per-iteration
        # bindings of url/type (avoiding the late-binding closure trap).
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point *req* at *proxy*; may re-open under the proxy's scheme."""
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)

        # a bare authority ("host:port") inherits the request's scheme
        if proxy_type is None:
            proxy_type = orig_type

        # honor the platform's proxy-bypass configuration for this host
        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            # send proxy credentials pre-emptively (Basic scheme)
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
 726
class HTTPPasswordMgr:
    """Store user/password pairs keyed by (realm, reduced URI prefix)."""

    def __init__(self):
        # maps realm -> {tuple-of-reduced-uris -> (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        # store under both the explicit-default-port and as-given
        # spellings so lookups match either way of writing the port
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.iteritems():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        # make the scheme's default port explicit (http -> 80, https -> 443)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        # authorities must match exactly
        if base[0] != test[0]:
            return False
        # base's path must be a prefix of test's path
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
 789
 790
 791class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
 792
 793    def find_user_password(self, realm, authuri):
 794        user, password = HTTPPasswordMgr.find_user_password(self, realm,
 795                                                            authuri)
 796        if user is not None:
 797            return user, password
 798        return HTTPPasswordMgr.find_user_password(self, None, authuri)
 799
 800
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP and proxy Basic authentication.

    Subclasses must define ``auth_header``, the request header that
    carries the credentials.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\'])(.*?)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # expose the manager's add_password directly on the handler
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge header; retry with credentials if Basic."""
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, quote, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with a Basic credentials header, if known."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = 'Basic %s' % base64.b64encode(raw).strip()
            # if exactly these credentials were already sent and we were
            # challenged again, give up rather than loop forever
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None
 844
 845
 846class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 847
 848    auth_header = 'Authorization'
 849
 850    def http_error_401(self, req, fp, code, msg, headers):
 851        url = req.get_full_url()
 852        return self.http_error_auth_reqed('www-authenticate',
 853                                          url, req, headers)
 854
 855
 856class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 857
 858    auth_header = 'Proxy-authorization'
 859
 860    def http_error_407(self, req, fp, code, msg, headers):
 861        # http_error_auth_reqed requires that there is no userinfo component in
 862        # authority.  Assume there isn't one, since urllib2 does not (and
 863        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
 864        # userinfo.
 865        authority = req.get_host()
 866        return self.http_error_auth_reqed('proxy-authenticate',
 867                                          authority, req, headers)
 868
 869
 870def randombytes(n):
 871    """Return n random bytes."""
 872    # Use /dev/urandom if it is available.  Fall back to random module
 873    # if not.  It might be worthwhile to extend this function to use
 874    # other platform-specific mechanisms for getting random bytes.
 875    if os.path.exists("/dev/urandom"):
 876        f = open("/dev/urandom")
 877        s = f.read(n)
 878        f.close()
 879        return s
 880    else:
 881        L = [chr(random.randrange(0, 256)) for i in range(n)]
 882        return "".join(L)
 883
 884class AbstractDigestAuthHandler:
 885    # Digest authentication is specified in RFC 2617.
 886
 887    # XXX The client does not inspect the Authentication-Info header
 888    # in a successful response.
 889
 890    # XXX It should be possible to test this implementation against
 891    # a mock server that just generates a static set of challenges.
 892
 893    # XXX qop="auth-int" supports is shaky
 894
 895    def __init__(self, passwd=None):
 896        if passwd is None:
 897            passwd = HTTPPasswordMgr()
 898        self.passwd = passwd
 899        self.add_password = self.passwd.add_password
 900        self.retried = 0
 901        self.nonce_count = 0
 902
 903    def reset_retry_count(self):
 904        self.retried = 0
 905
 906    def http_error_auth_reqed(self, auth_header, host, req, headers):
 907        authreq = headers.get(auth_header, None)
 908        if self.retried > 5:
 909            # Don't fail endlessly - if we failed once, we'll probably
 910            # fail a second time. Hm. Unless the Password Manager is
 911            # prompting for the information. Crap. This isn't great
 912            # but it's better than the current 'repeat until recursion
 913            # depth exceeded' approach <wink>
 914            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
 915                            headers, None)
 916        else:
 917            self.retried += 1
 918        if authreq:
 919            scheme = authreq.split()[0]
 920            if scheme.lower() == 'digest':
 921                return self.retry_http_digest_auth(req, authreq)
 922
 923    def retry_http_digest_auth(self, req, auth):
 924        token, challenge = auth.split(' ', 1)
 925        chal = parse_keqv_list(parse_http_list(challenge))
 926        auth = self.get_authorization(req, chal)
 927        if auth:
 928            auth_val = 'Digest %s' % auth
 929            if req.headers.get(self.auth_header, None) == auth_val:
 930                return None
 931            req.add_unredirected_header(self.auth_header, auth_val)
 932            resp = self.parent.open(req, timeout=req.timeout)
 933            return resp
 934
 935    def get_cnonce(self, nonce):
 936        # The cnonce-value is an opaque
 937        # quoted string value provided by the client and used by both client
 938        # and server to avoid chosen plaintext attacks, to provide mutual
 939        # authentication, and to provide some message integrity protection.
 940        # This isn't a fabulous effort, but it's probably Good Enough.
 941        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
 942                                            randombytes(8))).hexdigest()
 943        return dig[:16]
 944
 945    def get_authorization(self, req, chal):
 946        try:
 947            realm = chal['realm']
 948            nonce = chal['nonce']
 949            qop = chal.get('qop')
 950            algorithm = chal.get('algorithm', 'MD5')
 951            # mod_digest doesn't send an opaque, even though it isn't
 952            # supposed to be optional
 953            opaque = chal.get('opaque', None)
 954        except KeyError:
 955            return None
 956
 957        H, KD = self.get_algorithm_impls(algorithm)
 958        if H is None:
 959            return None
 960
 961        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
 962        if user is None:
 963            return None
 964
 965        # XXX not implemented yet
 966        if req.has_data():
 967            entdig = self.get_entity_digest(req.get_data(), chal)
 968        else:
 969            entdig = None
 970
 971        A1 = "%s:%s:%s" % (user, realm, pw)
 972        A2 = "%s:%s" % (req.get_method(),
 973                        # XXX selector: what about proxies and full urls
 974                        req.get_selector())
 975        if qop == 'auth':
 976            self.nonce_count += 1
 977            ncvalue = '%08x' % self.nonce_count
 978            cnonce = self.get_cnonce(nonce)
 979            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
 980            respdig = KD(H(A1), noncebit)
 981        elif qop is None:
 982            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
 983        else:
 984            # XXX handle auth-int.
 985            raise URLError("qop '%s' is not supported." % qop)
 986
 987        # XXX should the partial digests be encoded too?
 988
 989        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
 990               'response="%s"' % (user, realm, nonce, req.get_selector(),
 991                                  respdig)
 992        if opaque:
 993            base += ', opaque="%s"' % opaque
 994        if entdig:
 995            base += ', digest="%s"' % entdig
 996        base += ', algorithm="%s"' % algorithm
 997        if qop:
 998            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
 999        return base
1000
1001    def get_algorithm_impls(self, algorithm):
1002        # algorithm should be case-insensitive according to RFC2617
1003        algorithm = algorithm.upper()
1004        # lambdas assume digest modules are imported at the top level
1005        if algorithm == 'MD5':
1006            H = lambda x: hashlib.md5(x).hexdigest()
1007        elif algorithm == 'SHA':
1008            H = lambda x: hashlib.sha1(x).hexdigest()
1009        # XXX MD5-sess
1010        KD = lambda s, d: H("%s:%s" % (s, d))
1011        return H, KD
1012
1013    def get_entity_digest(self, data, chal):
1014        # XXX not implemented yet
1015        return None
1016
1017
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The authority (netloc) component of the URL is what the
        # password manager keys credentials on.
        host = urlparse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        # Round trip finished (either way); allow future retries.
        self.reset_retry_count()
        return retry
1034
1035
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication support for proxies (407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For proxies, the request's host:port identifies the
        # protection space.
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           req.get_host(), req, headers)
        # Round trip finished (either way); allow future retries.
        self.reset_retry_count()
        return retry
1047
class AbstractHTTPHandler(BaseHandler):
    """Common machinery shared by the HTTP and HTTPS handlers."""

    def __init__(self, debuglevel=0):
        # httplib debug level, forwarded to each connection in do_open().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Set the debug level passed on to httplib connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fix up *request* before sending: add Content-type/length for
        requests with data, a Host header, and the opener's default
        headers.  Returns the (mutated) request.

        Raises URLError if the request has no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # When going through a proxy, the Host header must name the
        # target server (taken from the selector), not the proxy.
        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.get_selector())
            sel_host, sel_path = splithost(sel)

        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            # Opener-wide defaults never override per-request headers.
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        Raises URLError if the request has no host or the socket fails.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Per-request headers win over redirect-safe defaults.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (Title-Case) for the wire.
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            # CONNECT tunnelling for https-over-proxy (private httplib API).
            h._set_tunnel(req._tunnel_host)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err: # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
1141
1142
class HTTPHandler(AbstractHTTPHandler):
    """Handle http:// URLs using httplib.HTTPConnection."""

    def http_open(self, req):
        # Delegate to the shared machinery with a plain HTTP connection.
        return self.do_open(httplib.HTTPConnection, req)

    # Request fix-up is identical for http and https.
    http_request = AbstractHTTPHandler.do_request_
1149
# Only define HTTPSHandler when httplib was compiled with SSL support.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Handle https:// URLs using httplib.HTTPSConnection."""

        def https_open(self, req):
            return self.do_open(httplib.HTTPSConnection, req)

        # Request fix-up is identical for http and https.
        https_request = AbstractHTTPHandler.do_request_
1157
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest them from responses."""

    def __init__(self, cookiejar=None):
        # Imported lazily to avoid the cost when cookies aren't used.
        import cookielib
        if cookiejar is None:
            cookiejar = cookielib.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Add any stored cookies that match this request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Remember cookies the server set on this response.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1175
class UnknownHandler(BaseHandler):
    """Last-resort handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.get_type())
1180
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    result = {}
    for item in l:
        key, value = item.split('=', 1)
        # Strip one level of surrounding double quotes.
        if value[0] == '"' and value[-1] == '"':
            value = value[1:-1]
        result[key] = value
    return result
1190
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = ''
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # The previous char was a backslash inside a quoted-string:
            # take this character literally (the backslash is dropped).
            buf += ch
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf += ch
        elif ch == ',':
            # An unquoted comma ends the current element.
            items.append(buf)
            buf = ''
        else:
            if ch == '"':
                in_quotes = True
            buf += ch

    # Flush the trailing element, if any.
    if buf:
        items.append(buf)

    return [item.strip() for item in items]
1233
class FileHandler(BaseHandler):
    """Open file: URLs locally, deferring remote-looking ones to FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            # file://host/... with a non-empty host component: retry
            # the whole request as FTP.
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    # (class-level cache of addresses considered "this host")
    names = None
    def get_names(self):
        """Return the cached tuple of local IP addresses.

        Resolving gethostname() may fail (e.g. no DNS); fall back to
        localhost only in that case.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open the named local file and wrap it in an addinfourl with
        synthesized Content-type/length/Last-modified headers.

        Raises URLError when the file is missing or the URL's host does
        not resolve to this machine.
        """
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = mimetools.Message(StringIO(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified)))
            if host:
                host, port = splitport(host)
            # Serve the file only when no host was given, or the host
            # (with no explicit port) resolves to a local address.
            if not host or \
                (not port and socket.gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'),
                                  headers, 'file:'+file)
        except OSError, msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
1280
class FTPHandler(BaseHandler):
    """Open ftp: URLs via ftplib, honoring user:password@ and ;type= attrs."""

    def ftp_open(self, req):
        """Fetch an FTP URL and return an addinfourl for the data.

        Raises URLError for a missing host, resolution failure, or any
        ftplib error.
        """
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        # Split "path;type=a" style attributes off the selector, then
        # break the path into directory components + final filename.
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default: image (binary) transfer for files, directory
            # listing otherwise; a ";type=..." attribute overrides it.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            # Re-raise as URLError with the original traceback attached.
            raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Create a fresh ftpwrapper connection; CacheFTPHandler
        overrides this to pool connections."""
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
##        fw.ftp.set_debuglevel(1)
        return fw
1339
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps a pool of recently used FTP connections.

    Connections are keyed by (user, host, port, path, timeout); each is
    closed once idle for `delay` seconds, and the pool is trimmed when
    it reaches `max_conns` entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # key -> live ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached entries
        self.delay = 60      # idle lifetime in seconds
        self.max_conns = 16  # cap on pooled connections

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return a pooled ftpwrapper for this destination, creating one
        if needed and refreshing its expiry either way."""
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs,
                                         timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # Iterate over a snapshot since we delete while looping.
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # BUG FIX: min() raises ValueError on an empty sequence; only
        # update `soonest` while entries remain.
        if self.timeout:
            self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())