PageRenderTime 79ms CodeModel.GetById 18ms app.highlight 48ms RepoModel.GetById 1ms app.codeStats 1ms

/lib-python/2.7/cookielib.py

https://bitbucket.org/evelyn559/pypy
Python | 1794 lines | 1718 code | 25 blank | 51 comment | 79 complexity | e44e95a69089ddc4d506f329f6e663c8 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1"""HTTP cookie handling for web clients.
   2
   3This module has (now fairly distant) origins in Gisle Aas' Perl module
   4HTTP::Cookies, from the libwww-perl library.
   5
   6Docstrings, comments and debug strings in this code refer to the
   7attributes of the HTTP cookie system as cookie-attributes, to distinguish
   8them clearly from Python attributes.
   9
  10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
  11distributed with the Python standard library, but are available from
  12http://wwwsearch.sf.net/):
  13
  14                        CookieJar____
  15                        /     \      \
  16            FileCookieJar      \      \
  17             /    |   \         \      \
  18 MozillaCookieJar | LWPCookieJar \      \
  19                  |               |      \
  20                  |   ---MSIEBase |       \
  21                  |  /      |     |        \
  22                  | /   MSIEDBCookieJar BSDDBCookieJar
  23                  |/
  24               MSIECookieJar
  25
  26"""
  27
  28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
  29           'FileCookieJar', 'LWPCookieJar', 'lwp_cookie_str', 'LoadError',
  30           'MozillaCookieJar']
  31
  32import re, urlparse, copy, time, urllib
  33try:
  34    import threading as _threading
  35except ImportError:
  36    import dummy_threading as _threading
  37import httplib  # only for the default HTTP port
  38from calendar import timegm
  39
  40debug = False   # set to True to enable debugging via the logging module
  41logger = None
  42
  43def _debug(*args):
  44    if not debug:
  45        return
  46    global logger
  47    if not logger:
  48        import logging
  49        logger = logging.getLogger("cookielib")
  50    return logger.debug(*args)
  51
  52
  53DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
  54MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
  55                         "instance initialised with one)")
  56
  57def _warn_unhandled_exception():
  58    # There are a few catch-all except: statements in this module, for
  59    # catching input that's bad in unexpected ways.  Warn if any
  60    # exceptions are caught there.
  61    import warnings, traceback, StringIO
  62    f = StringIO.StringIO()
  63    traceback.print_exc(None, f)
  64    msg = f.getvalue()
  65    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
  66
  67
  68# Date/time conversion
  69# -----------------------------------------------------------------------------
  70
  71EPOCH_YEAR = 1970
  72def _timegm(tt):
  73    year, month, mday, hour, min, sec = tt[:6]
  74    if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
  75        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
  76        return timegm(tt)
  77    else:
  78        return None
  79
  80DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
  81MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
  82          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
  83MONTHS_LOWER = []
  84for month in MONTHS: MONTHS_LOWER.append(month.lower())
  85
  86def time2isoz(t=None):
  87    """Return a string representing time in seconds since epoch, t.
  88
  89    If the function is called without an argument, it will use the current
  90    time.
  91
  92    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
  93    representing Universal Time (UTC, aka GMT).  An example of this format is:
  94
  95    1994-11-24 08:49:37Z
  96
  97    """
  98    if t is None: t = time.time()
  99    year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
 100    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
 101        year, mon, mday, hour, min, sec)
 102
 103def time2netscape(t=None):
 104    """Return a string representing time in seconds since epoch, t.
 105
 106    If the function is called without an argument, it will use the current
 107    time.
 108
 109    The format of the returned string is like this:
 110
 111    Wed, DD-Mon-YYYY HH:MM:SS GMT
 112
 113    """
 114    if t is None: t = time.time()
 115    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
 116    return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
 117        DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
 118
 119
 120UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
 121
 122TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
 123def offset_from_tz_string(tz):
 124    offset = None
 125    if tz in UTC_ZONES:
 126        offset = 0
 127    else:
 128        m = TIMEZONE_RE.search(tz)
 129        if m:
 130            offset = 3600 * int(m.group(2))
 131            if m.group(3):
 132                offset = offset + 60 * int(m.group(3))
 133            if m.group(1) == '-':
 134                offset = -offset
 135    return offset
 136
 137def _str2time(day, mon, yr, hr, min, sec, tz):
 138    # translate month name to number
 139    # month numbers start with 1 (January)
 140    try:
 141        mon = MONTHS_LOWER.index(mon.lower())+1
 142    except ValueError:
 143        # maybe it's already a number
 144        try:
 145            imon = int(mon)
 146        except ValueError:
 147            return None
 148        if 1 <= imon <= 12:
 149            mon = imon
 150        else:
 151            return None
 152
 153    # make sure clock elements are defined
 154    if hr is None: hr = 0
 155    if min is None: min = 0
 156    if sec is None: sec = 0
 157
 158    yr = int(yr)
 159    day = int(day)
 160    hr = int(hr)
 161    min = int(min)
 162    sec = int(sec)
 163
 164    if yr < 1000:
 165        # find "obvious" year
 166        cur_yr = time.localtime(time.time())[0]
 167        m = cur_yr % 100
 168        tmp = yr
 169        yr = yr + cur_yr - m
 170        m = m - tmp
 171        if abs(m) > 50:
 172            if m > 0: yr = yr + 100
 173            else: yr = yr - 100
 174
 175    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
 176    t = _timegm((yr, mon, day, hr, min, sec, tz))
 177
 178    if t is not None:
 179        # adjust time using timezone string, to get absolute time since epoch
 180        if tz is None:
 181            tz = "UTC"
 182        tz = tz.upper()
 183        offset = offset_from_tz_string(tz)
 184        if offset is None:
 185            return None
 186        t = t - offset
 187
 188    return t
 189
 190STRICT_DATE_RE = re.compile(
 191    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
 192    "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
 193WEEKDAY_RE = re.compile(
 194    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
 195LOOSE_HTTP_DATE_RE = re.compile(
 196    r"""^
 197    (\d\d?)            # day
 198       (?:\s+|[-\/])
 199    (\w+)              # month
 200        (?:\s+|[-\/])
 201    (\d+)              # year
 202    (?:
 203          (?:\s+|:)    # separator before clock
 204       (\d\d?):(\d\d)  # hour:min
 205       (?::(\d\d))?    # optional seconds
 206    )?                 # optional clock
 207       \s*
 208    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
 209       \s*
 210    (?:\(\w+\))?       # ASCII representation of timezone in parens.
 211       \s*$""", re.X)
 212def http2time(text):
 213    """Returns time in seconds since epoch of time represented by a string.
 214
 215    Return value is an integer.
 216
 217    None is returned if the format of str is unrecognized, the time is outside
 218    the representable range, or the timezone string is not recognized.  If the
 219    string contains no timezone, UTC is assumed.
 220
 221    The timezone in the string may be numerical (like "-0800" or "+0100") or a
 222    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
 223    timezone strings equivalent to UTC (zero offset) are known to the function.
 224
 225    The function loosely parses the following formats:
 226
 227    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
 228    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
 229    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
 230    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
 231    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
 232    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)
 233
 234    The parser ignores leading and trailing whitespace.  The time may be
 235    absent.
 236
 237    If the year is given with only 2 digits, the function will select the
 238    century that makes the year closest to the current date.
 239
 240    """
 241    # fast exit for strictly conforming string
 242    m = STRICT_DATE_RE.search(text)
 243    if m:
 244        g = m.groups()
 245        mon = MONTHS_LOWER.index(g[1].lower()) + 1
 246        tt = (int(g[2]), mon, int(g[0]),
 247              int(g[3]), int(g[4]), float(g[5]))
 248        return _timegm(tt)
 249
 250    # No, we need some messy parsing...
 251
 252    # clean up
 253    text = text.lstrip()
 254    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday
 255
 256    # tz is time zone specifier string
 257    day, mon, yr, hr, min, sec, tz = [None]*7
 258
 259    # loose regexp parse
 260    m = LOOSE_HTTP_DATE_RE.search(text)
 261    if m is not None:
 262        day, mon, yr, hr, min, sec, tz = m.groups()
 263    else:
 264        return None  # bad format
 265
 266    return _str2time(day, mon, yr, hr, min, sec, tz)
 267
 268ISO_DATE_RE = re.compile(
 269    """^
 270    (\d{4})              # year
 271       [-\/]?
 272    (\d\d?)              # numerical month
 273       [-\/]?
 274    (\d\d?)              # day
 275   (?:
 276         (?:\s+|[-:Tt])  # separator before clock
 277      (\d\d?):?(\d\d)    # hour:min
 278      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
 279   )?                    # optional clock
 280      \s*
 281   ([-+]?\d\d?:?(:?\d\d)?
 282    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
 283      \s*$""", re.X)
 284def iso2time(text):
 285    """
 286    As for http2time, but parses the ISO 8601 formats:
 287
 288    1994-02-03 14:15:29 -0100    -- ISO 8601 format
 289    1994-02-03 14:15:29          -- zone is optional
 290    1994-02-03                   -- only date
 291    1994-02-03T14:15:29          -- Use T as separator
 292    19940203T141529Z             -- ISO 8601 compact format
 293    19940203                     -- only date
 294
 295    """
 296    # clean up
 297    text = text.lstrip()
 298
 299    # tz is time zone specifier string
 300    day, mon, yr, hr, min, sec, tz = [None]*7
 301
 302    # loose regexp parse
 303    m = ISO_DATE_RE.search(text)
 304    if m is not None:
 305        # XXX there's an extra bit of the timezone I'm ignoring here: is
 306        #   this the right thing to do?
 307        yr, mon, day, hr, min, sec, tz, _ = m.groups()
 308    else:
 309        return None  # bad format
 310
 311    return _str2time(day, mon, yr, hr, min, sec, tz)
 312
 313
 314# Header parsing
 315# -----------------------------------------------------------------------------
 316
 317def unmatched(match):
 318    """Return unmatched part of re.Match object."""
 319    start, end = match.span(0)
 320    return match.string[:start]+match.string[end:]
 321
 322HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
 323HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
 324HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
 325HEADER_ESCAPE_RE = re.compile(r"\\(.)")
 326def split_header_words(header_values):
 327    r"""Parse header values into a list of lists containing key,value pairs.
 328
 329    The function knows how to deal with ",", ";" and "=" as well as quoted
 330    values after "=".  A list of space separated tokens are parsed as if they
 331    were separated by ";".
 332
 333    If the header_values passed as argument contains multiple values, then they
 334    are treated as if they were a single value separated by comma ",".
 335
 336    This means that this function is useful for parsing header fields that
 337    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
 338    the requirement for tokens).
 339
 340      headers           = #header
 341      header            = (token | parameter) *( [";"] (token | parameter))
 342
 343      token             = 1*<any CHAR except CTLs or separators>
 344      separators        = "(" | ")" | "<" | ">" | "@"
 345                        | "," | ";" | ":" | "\" | <">
 346                        | "/" | "[" | "]" | "?" | "="
 347                        | "{" | "}" | SP | HT
 348
 349      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
 350      qdtext            = <any TEXT except <">>
 351      quoted-pair       = "\" CHAR
 352
 353      parameter         = attribute "=" value
 354      attribute         = token
 355      value             = token | quoted-string
 356
 357    Each header is represented by a list of key/value pairs.  The value for a
 358    simple token (not part of a parameter) is None.  Syntactically incorrect
 359    headers will not necessarily be parsed as you would want.
 360
 361    This is easier to describe with some examples:
 362
 363    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
 364    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
 365    >>> split_header_words(['text/html; charset="iso-8859-1"'])
 366    [[('text/html', None), ('charset', 'iso-8859-1')]]
 367    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
 368    [[('Basic', None), ('realm', '"foobar"')]]
 369
 370    """
 371    assert not isinstance(header_values, basestring)
 372    result = []
 373    for text in header_values:
 374        orig_text = text
 375        pairs = []
 376        while text:
 377            m = HEADER_TOKEN_RE.search(text)
 378            if m:
 379                text = unmatched(m)
 380                name = m.group(1)
 381                m = HEADER_QUOTED_VALUE_RE.search(text)
 382                if m:  # quoted value
 383                    text = unmatched(m)
 384                    value = m.group(1)
 385                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
 386                else:
 387                    m = HEADER_VALUE_RE.search(text)
 388                    if m:  # unquoted value
 389                        text = unmatched(m)
 390                        value = m.group(1)
 391                        value = value.rstrip()
 392                    else:
 393                        # no value, a lone token
 394                        value = None
 395                pairs.append((name, value))
 396            elif text.lstrip().startswith(","):
 397                # concatenated headers, as per RFC 2616 section 4.2
 398                text = text.lstrip()[1:]
 399                if pairs: result.append(pairs)
 400                pairs = []
 401            else:
 402                # skip junk
 403                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
 404                assert nr_junk_chars > 0, (
 405                    "split_header_words bug: '%s', '%s', %s" %
 406                    (orig_text, text, pairs))
 407                text = non_junk
 408        if pairs: result.append(pairs)
 409    return result
 410
 411HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
 412def join_header_words(lists):
 413    """Do the inverse (almost) of the conversion done by split_header_words.
 414
 415    Takes a list of lists of (key, value) pairs and produces a single header
 416    value.  Attribute values are quoted if needed.
 417
 418    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
 419    'text/plain; charset="iso-8859/1"'
 420    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
 421    'text/plain, charset="iso-8859/1"'
 422
 423    """
 424    headers = []
 425    for pairs in lists:
 426        attr = []
 427        for k, v in pairs:
 428            if v is not None:
 429                if not re.search(r"^\w+$", v):
 430                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
 431                    v = '"%s"' % v
 432                k = "%s=%s" % (k, v)
 433            attr.append(k)
 434        if attr: headers.append("; ".join(attr))
 435    return ", ".join(headers)
 436
 437def _strip_quotes(text):
 438    if text.startswith('"'):
 439        text = text[1:]
 440    if text.endswith('"'):
 441        text = text[:-1]
 442    return text
 443
 444def parse_ns_headers(ns_headers):
 445    """Ad-hoc parser for Netscape protocol cookie-attributes.
 446
 447    The old Netscape cookie format for Set-Cookie can for instance contain
 448    an unquoted "," in the expires field, so we have to use this ad-hoc
 449    parser instead of split_header_words.
 450
 451    XXX This may not make the best possible effort to parse all the crap
 452    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
 453    parser is probably better, so could do worse than following that if
 454    this ever gives any trouble.
 455
 456    Currently, this is also used for parsing RFC 2109 cookies.
 457
 458    """
 459    known_attrs = ("expires", "domain", "path", "secure",
 460                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
 461                   "version", "port", "max-age")
 462
 463    result = []
 464    for ns_header in ns_headers:
 465        pairs = []
 466        version_set = False
 467        for ii, param in enumerate(re.split(r";\s*", ns_header)):
 468            param = param.rstrip()
 469            if param == "": continue
 470            if "=" not in param:
 471                k, v = param, None
 472            else:
 473                k, v = re.split(r"\s*=\s*", param, 1)
 474                k = k.lstrip()
 475            if ii != 0:
 476                lc = k.lower()
 477                if lc in known_attrs:
 478                    k = lc
 479                if k == "version":
 480                    # This is an RFC 2109 cookie.
 481                    v = _strip_quotes(v)
 482                    version_set = True
 483                if k == "expires":
 484                    # convert expires date to seconds since epoch
 485                    v = http2time(_strip_quotes(v))  # None if invalid
 486            pairs.append((k, v))
 487
 488        if pairs:
 489            if not version_set:
 490                pairs.append(("version", "0"))
 491            result.append(pairs)
 492
 493    return result
 494
 495
 496IPV4_RE = re.compile(r"\.\d+$")
 497def is_HDN(text):
 498    """Return True if text is a host domain name."""
 499    # XXX
 500    # This may well be wrong.  Which RFC is HDN defined in, if any (for
 501    #  the purposes of RFC 2965)?
 502    # For the current implementation, what about IPv6?  Remember to look
 503    #  at other uses of IPV4_RE also, if change this.
 504    if IPV4_RE.search(text):
 505        return False
 506    if text == "":
 507        return False
 508    if text[0] == "." or text[-1] == ".":
 509        return False
 510    return True
 511
 512def domain_match(A, B):
 513    """Return True if domain A domain-matches domain B, according to RFC 2965.
 514
 515    A and B may be host domain names or IP addresses.
 516
 517    RFC 2965, section 1:
 518
 519    Host names can be specified either as an IP address or a HDN string.
 520    Sometimes we compare one host name with another.  (Such comparisons SHALL
 521    be case-insensitive.)  Host A's name domain-matches host B's if
 522
 523         *  their host name strings string-compare equal; or
 524
 525         * A is a HDN string and has the form NB, where N is a non-empty
 526            name string, B has the form .B', and B' is a HDN string.  (So,
 527            x.y.com domain-matches .Y.com but not Y.com.)
 528
 529    Note that domain-match is not a commutative operation: a.b.c.com
 530    domain-matches .c.com, but not the reverse.
 531
 532    """
 533    # Note that, if A or B are IP addresses, the only relevant part of the
 534    # definition of the domain-match algorithm is the direct string-compare.
 535    A = A.lower()
 536    B = B.lower()
 537    if A == B:
 538        return True
 539    if not is_HDN(A):
 540        return False
 541    i = A.rfind(B)
 542    if i == -1 or i == 0:
 543        # A does not have form NB, or N is the empty string
 544        return False
 545    if not B.startswith("."):
 546        return False
 547    if not is_HDN(B[1:]):
 548        return False
 549    return True
 550
 551def liberal_is_HDN(text):
 552    """Return True if text is a sort-of-like a host domain name.
 553
 554    For accepting/blocking domains.
 555
 556    """
 557    if IPV4_RE.search(text):
 558        return False
 559    return True
 560
 561def user_domain_match(A, B):
 562    """For blocking/accepting domains.
 563
 564    A and B may be host domain names or IP addresses.
 565
 566    """
 567    A = A.lower()
 568    B = B.lower()
 569    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
 570        if A == B:
 571            # equal IP addresses
 572            return True
 573        return False
 574    initial_dot = B.startswith(".")
 575    if initial_dot and A.endswith(B):
 576        return True
 577    if not initial_dot and A == B:
 578        return True
 579    return False
 580
 581cut_port_re = re.compile(r":\d+$")
 582def request_host(request):
 583    """Return request-host, as defined by RFC 2965.
 584
 585    Variation from RFC: returned value is lowercased, for convenient
 586    comparison.
 587
 588    """
 589    url = request.get_full_url()
 590    host = urlparse.urlparse(url)[1]
 591    if host == "":
 592        host = request.get_header("Host", "")
 593
 594    # remove port, if present
 595    host = cut_port_re.sub("", host, 1)
 596    return host.lower()
 597
 598def eff_request_host(request):
 599    """Return a tuple (request-host, effective request-host name).
 600
 601    As defined by RFC 2965, except both are lowercased.
 602
 603    """
 604    erhn = req_host = request_host(request)
 605    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
 606        erhn = req_host + ".local"
 607    return req_host, erhn
 608
 609def request_path(request):
 610    """Path component of request-URI, as defined by RFC 2965."""
 611    url = request.get_full_url()
 612    parts = urlparse.urlsplit(url)
 613    path = escape_path(parts.path)
 614    if not path.startswith("/"):
 615        # fix bad RFC 2396 absoluteURI
 616        path = "/" + path
 617    return path
 618
 619def request_port(request):
 620    host = request.get_host()
 621    i = host.find(':')
 622    if i >= 0:
 623        port = host[i+1:]
 624        try:
 625            int(port)
 626        except ValueError:
 627            _debug("nonnumeric port: '%s'", port)
 628            return None
 629    else:
 630        port = DEFAULT_HTTP_PORT
 631    return port
 632
 633# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
 634# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
 635HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
 636ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
 637def uppercase_escaped_char(match):
 638    return "%%%s" % match.group(1).upper()
 639def escape_path(path):
 640    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
 641    # There's no knowing what character encoding was used to create URLs
 642    # containing %-escapes, but since we have to pick one to escape invalid
 643    # path characters, we pick UTF-8, as recommended in the HTML 4.0
 644    # specification:
 645    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
 646    # And here, kind of: draft-fielding-uri-rfc2396bis-03
 647    # (And in draft IRI specification: draft-duerst-iri-05)
 648    # (And here, for new URI schemes: RFC 2718)
 649    if isinstance(path, unicode):
 650        path = path.encode("utf-8")
 651    path = urllib.quote(path, HTTP_PATH_SAFE)
 652    path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
 653    return path
 654
 655def reach(h):
 656    """Return reach of host h, as defined by RFC 2965, section 1.
 657
 658    The reach R of a host name H is defined as follows:
 659
 660       *  If
 661
 662          -  H is the host domain name of a host; and,
 663
 664          -  H has the form A.B; and
 665
 666          -  A has no embedded (that is, interior) dots; and
 667
 668          -  B has at least one embedded dot, or B is the string "local".
 669             then the reach of H is .B.
 670
 671       *  Otherwise, the reach of H is H.
 672
 673    >>> reach("www.acme.com")
 674    '.acme.com'
 675    >>> reach("acme.com")
 676    'acme.com'
 677    >>> reach("acme.local")
 678    '.local'
 679
 680    """
 681    i = h.find(".")
 682    if i >= 0:
 683        #a = h[:i]  # this line is only here to show what a is
 684        b = h[i+1:]
 685        i = b.find(".")
 686        if is_HDN(h) and (i >= 0 or b == "local"):
 687            return "."+b
 688    return h
 689
 690def is_third_party(request):
 691    """
 692
 693    RFC 2965, section 3.3.6:
 694
 695        An unverifiable transaction is to a third-party host if its request-
 696        host U does not domain-match the reach R of the request-host O in the
 697        origin transaction.
 698
 699    """
 700    req_host = request_host(request)
 701    if not domain_match(req_host, reach(request.get_origin_req_host())):
 702        return True
 703    else:
 704        return False
 705
 706
 707class Cookie:
 708    """HTTP Cookie.
 709
 710    This class represents both Netscape and RFC 2965 cookies.
 711
 712    This is deliberately a very simple class.  It just holds attributes.  It's
 713    possible to construct Cookie instances that don't comply with the cookie
 714    standards.  CookieJar.make_cookies is the factory function for Cookie
 715    objects -- it deals with cookie parsing, supplying defaults, and
 716    normalising to the representation used in this class.  CookiePolicy is
 717    responsible for checking them to see whether they should be accepted from
 718    and returned to the server.
 719
 720    Note that the port may be present in the headers, but unspecified ("Port"
 721    rather than"Port=80", for example); if this is the case, port is None.
 722
 723    """
 724
 725    def __init__(self, version, name, value,
 726                 port, port_specified,
 727                 domain, domain_specified, domain_initial_dot,
 728                 path, path_specified,
 729                 secure,
 730                 expires,
 731                 discard,
 732                 comment,
 733                 comment_url,
 734                 rest,
 735                 rfc2109=False,
 736                 ):
 737
 738        if version is not None: version = int(version)
 739        if expires is not None: expires = int(expires)
 740        if port is None and port_specified is True:
 741            raise ValueError("if port is None, port_specified must be false")
 742
 743        self.version = version
 744        self.name = name
 745        self.value = value
 746        self.port = port
 747        self.port_specified = port_specified
 748        # normalise case, as per RFC 2965 section 3.3.3
 749        self.domain = domain.lower()
 750        self.domain_specified = domain_specified
 751        # Sigh.  We need to know whether the domain given in the
 752        # cookie-attribute had an initial dot, in order to follow RFC 2965
 753        # (as clarified in draft errata).  Needed for the returned $Domain
 754        # value.
 755        self.domain_initial_dot = domain_initial_dot
 756        self.path = path
 757        self.path_specified = path_specified
 758        self.secure = secure
 759        self.expires = expires
 760        self.discard = discard
 761        self.comment = comment
 762        self.comment_url = comment_url
 763        self.rfc2109 = rfc2109
 764
 765        self._rest = copy.copy(rest)
 766
 767    def has_nonstandard_attr(self, name):
 768        return name in self._rest
 769    def get_nonstandard_attr(self, name, default=None):
 770        return self._rest.get(name, default)
 771    def set_nonstandard_attr(self, name, value):
 772        self._rest[name] = value
 773
 774    def is_expired(self, now=None):
 775        if now is None: now = time.time()
 776        if (self.expires is not None) and (self.expires <= now):
 777            return True
 778        return False
 779
 780    def __str__(self):
 781        if self.port is None: p = ""
 782        else: p = ":"+self.port
 783        limit = self.domain + p + self.path
 784        if self.value is not None:
 785            namevalue = "%s=%s" % (self.name, self.value)
 786        else:
 787            namevalue = self.name
 788        return "<Cookie %s for %s>" % (namevalue, limit)
 789
 790    def __repr__(self):
 791        args = []
 792        for name in ("version", "name", "value",
 793                     "port", "port_specified",
 794                     "domain", "domain_specified", "domain_initial_dot",
 795                     "path", "path_specified",
 796                     "secure", "expires", "discard", "comment", "comment_url",
 797                     ):
 798            attr = getattr(self, name)
 799            args.append("%s=%s" % (name, repr(attr)))
 800        args.append("rest=%s" % repr(self._rest))
 801        args.append("rfc2109=%s" % repr(self.rfc2109))
 802        return "Cookie(%s)" % ", ".join(args)
 803
 804
 805class CookiePolicy:
 806    """Defines which cookies get accepted from and returned to server.
 807
 808    May also modify cookies, though this is probably a bad idea.
 809
 810    The subclass DefaultCookiePolicy defines the standard rules for Netscape
 811    and RFC 2965 cookies -- override that if you want a customised policy.
 812
 813    """
 814    def set_ok(self, cookie, request):
 815        """Return true if (and only if) cookie should be accepted from server.
 816
 817        Currently, pre-expired cookies never get this far -- the CookieJar
 818        class deletes such cookies itself.
 819
 820        """
 821        raise NotImplementedError()
 822
 823    def return_ok(self, cookie, request):
 824        """Return true if (and only if) cookie should be returned to server."""
 825        raise NotImplementedError()
 826
 827    def domain_return_ok(self, domain, request):
 828        """Return false if cookies should not be returned, given cookie domain.
 829        """
 830        return True
 831
 832    def path_return_ok(self, path, request):
 833        """Return false if cookies should not be returned, given cookie path.
 834        """
 835        return True
 836
 837
 838class DefaultCookiePolicy(CookiePolicy):
 839    """Implements the standard rules for accepting and returning cookies."""
 840
 841    DomainStrictNoDots = 1
 842    DomainStrictNonDomain = 2
 843    DomainRFC2965Match = 4
 844
 845    DomainLiberal = 0
 846    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
 847
 848    def __init__(self,
 849                 blocked_domains=None, allowed_domains=None,
 850                 netscape=True, rfc2965=False,
 851                 rfc2109_as_netscape=None,
 852                 hide_cookie2=False,
 853                 strict_domain=False,
 854                 strict_rfc2965_unverifiable=True,
 855                 strict_ns_unverifiable=False,
 856                 strict_ns_domain=DomainLiberal,
 857                 strict_ns_set_initial_dollar=False,
 858                 strict_ns_set_path=False,
 859                 ):
 860        """Constructor arguments should be passed as keyword arguments only."""
 861        self.netscape = netscape
 862        self.rfc2965 = rfc2965
 863        self.rfc2109_as_netscape = rfc2109_as_netscape
 864        self.hide_cookie2 = hide_cookie2
 865        self.strict_domain = strict_domain
 866        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
 867        self.strict_ns_unverifiable = strict_ns_unverifiable
 868        self.strict_ns_domain = strict_ns_domain
 869        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
 870        self.strict_ns_set_path = strict_ns_set_path
 871
 872        if blocked_domains is not None:
 873            self._blocked_domains = tuple(blocked_domains)
 874        else:
 875            self._blocked_domains = ()
 876
 877        if allowed_domains is not None:
 878            allowed_domains = tuple(allowed_domains)
 879        self._allowed_domains = allowed_domains
 880
 881    def blocked_domains(self):
 882        """Return the sequence of blocked domains (as a tuple)."""
 883        return self._blocked_domains
 884    def set_blocked_domains(self, blocked_domains):
 885        """Set the sequence of blocked domains."""
 886        self._blocked_domains = tuple(blocked_domains)
 887
 888    def is_blocked(self, domain):
 889        for blocked_domain in self._blocked_domains:
 890            if user_domain_match(domain, blocked_domain):
 891                return True
 892        return False
 893
 894    def allowed_domains(self):
 895        """Return None, or the sequence of allowed domains (as a tuple)."""
 896        return self._allowed_domains
 897    def set_allowed_domains(self, allowed_domains):
 898        """Set the sequence of allowed domains, or None."""
 899        if allowed_domains is not None:
 900            allowed_domains = tuple(allowed_domains)
 901        self._allowed_domains = allowed_domains
 902
 903    def is_not_allowed(self, domain):
 904        if self._allowed_domains is None:
 905            return False
 906        for allowed_domain in self._allowed_domains:
 907            if user_domain_match(domain, allowed_domain):
 908                return False
 909        return True
 910
 911    def set_ok(self, cookie, request):
 912        """
 913        If you override .set_ok(), be sure to call this method.  If it returns
 914        false, so should your subclass (assuming your subclass wants to be more
 915        strict about which cookies to accept).
 916
 917        """
 918        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
 919
 920        assert cookie.name is not None
 921
 922        for n in "version", "verifiability", "name", "path", "domain", "port":
 923            fn_name = "set_ok_"+n
 924            fn = getattr(self, fn_name)
 925            if not fn(cookie, request):
 926                return False
 927
 928        return True
 929
 930    def set_ok_version(self, cookie, request):
 931        if cookie.version is None:
 932            # Version is always set to 0 by parse_ns_headers if it's a Netscape
 933            # cookie, so this must be an invalid RFC 2965 cookie.
 934            _debug("   Set-Cookie2 without version attribute (%s=%s)",
 935                   cookie.name, cookie.value)
 936            return False
 937        if cookie.version > 0 and not self.rfc2965:
 938            _debug("   RFC 2965 cookies are switched off")
 939            return False
 940        elif cookie.version == 0 and not self.netscape:
 941            _debug("   Netscape cookies are switched off")
 942            return False
 943        return True
 944
 945    def set_ok_verifiability(self, cookie, request):
 946        if request.is_unverifiable() and is_third_party(request):
 947            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
 948                _debug("   third-party RFC 2965 cookie during "
 949                             "unverifiable transaction")
 950                return False
 951            elif cookie.version == 0 and self.strict_ns_unverifiable:
 952                _debug("   third-party Netscape cookie during "
 953                             "unverifiable transaction")
 954                return False
 955        return True
 956
 957    def set_ok_name(self, cookie, request):
 958        # Try and stop servers setting V0 cookies designed to hack other
 959        # servers that know both V0 and V1 protocols.
 960        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
 961            cookie.name.startswith("$")):
 962            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
 963            return False
 964        return True
 965
 966    def set_ok_path(self, cookie, request):
 967        if cookie.path_specified:
 968            req_path = request_path(request)
 969            if ((cookie.version > 0 or
 970                 (cookie.version == 0 and self.strict_ns_set_path)) and
 971                not req_path.startswith(cookie.path)):
 972                _debug("   path attribute %s is not a prefix of request "
 973                       "path %s", cookie.path, req_path)
 974                return False
 975        return True
 976
 977    def set_ok_domain(self, cookie, request):
 978        if self.is_blocked(cookie.domain):
 979            _debug("   domain %s is in user block-list", cookie.domain)
 980            return False
 981        if self.is_not_allowed(cookie.domain):
 982            _debug("   domain %s is not in user allow-list", cookie.domain)
 983            return False
 984        if cookie.domain_specified:
 985            req_host, erhn = eff_request_host(request)
 986            domain = cookie.domain
 987            if self.strict_domain and (domain.count(".") >= 2):
 988                # XXX This should probably be compared with the Konqueror
 989                # (kcookiejar.cpp) and Mozilla implementations, but it's a
 990                # losing battle.
 991                i = domain.rfind(".")
 992                j = domain.rfind(".", 0, i)
 993                if j == 0:  # domain like .foo.bar
 994                    tld = domain[i+1:]
 995                    sld = domain[j+1:i]
 996                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
 997                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
 998                       "info", "jobs", "mobi", "museum", "name", "pro",
 999                       "travel", "eu") and len(tld) == 2:
1000                        # domain like .co.uk
1001                        _debug("   country-code second level domain %s", domain)
1002                        return False
1003            if domain.startswith("."):
1004                undotted_domain = domain[1:]
1005            else:
1006                undotted_domain = domain
1007            embedded_dots = (undotted_domain.find(".") >= 0)
1008            if not embedded_dots and domain != ".local":
1009                _debug("   non-local domain %s contains no embedded dot",
1010                       domain)
1011                return False
1012            if cookie.version == 0:
1013                if (not erhn.endswith(domain) and
1014                    (not erhn.startswith(".") and
1015                     not ("."+erhn).endswith(domain))):
1016                    _debug("   effective request-host %s (even with added "
1017                           "initial dot) does not end end with %s",
1018                           erhn, domain)
1019                    return False
1020            if (cookie.version > 0 or
1021                (self.strict_ns_domain & self.DomainRFC2965Match)):
1022                if not domain_match(erhn, domain):
1023                    _debug("   effective request-host %s does not domain-match "
1024                           "%s", erhn, domain)
1025                    return False
1026            if (cookie.version > 0 or
1027                (self.strict_ns_domain & self.DomainStrictNoDots)):
1028                host_prefix = req_host[:-len(domain)]
1029                if (host_prefix.find(".") >= 0 and
1030                    not IPV4_RE.search(req_host)):
1031                    _debug("   host prefix %s for domain %s contains a dot",
1032                           host_prefix, domain)
1033                    return False
1034        return True
1035
1036    def set_ok_port(self, cookie, request):
1037        if cookie.port_specified:
1038            req_port = request_port(request)
1039            if req_port is None:
1040                req_port = "80"
1041            else:
1042                req_port = str(req_port)
1043            for p in cookie.port.split(","):
1044                try:
1045                    int(p)
1046                except ValueError:
1047                    _debug("   bad port %s (not numeric)", p)
1048                    return False
1049                if p == req_port:
1050                    break
1051            else:
1052                _debug("   request port (%s) not found in %s",
1053                       req_port, cookie.port)
1054                return False
1055        return True
1056
1057    def return_ok(self, cookie, request):
1058        """
1059        If you override .return_ok(), be sure to call this method.  If it
1060        returns false, so should your subclass (assuming your subclass wants to
1061        be more strict about which cookies to return).
1062
1063        """
1064        # Path has already been checked by .path_return_ok(), and domain
1065        # blocking done by .domain_return_ok().
1066        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1067
1068        for n in "version", "verifiability", "secure", "expires", "port", "domain":
1069            fn_name = "return_ok_"+n
1070            fn = getattr(self, fn_name)
1071            if not fn(cookie, request):
1072                return False
1073        return True
1074
1075    def return_ok_version(self, cookie, request):
1076        if cookie.version > 0 and not self.rfc2965:
1077            _debug("   RFC 2965 cookies are switched off")
1078            return False
1079        elif cookie.version == 0 and not self.netscape:
1080            _debug("   Netscape cookies are switched off")
1081            return False
1082        return True
1083
1084    def return_ok_verifiability(self, cookie, request):
1085        if request.is_unverifiable() and is_third_party(request):
1086            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1087                _debug("   third-party RFC 2965 cookie during unverifiable "
1088                       "transaction")
1089                return False
1090            elif cookie.version == 0 and self.strict_ns_unverifiable:
1091                _debug("   third-party Netscape cookie during unverifiable "
1092                       "transaction")
1093                return False
1094        return True
1095
1096    def return_ok_secure(self, cookie, request):
1097        if cookie.secure and request.get_type() != "https":
1098            _debug("   secure cookie with non-secure request")
1099            return False
1100        return True
1101
1102    def return_ok_expires(self, cookie, request):
1103        if cookie.is_expired(self._now):
1104            _debug("   cookie expired")
1105            return False
1106        return True
1107
1108    def return_ok_port(self, cookie, request):
1109        if cookie.port:
1110            req_port = request_port(request)
1111            if req_port is None:
1112                req_port = "80"
1113            for p in cookie.port.split(","):
1114                if p == req_port:
1115                    break
1116            else:
1117                _debug("   request port %s does not match cookie port %s",
1118                       req_port, cookie.port)
1119                return False
1120        return True
1121
1122    def return_ok_domain(self, cookie, request):
1123        req_host, erhn = eff_request_host(request)
1124        domain = cookie.domain
1125
1126        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1127        if (cookie.version == 0 and
1128            (self.strict_ns_domain & self.DomainStrictNonDomain) and
1129            not cookie.domain_specified and domain != erhn):
1130            _debug("   cookie with unspecified domain does not string-compare "
1131                   "equal to request domain")
1132            return False
1133
1134        if cookie.version > 0 and not domain_match(erhn, domain):
1135            _debug("   effective request-host name %s does not domain-match "
1136                   "RFC 2965 cookie domain %s", erhn, domain)
1137            return False
1138        if cookie.version == 0 and not ("."+erhn).endswith(domain):
1139            _debug("   request-host %s does not match Netscape cookie domain "
1140                   "%s", req_host, domain)
1141            return False
1142        return True
1143
1144    def domain_return_ok(self, domain, request):
1145        # Liberal check of.  This is here as an optimization to avoid
1146        # having to load lots of MSIE cookie files unless necessary.
1147        req_host, erhn = eff_request_host(request)
1148        if not req_host.startswith("."):
1149            req_host = "."+req_host
1150        if not erhn.startswith("."):
1151            erhn = "."+erhn
1152        if not (req_host.endswith(domain) or erhn.endswith(domain)):
1153            #_debug("   request domain %s does not match cookie domain %s",
1154            #       req_host, domain)
1155            return False
1156
1157        if self.is_blocked(domain):
1158            _debug("   domain %s is in user block-list", domain)
1159            return False
1160        if self.is_not_allowed(domain):
1161            _debug("   domain %s is not in user allow-list", domain)
1162            return False
1163
1164        return True
1165
1166    def path_return_ok(self, path, request):
1167        _debug("- checking cookie path=%s", path)
1168        req_path = request_path(request)
1169        if not req_path.startswith(path):
1170            _debug("  %s does not path-match %s", req_path, path)
1171            return False
1172        return True
1173
1174
def vals_sorted_by_key(adict):
    """Return the values of adict as a list, ordered by ascending key.

    The keys are ordered with sorted() instead of an in-place keys().sort(),
    so mappings whose keys() does not return a list are handled as well.
    """
    return [adict.get(key) for key in sorted(adict.keys())]
1179
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first, keys sorted.

    Any value that exposes an ``items`` attribute is treated as a nested
    mapping and descended into; every other value is yielded as-is.
    """
    for obj in vals_sorted_by_key(mapping):
        try:
            obj.items
        except AttributeError:
            is_leaf = True
        else:
            is_leaf = False
        if is_leaf:
            yield obj
        else:
            for subobj in deepvalues(obj):
                yield subobj
1195
1196
class Absent:
    """Sentinel for the second parameter of dict.get().

    Lets callers distinguish an absent dict key from one with a None value.
    """
1200
1201class CookieJar:
1202    """Collection of HTTP cookies.
1203
1204    You may not need to know about this class: try
1205    urllib2.build_opener(HTTPCookieProcessor).open(url).
1206
1207    """
1208
1209    non_word_re = re.compile(r"\W")
1210    quote_re = re.compile(r"([\"\\])")
1211    strict_domain_re = re.compile(r"\.?[^.]*")
1212    domain_re = re.compile(r"[^.]*")
1213    dots_re = re.compile(r"^\.+")
1214
1215    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
1216
1217    def __init__(self, policy=None):
1218        if policy is None:
1219            policy = DefaultCookiePolicy()
1220        self._policy = policy
1221
1222        self._cookies_lock = _threading.RLock()
1223        self._cookies = {}
1224
1225    def set_policy(self, policy):
1226        self._policy = policy
1227
1228    def _cookies_for_domain(self, domain, request):
1229        cookies = []
1230        if not self._policy.domain_return_ok(domain, request):
1231            return []
1232        _debug("Checking %s for cookies to return", domain)
1233        cookies_by_path = self._cookies[domain]
1234        for path in cookies_by_path.keys():
1235            if not self._policy.path_return_ok(path, request):
1236                continue
1237            cookies_by_name = cookies_by_path[path]
1238            for cookie in cookies_by_name.values():
1239                if not self._policy.return_ok(cookie, request):
1240                    _debug("   not returning cookie")
1241                    continue
1242                _debug("   it's a match")
1243                cookies.append(cookie)
1244        return cookies
1245
1246    def _cookies_for_request(self, request):
1247        """Return a list of cookies to be returned to server."""
1248        cookies = []
1249        for domain in self._cookies.keys():
1250            cookies.extend(self._cookies_for_domain(domain, request))
1251        return cookies
1252
1253    def _cookie_attrs(self, cookies):
1254        """Return a list of cookie-attributes to be returned to server.
1255
1256        like ['foo="bar"; $Path="/"', ...]
1257
1258        The $Version attribute is also added when appropriate (currently only
1259        once per request).
1260
1261        """
1262        # add cookies in order of most specific (ie. longest) path first
1263        cookies.sort(key=lambda arg: len(arg.path), reverse=True)
1264
1265        version_set = False
1266
1267        attrs = []
1268        for cookie in cookies:
1269            # set version of Cookie header
1270            # XXX
1271            # What should it be if multiple matching Set-Cookie headers have
1272            #  different versions themselves?
1273            # Answer: there is no answer; was supposed to be settled by
1274            #  RFC 2965 errata, but that may never appear...
1275            version = cookie.version
1276            if not version_set:
1277                version_set = True
1278                if version > 0:
1279                    attrs.append("$Version=%s" % version)
1280
1281            # quote cookie value if necessary
1282            # (not for Netscape protocol, which already has any quotes
1283            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1284            if ((cookie.value is not None) and
1285                self.non_word_re.search(cookie.value) and version > 0):
1286                value = self.quote_re.sub(r"\\\1", cookie.value)
1287            else:
1288                value = cookie.value
1289
1290            # add cookie-attributes to be returned in Cookie header
1291            if cookie.value is None:
1292                attrs.append(cookie.name)
1293            else:
1294                attrs.append("%s=%s" % (cookie.name, value))
1295            if version > 0:
1296                if cookie.path_specified:
1297                    attrs.append('$Path="%s"' % cookie.path)
1298                if cookie.domain.startswith("."):
1299                    domain = cookie.domain
1300                    if (not cookie.domain_initial_dot and
1301                        domain.startswith(".")):
1302                        domain = domain[1:]
1303                    attrs.append('$Domain="%s"' % domain)
1304                if cookie.port is not None:
1305                    p = "$Port"
1306                    if cookie.port_specified:
1307                        p = p + ('="%s"' % cookie.port)
1308                    attrs.append(p)
1309
1310        return attrs
1311
1312    def add_cookie_header(self, request):
1313        """Add correct Cookie: header to request (urllib2.Request object).
1314
1315        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1316
1317        """
1318        _debug("add_cookie_header")
1319        self._cookies_lock.acquire()
1320        try:
1321
1322            self._policy._now = self._now = int(time.time())
1323
1324            cookies = self._cookies_for_request(request)
1325
1326            attrs = self._cookie_attrs(cookies)
1327            if attrs:
1328                if not request.has_header("Cookie"):
1329                    request.add_unredirected_header(
1330                        "Cookie", "; ".join(attrs))
1331
1332            # if necessary, advertise that we know RFC 2965
1333            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1334                not request.has_header("Cookie2")):
1335                for cookie in cookies:
1336                    if cookie.version != 1:
1337                        request.add_unredirected_header("Cookie2", '$Version="1"')
1338                        break
1339
1340        finally:
1341            self._cookies_lock.release()
1342
1343        self.clear_expired_cookies()
1344
1345    def _normalized_cookie_tuples(self, attrs_set):
1346        """Return list of tuples containing normalised cookie information.
1347
1348        attrs_set is the list of lists of key,value pairs extracted from
1349        the Set-Cookie or Set-Cookie2 headers.
1350
1351        Tuples are name, value, standard, rest, where name and value are the
1352        cookie name and value, standard is a dictionary containing the standard
1353        cookie-attributes (discard, secure, version, expires or max-age,
1354        domain, path and port) and rest is a dictionary containing the rest of
1355        the cookie-attributes.
1356
1357        """
1358        cookie_tuples = []
1359
1360        boolean_attrs = "discard", "secure"
1361        value_attrs = ("version",
1362                       "expires", "max-age",
1363                       "domain", "path", "port",
1364                       "comment", "commenturl")
1365
1366        for cookie_attrs in attrs_set:
1367            name, value = cookie_attrs[0]
1368
1369            # Build dictionary of standard cookie-attributes (standard) and
1370            # dictionary of other cookie-attributes (rest).
1371
1372            # Note: expiry time is normalised to seconds since epoch.  V0
1373            # cookies should have the Expires cookie-attribute, and V1 cookies
1374            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1375            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1376            # accept either (but prefer Max-Age).
1377            max_age_set = False
1378
1379            bad_cookie = False
1380
1381            standard = {}
1382            rest = {}
1383            for k, v in cookie_attrs[1:]:
1384                lc = k.lower()
1385                # don't lose case distinction for unknown fields
1386                if lc in value_attrs or lc in boolean_attrs:
1387                    k = lc
1388                if k in boolean_attrs and v is None:
1389                    # boolean cookie-attribute is present, but has no value
1390                    # (like "discard", rather than "port=80")
1391                    v = True
1392                if k in standard:
1393                    # only first value is significant
1394                    continue
1395                if k == "domain":
1396                    if v is None:
1397                        _debug("   missing value for domain attribute")
1398                        bad_cookie = True
1399                        break
1400                    # RFC 2965 section 3.3.3
1401                    v = v.lower()
1402                if k == "expires":
1403                    if max_age_set:
1404                        # Prefer max-age to expires (like Mozilla)…

Large files are truncated, but you can click here to view the full file