/python/lib/Lib/xml/Uri.py
Python | 380 lines | 291 code | 15 blank | 74 comment | 42 complexity | 4579b0837a49e0dbd57dd89412706c39 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0, MPL-2.0-no-copyleft-exception, MIT, EPL-1.0, AGPL-1.0
- # pylint: disable-msg=C0103
- #
- # backported code from 4Suite with slight modifications, started from r1.89 of
- # Ft/Lib/Uri.py, by syt@logilab.fr on 2005-02-09
- #
- # part if not all of this code should probably move to urlparse (or be used
- # to fix some existant functions in this module)
- #
- #
- # Copyright 2004 Fourthought, Inc. (USA).
- # Detailed license and copyright information: http://4suite.org/COPYRIGHT
- # Project home, documentation, distributions: http://4suite.org/
- import os.path
- import sys
- import re
- import urlparse, urllib, urllib2
- def UnsplitUriRef(uriRefSeq):
- """should replace urlparse.urlunsplit
- Given a sequence as would be produced by SplitUriRef(), assembles and
- returns a URI reference as a string.
- """
- if not isinstance(uriRefSeq, (tuple, list)):
- raise TypeError("sequence expected, got %s" % type(uriRefSeq))
- (scheme, authority, path, query, fragment) = uriRefSeq
- uri = ''
- if scheme is not None:
- uri += scheme + ':'
- if authority is not None:
- uri += '//' + authority
- uri += path
- if query is not None:
- uri += '?' + query
- if fragment is not None:
- uri += '#' + fragment
- return uri
- SPLIT_URI_REF_PATTERN = re.compile(r"^(?:(?P<scheme>[^:/?#]+):)?(?://(?P<authority>[^/?#]*))?(?P<path>[^?#]*)(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
- def SplitUriRef(uriref):
- """should replace urlparse.urlsplit
- Given a valid URI reference as a string, returns a tuple representing the
- generic URI components, as per RFC 2396 appendix B. The tuple's structure
- is (scheme, authority, path, query, fragment).
- All values will be strings (possibly empty) or None if undefined.
- Note that per rfc3986, there is no distinction between a path and
- an "opaque part", as there was in RFC 2396.
- """
- # the pattern will match every possible string, so it's safe to
- # assume there's a groupdict method to call.
- g = SPLIT_URI_REF_PATTERN.match(uriref).groupdict()
- scheme = g['scheme']
- authority = g['authority']
- path = g['path']
- query = g['query']
- fragment = g['fragment']
- return (scheme, authority, path, query, fragment)
- def Absolutize(uriRef, baseUri):
- """
- Resolves a URI reference to absolute form, effecting the result of RFC
- 3986 section 5. The URI reference is considered to be relative to the
- given base URI.
- It is the caller's responsibility to ensure that the base URI matches
- the absolute-URI syntax rule of RFC 3986, and that its path component
- does not contain '.' or '..' segments if the scheme is hierarchical.
- Unexpected results may occur otherwise.
- This function only conducts a minimal sanity check in order to determine
- if relative resolution is possible: it raises a UriException if the base
- URI does not have a scheme component. While it is true that the base URI
- is irrelevant if the URI reference has a scheme, an exception is raised
- in order to signal that the given string does not even come close to
- meeting the criteria to be usable as a base URI.
- It is the caller's responsibility to make a determination of whether the
- URI reference constitutes a "same-document reference", as defined in RFC
- 2396 or RFC 3986. As per the spec, dereferencing a same-document
- reference "should not" involve retrieval of a new representation of the
- referenced resource. Note that the two specs have different definitions
- of same-document reference: RFC 2396 says it is *only* the cases where the
- reference is the empty string, or "#" followed by a fragment; RFC 3986
- requires making a comparison of the base URI to the absolute form of the
- reference (as is returned by the spec), minus its fragment component,
- if any.
- This function is similar to urlparse.urljoin() and urllib.basejoin().
- Those functions, however, are (as of Python 2.3) outdated, buggy, and/or
- designed to produce results acceptable for use with other core Python
- libraries, rather than being earnest implementations of the relevant
- specs. Their problems are most noticeable in their handling of
- same-document references and 'file:' URIs, both being situations that
- come up far too often to consider the functions reliable enough for
- general use.
- """
- # Reasons to avoid using urllib.basejoin() and urlparse.urljoin():
- # - Both are partial implementations of long-obsolete specs.
- # - Both accept relative URLs as the base, which no spec allows.
- # - urllib.basejoin() mishandles the '' and '..' references.
- # - If the base URL uses a non-hierarchical or relative path,
- # or if the URL scheme is unrecognized, the result is not
- # always as expected (partly due to issues in RFC 1808).
- # - If the authority component of a 'file' URI is empty,
- # the authority component is removed altogether. If it was
- # not present, an empty authority component is in the result.
- # - '.' and '..' segments are not always collapsed as well as they
- # should be (partly due to issues in RFC 1808).
- # - Effective Python 2.4, urllib.basejoin() *is* urlparse.urljoin(),
- # but urlparse.urljoin() is still based on RFC 1808.
- # This procedure is based on the pseudocode in RFC 3986 sec. 5.2.
- #
- # ensure base URI is absolute
- if not baseUri:
- raise ValueError('baseUri is required and must be a non empty string')
- if not IsAbsolute(baseUri):
- raise ValueError('%r is not an absolute URI' % baseUri)
- # shortcut for the simplest same-document reference cases
- if uriRef == '' or uriRef[0] == '#':
- return baseUri.split('#')[0] + uriRef
- # ensure a clean slate
- tScheme = tAuth = tPath = tQuery = None
- # parse the reference into its components
- (rScheme, rAuth, rPath, rQuery, rFrag) = SplitUriRef(uriRef)
- # if the reference is absolute, eliminate '.' and '..' path segments
- # and skip to the end
- if rScheme is not None:
- tScheme = rScheme
- tAuth = rAuth
- tPath = RemoveDotSegments(rPath)
- tQuery = rQuery
- else:
- # the base URI's scheme, and possibly more, will be inherited
- (bScheme, bAuth, bPath, bQuery, bFrag) = SplitUriRef(baseUri)
- # if the reference is a net-path, just eliminate '.' and '..' path
- # segments; no other changes needed.
- if rAuth is not None:
- tAuth = rAuth
- tPath = RemoveDotSegments(rPath)
- tQuery = rQuery
- # if it's not a net-path, we need to inherit pieces of the base URI
- else:
- # use base URI's path if the reference's path is empty
- if not rPath:
- tPath = bPath
- # use the reference's query, if any, or else the base URI's,
- tQuery = rQuery is not None and rQuery or bQuery
- # the reference's path is not empty
- else:
- # just use the reference's path if it's absolute
- if rPath[0] == '/':
- tPath = RemoveDotSegments(rPath)
- # merge the reference's relative path with the base URI's path
- else:
- if bAuth is not None and not bPath:
- tPath = '/' + rPath
- else:
- tPath = bPath[:bPath.rfind('/')+1] + rPath
- tPath = RemoveDotSegments(tPath)
- # use the reference's query
- tQuery = rQuery
- # since the reference isn't a net-path,
- # use the authority from the base URI
- tAuth = bAuth
- # inherit the scheme from the base URI
- tScheme = bScheme
- # always use the reference's fragment (but no need to define another var)
- #tFrag = rFrag
- # now compose the target URI (RFC 3986 sec. 5.3)
- return UnsplitUriRef((tScheme, tAuth, tPath, tQuery, rFrag))
- REG_NAME_HOST_PATTERN = re.compile(r"^(?:(?:[0-9A-Za-z\-_\.!~*'();&=+$,]|(?:%[0-9A-Fa-f]{2}))*)$")
- def MakeUrllibSafe(uriRef):
- """
- Makes the given RFC 3986-conformant URI reference safe for passing
- to legacy urllib functions. The result may not be a valid URI.
- As of Python 2.3.3, urllib.urlopen() does not fully support
- internationalized domain names, it does not strip fragment components,
- and on Windows, it expects file URIs to use '|' instead of ':' in the
- path component corresponding to the drivespec. It also relies on
- urllib.unquote(), which mishandles unicode arguments. This function
- produces a URI reference that will work around these issues, although
- the IDN workaround is limited to Python 2.3 only. May raise a
- UnicodeEncodeError if the URI reference is Unicode and erroneously
- contains non-ASCII characters.
- """
- # IDN support requires decoding any percent-encoded octets in the
- # host part (if it's a reg-name) of the authority component, and when
- # doing DNS lookups, applying IDNA encoding to that string first.
- # As of Python 2.3, there is an IDNA codec, and the socket and httplib
- # modules accept Unicode strings and apply IDNA encoding automatically
- # where necessary. However, urllib.urlopen() has not yet been updated
- # to do the same; it raises an exception if you give it a Unicode
- # string, and does no conversion on non-Unicode strings, meaning you
- # have to give it an IDNA string yourself. We will only support it on
- # Python 2.3 and up.
- #
- # see if host is a reg-name, as opposed to IPv4 or IPv6 addr.
- if isinstance(uriRef, unicode):
- try:
- uriRef = uriRef.encode('us-ascii') # parts of urllib are not unicode safe
- except UnicodeError:
- raise ValueError("uri %r must consist of ASCII characters." % uriRef)
- (scheme, auth, path, query, frag) = urlparse.urlsplit(uriRef)
- if auth and auth.find('@') > -1:
- userinfo, hostport = auth.split('@')
- else:
- userinfo = None
- hostport = auth
- if hostport and hostport.find(':') > -1:
- host, port = hostport.split(':')
- else:
- host = hostport
- port = None
- if host and REG_NAME_HOST_PATTERN.match(host):
- # percent-encoded hostnames will always fail DNS lookups
- host = urllib.unquote(host) #PercentDecode(host)
- # IDNA-encode if possible.
- # We shouldn't do this for schemes that don't need DNS lookup,
- # but are there any (that you'd be calling urlopen for)?
- if sys.version_info[0:2] >= (2, 3):
- if isinstance(host, str):
- host = host.decode('utf-8')
- host = host.encode('idna')
- # reassemble the authority with the new hostname
- # (percent-decoded, and possibly IDNA-encoded)
- auth = ''
- if userinfo:
- auth += userinfo + '@'
- auth += host
- if port:
- auth += ':' + port
- # On Windows, ensure that '|', not ':', is used in a drivespec.
- if os.name == 'nt' and scheme == 'file':
- path = path.replace(':', '|', 1)
- # Note that we drop fragment, if any. See RFC 3986 sec. 3.5.
- uri = urlparse.urlunsplit((scheme, auth, path, query, None))
- return uri
- def BaseJoin(base, uriRef):
- """
- Merges a base URI reference with another URI reference, returning a
- new URI reference.
- It behaves exactly the same as Absolutize(), except the arguments
- are reversed, and it accepts any URI reference (even a relative URI)
- as the base URI. If the base has no scheme component, it is
- evaluated as if it did, and then the scheme component of the result
- is removed from the result, unless the uriRef had a scheme. Thus, if
- neither argument has a scheme component, the result won't have one.
- This function is named BaseJoin because it is very much like
- urllib.basejoin(), but it follows the current rfc3986 algorithms
- for path merging, dot segment elimination, and inheritance of query
- and fragment components.
- WARNING: This function exists for 2 reasons: (1) because of a need
- within the 4Suite repository to perform URI reference absolutization
- using base URIs that are stored (inappropriately) as absolute paths
- in the subjects of statements in the RDF model, and (2) because of
- a similar need to interpret relative repo paths in a 4Suite product
- setup.xml file as being relative to a path that can be set outside
- the document. When these needs go away, this function probably will,
- too, so it is not advisable to use it.
- """
- if IsAbsolute(base):
- return Absolutize(uriRef, base)
- else:
- dummyscheme = 'basejoin'
- res = Absolutize(uriRef, '%s:%s' % (dummyscheme, base))
- if IsAbsolute(uriRef):
- # scheme will be inherited from uriRef
- return res
- else:
- # no scheme in, no scheme out
- return res[len(dummyscheme)+1:]
- def RemoveDotSegments(path):
- """
- Supports Absolutize() by implementing the remove_dot_segments function
- described in RFC 3986 sec. 5.2. It collapses most of the '.' and '..'
- segments out of a path without eliminating empty segments. It is intended
- to be used during the path merging process and may not give expected
- results when used independently. Use NormalizePathSegments() or
- NormalizePathSegmentsInUri() if more general normalization is desired.
- semi-private because it is not for general use. I've implemented it
- using two segment stacks, as alluded to in the spec, rather than the
- explicit string-walking algorithm that would be too inefficient. (mbrown)
- """
- # return empty string if entire path is just "." or ".."
- if path == '.' or path == '..':
- return path[0:0] # preserves string type
- # remove all "./" or "../" segments at the beginning
- while path:
- if path[:2] == './':
- path = path[2:]
- elif path[:3] == '../':
- path = path[3:]
- else:
- break
- # We need to keep track of whether there was a leading slash,
- # because we're going to drop it in order to prevent our list of
- # segments from having an ambiguous empty first item when we call
- # split().
- leading_slash = 0
- if path[:1] == '/':
- path = path[1:]
- leading_slash = 1
- # replace a trailing "/." with just "/"
- if path[-2:] == '/.':
- path = path[:-1]
- # convert the segments into a list and process each segment in
- # order from left to right.
- segments = path.split('/')
- keepers = []
- segments.reverse()
- while segments:
- seg = segments.pop()
- # '..' means drop the previous kept segment, if any.
- # If none, and if the path is relative, then keep the '..'.
- # If the '..' was the last segment, ensure
- # that the result ends with '/'.
- if seg == '..':
- if keepers:
- keepers.pop()
- elif not leading_slash:
- keepers.append(seg)
- if not segments:
- keepers.append('')
- # ignore '.' segments and keep all others, even empty ones
- elif seg != '.':
- keepers.append(seg)
- # reassemble the kept segments
- return leading_slash * '/' + '/'.join(keepers)
- SCHEME_PATTERN = re.compile(r'([a-zA-Z][a-zA-Z0-9+\-.]*):')
- def GetScheme(uriRef):
- """
- Obtains, with optimum efficiency, just the scheme from a URI reference.
- Returns a string, or if no scheme could be found, returns None.
- """
- # Using a regex seems to be the best option. Called 50,000 times on
- # different URIs, on a 1.0-GHz PIII with FreeBSD 4.7 and Python
- # 2.2.1, this method completed in 0.95s, and 0.05s if there was no
- # scheme to find. By comparison,
- # urllib.splittype()[0] took 1.5s always;
- # Ft.Lib.Uri.SplitUriRef()[0] took 2.5s always;
- # urlparse.urlparse()[0] took 3.5s always.
- m = SCHEME_PATTERN.match(uriRef)
- if m is None:
- return None
- else:
- return m.group(1)
- def IsAbsolute(identifier):
- """
- Given a string believed to be a URI or URI reference, tests that it is
- absolute (as per RFC 2396), not relative -- i.e., that it has a scheme.
- """
- # We do it this way to avoid compiling another massive regex.
- return GetScheme(identifier) is not None