PageRenderTime 51ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/python/lib/Lib/xml/Uri.py

http://github.com/JetBrains/intellij-community
Python | 380 lines | 291 code | 15 blank | 74 comment | 42 complexity | 4579b0837a49e0dbd57dd89412706c39 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0, MPL-2.0-no-copyleft-exception, MIT, EPL-1.0, AGPL-1.0
  1. # pylint: disable-msg=C0103
  2. #
  3. # backported code from 4Suite with slight modifications, started from r1.89 of
  4. # Ft/Lib/Uri.py, by syt@logilab.fr on 2005-02-09
  5. #
  6. # part, if not all, of this code should probably move to urlparse (or be used
  7. # to fix some existing functions in that module)
  8. #
  9. #
  10. # Copyright 2004 Fourthought, Inc. (USA).
  11. # Detailed license and copyright information: http://4suite.org/COPYRIGHT
  12. # Project home, documentation, distributions: http://4suite.org/
  13. import os.path
  14. import sys
  15. import re
  16. import urlparse, urllib, urllib2
  17. def UnsplitUriRef(uriRefSeq):
  18. """should replace urlparse.urlunsplit
  19. Given a sequence as would be produced by SplitUriRef(), assembles and
  20. returns a URI reference as a string.
  21. """
  22. if not isinstance(uriRefSeq, (tuple, list)):
  23. raise TypeError("sequence expected, got %s" % type(uriRefSeq))
  24. (scheme, authority, path, query, fragment) = uriRefSeq
  25. uri = ''
  26. if scheme is not None:
  27. uri += scheme + ':'
  28. if authority is not None:
  29. uri += '//' + authority
  30. uri += path
  31. if query is not None:
  32. uri += '?' + query
  33. if fragment is not None:
  34. uri += '#' + fragment
  35. return uri
  36. SPLIT_URI_REF_PATTERN = re.compile(r"^(?:(?P<scheme>[^:/?#]+):)?(?://(?P<authority>[^/?#]*))?(?P<path>[^?#]*)(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
  37. def SplitUriRef(uriref):
  38. """should replace urlparse.urlsplit
  39. Given a valid URI reference as a string, returns a tuple representing the
  40. generic URI components, as per RFC 2396 appendix B. The tuple's structure
  41. is (scheme, authority, path, query, fragment).
  42. All values will be strings (possibly empty) or None if undefined.
  43. Note that per rfc3986, there is no distinction between a path and
  44. an "opaque part", as there was in RFC 2396.
  45. """
  46. # the pattern will match every possible string, so it's safe to
  47. # assume there's a groupdict method to call.
  48. g = SPLIT_URI_REF_PATTERN.match(uriref).groupdict()
  49. scheme = g['scheme']
  50. authority = g['authority']
  51. path = g['path']
  52. query = g['query']
  53. fragment = g['fragment']
  54. return (scheme, authority, path, query, fragment)
  55. def Absolutize(uriRef, baseUri):
  56. """
  57. Resolves a URI reference to absolute form, effecting the result of RFC
  58. 3986 section 5. The URI reference is considered to be relative to the
  59. given base URI.
  60. It is the caller's responsibility to ensure that the base URI matches
  61. the absolute-URI syntax rule of RFC 3986, and that its path component
  62. does not contain '.' or '..' segments if the scheme is hierarchical.
  63. Unexpected results may occur otherwise.
  64. This function only conducts a minimal sanity check in order to determine
  65. if relative resolution is possible: it raises a UriException if the base
  66. URI does not have a scheme component. While it is true that the base URI
  67. is irrelevant if the URI reference has a scheme, an exception is raised
  68. in order to signal that the given string does not even come close to
  69. meeting the criteria to be usable as a base URI.
  70. It is the caller's responsibility to make a determination of whether the
  71. URI reference constitutes a "same-document reference", as defined in RFC
  72. 2396 or RFC 3986. As per the spec, dereferencing a same-document
  73. reference "should not" involve retrieval of a new representation of the
  74. referenced resource. Note that the two specs have different definitions
  75. of same-document reference: RFC 2396 says it is *only* the cases where the
  76. reference is the empty string, or "#" followed by a fragment; RFC 3986
  77. requires making a comparison of the base URI to the absolute form of the
  78. reference (as is returned by the spec), minus its fragment component,
  79. if any.
  80. This function is similar to urlparse.urljoin() and urllib.basejoin().
  81. Those functions, however, are (as of Python 2.3) outdated, buggy, and/or
  82. designed to produce results acceptable for use with other core Python
  83. libraries, rather than being earnest implementations of the relevant
  84. specs. Their problems are most noticeable in their handling of
  85. same-document references and 'file:' URIs, both being situations that
  86. come up far too often to consider the functions reliable enough for
  87. general use.
  88. """
  89. # Reasons to avoid using urllib.basejoin() and urlparse.urljoin():
  90. # - Both are partial implementations of long-obsolete specs.
  91. # - Both accept relative URLs as the base, which no spec allows.
  92. # - urllib.basejoin() mishandles the '' and '..' references.
  93. # - If the base URL uses a non-hierarchical or relative path,
  94. # or if the URL scheme is unrecognized, the result is not
  95. # always as expected (partly due to issues in RFC 1808).
  96. # - If the authority component of a 'file' URI is empty,
  97. # the authority component is removed altogether. If it was
  98. # not present, an empty authority component is in the result.
  99. # - '.' and '..' segments are not always collapsed as well as they
  100. # should be (partly due to issues in RFC 1808).
  101. # - Effective Python 2.4, urllib.basejoin() *is* urlparse.urljoin(),
  102. # but urlparse.urljoin() is still based on RFC 1808.
  103. # This procedure is based on the pseudocode in RFC 3986 sec. 5.2.
  104. #
  105. # ensure base URI is absolute
  106. if not baseUri:
  107. raise ValueError('baseUri is required and must be a non empty string')
  108. if not IsAbsolute(baseUri):
  109. raise ValueError('%r is not an absolute URI' % baseUri)
  110. # shortcut for the simplest same-document reference cases
  111. if uriRef == '' or uriRef[0] == '#':
  112. return baseUri.split('#')[0] + uriRef
  113. # ensure a clean slate
  114. tScheme = tAuth = tPath = tQuery = None
  115. # parse the reference into its components
  116. (rScheme, rAuth, rPath, rQuery, rFrag) = SplitUriRef(uriRef)
  117. # if the reference is absolute, eliminate '.' and '..' path segments
  118. # and skip to the end
  119. if rScheme is not None:
  120. tScheme = rScheme
  121. tAuth = rAuth
  122. tPath = RemoveDotSegments(rPath)
  123. tQuery = rQuery
  124. else:
  125. # the base URI's scheme, and possibly more, will be inherited
  126. (bScheme, bAuth, bPath, bQuery, bFrag) = SplitUriRef(baseUri)
  127. # if the reference is a net-path, just eliminate '.' and '..' path
  128. # segments; no other changes needed.
  129. if rAuth is not None:
  130. tAuth = rAuth
  131. tPath = RemoveDotSegments(rPath)
  132. tQuery = rQuery
  133. # if it's not a net-path, we need to inherit pieces of the base URI
  134. else:
  135. # use base URI's path if the reference's path is empty
  136. if not rPath:
  137. tPath = bPath
  138. # use the reference's query, if any, or else the base URI's,
  139. tQuery = rQuery is not None and rQuery or bQuery
  140. # the reference's path is not empty
  141. else:
  142. # just use the reference's path if it's absolute
  143. if rPath[0] == '/':
  144. tPath = RemoveDotSegments(rPath)
  145. # merge the reference's relative path with the base URI's path
  146. else:
  147. if bAuth is not None and not bPath:
  148. tPath = '/' + rPath
  149. else:
  150. tPath = bPath[:bPath.rfind('/')+1] + rPath
  151. tPath = RemoveDotSegments(tPath)
  152. # use the reference's query
  153. tQuery = rQuery
  154. # since the reference isn't a net-path,
  155. # use the authority from the base URI
  156. tAuth = bAuth
  157. # inherit the scheme from the base URI
  158. tScheme = bScheme
  159. # always use the reference's fragment (but no need to define another var)
  160. #tFrag = rFrag
  161. # now compose the target URI (RFC 3986 sec. 5.3)
  162. return UnsplitUriRef((tScheme, tAuth, tPath, tQuery, rFrag))
# Matches a host string made up solely of ASCII letters, digits, the listed
# punctuation characters and/or percent-encoded octets (%XX); the empty
# string also matches. A host containing any other character, such as the
# '[', ']' or ':' of a bracketed IP-literal, does not match.
REG_NAME_HOST_PATTERN = re.compile(r"^(?:(?:[0-9A-Za-z\-_\.!~*'();&=+$,]|(?:%[0-9A-Fa-f]{2}))*)$")
  164. def MakeUrllibSafe(uriRef):
  165. """
  166. Makes the given RFC 3986-conformant URI reference safe for passing
  167. to legacy urllib functions. The result may not be a valid URI.
  168. As of Python 2.3.3, urllib.urlopen() does not fully support
  169. internationalized domain names, it does not strip fragment components,
  170. and on Windows, it expects file URIs to use '|' instead of ':' in the
  171. path component corresponding to the drivespec. It also relies on
  172. urllib.unquote(), which mishandles unicode arguments. This function
  173. produces a URI reference that will work around these issues, although
  174. the IDN workaround is limited to Python 2.3 only. May raise a
  175. UnicodeEncodeError if the URI reference is Unicode and erroneously
  176. contains non-ASCII characters.
  177. """
  178. # IDN support requires decoding any percent-encoded octets in the
  179. # host part (if it's a reg-name) of the authority component, and when
  180. # doing DNS lookups, applying IDNA encoding to that string first.
  181. # As of Python 2.3, there is an IDNA codec, and the socket and httplib
  182. # modules accept Unicode strings and apply IDNA encoding automatically
  183. # where necessary. However, urllib.urlopen() has not yet been updated
  184. # to do the same; it raises an exception if you give it a Unicode
  185. # string, and does no conversion on non-Unicode strings, meaning you
  186. # have to give it an IDNA string yourself. We will only support it on
  187. # Python 2.3 and up.
  188. #
  189. # see if host is a reg-name, as opposed to IPv4 or IPv6 addr.
  190. if isinstance(uriRef, unicode):
  191. try:
  192. uriRef = uriRef.encode('us-ascii') # parts of urllib are not unicode safe
  193. except UnicodeError:
  194. raise ValueError("uri %r must consist of ASCII characters." % uriRef)
  195. (scheme, auth, path, query, frag) = urlparse.urlsplit(uriRef)
  196. if auth and auth.find('@') > -1:
  197. userinfo, hostport = auth.split('@')
  198. else:
  199. userinfo = None
  200. hostport = auth
  201. if hostport and hostport.find(':') > -1:
  202. host, port = hostport.split(':')
  203. else:
  204. host = hostport
  205. port = None
  206. if host and REG_NAME_HOST_PATTERN.match(host):
  207. # percent-encoded hostnames will always fail DNS lookups
  208. host = urllib.unquote(host) #PercentDecode(host)
  209. # IDNA-encode if possible.
  210. # We shouldn't do this for schemes that don't need DNS lookup,
  211. # but are there any (that you'd be calling urlopen for)?
  212. if sys.version_info[0:2] >= (2, 3):
  213. if isinstance(host, str):
  214. host = host.decode('utf-8')
  215. host = host.encode('idna')
  216. # reassemble the authority with the new hostname
  217. # (percent-decoded, and possibly IDNA-encoded)
  218. auth = ''
  219. if userinfo:
  220. auth += userinfo + '@'
  221. auth += host
  222. if port:
  223. auth += ':' + port
  224. # On Windows, ensure that '|', not ':', is used in a drivespec.
  225. if os.name == 'nt' and scheme == 'file':
  226. path = path.replace(':', '|', 1)
  227. # Note that we drop fragment, if any. See RFC 3986 sec. 3.5.
  228. uri = urlparse.urlunsplit((scheme, auth, path, query, None))
  229. return uri
  230. def BaseJoin(base, uriRef):
  231. """
  232. Merges a base URI reference with another URI reference, returning a
  233. new URI reference.
  234. It behaves exactly the same as Absolutize(), except the arguments
  235. are reversed, and it accepts any URI reference (even a relative URI)
  236. as the base URI. If the base has no scheme component, it is
  237. evaluated as if it did, and then the scheme component of the result
  238. is removed from the result, unless the uriRef had a scheme. Thus, if
  239. neither argument has a scheme component, the result won't have one.
  240. This function is named BaseJoin because it is very much like
  241. urllib.basejoin(), but it follows the current rfc3986 algorithms
  242. for path merging, dot segment elimination, and inheritance of query
  243. and fragment components.
  244. WARNING: This function exists for 2 reasons: (1) because of a need
  245. within the 4Suite repository to perform URI reference absolutization
  246. using base URIs that are stored (inappropriately) as absolute paths
  247. in the subjects of statements in the RDF model, and (2) because of
  248. a similar need to interpret relative repo paths in a 4Suite product
  249. setup.xml file as being relative to a path that can be set outside
  250. the document. When these needs go away, this function probably will,
  251. too, so it is not advisable to use it.
  252. """
  253. if IsAbsolute(base):
  254. return Absolutize(uriRef, base)
  255. else:
  256. dummyscheme = 'basejoin'
  257. res = Absolutize(uriRef, '%s:%s' % (dummyscheme, base))
  258. if IsAbsolute(uriRef):
  259. # scheme will be inherited from uriRef
  260. return res
  261. else:
  262. # no scheme in, no scheme out
  263. return res[len(dummyscheme)+1:]
  264. def RemoveDotSegments(path):
  265. """
  266. Supports Absolutize() by implementing the remove_dot_segments function
  267. described in RFC 3986 sec. 5.2. It collapses most of the '.' and '..'
  268. segments out of a path without eliminating empty segments. It is intended
  269. to be used during the path merging process and may not give expected
  270. results when used independently. Use NormalizePathSegments() or
  271. NormalizePathSegmentsInUri() if more general normalization is desired.
  272. semi-private because it is not for general use. I've implemented it
  273. using two segment stacks, as alluded to in the spec, rather than the
  274. explicit string-walking algorithm that would be too inefficient. (mbrown)
  275. """
  276. # return empty string if entire path is just "." or ".."
  277. if path == '.' or path == '..':
  278. return path[0:0] # preserves string type
  279. # remove all "./" or "../" segments at the beginning
  280. while path:
  281. if path[:2] == './':
  282. path = path[2:]
  283. elif path[:3] == '../':
  284. path = path[3:]
  285. else:
  286. break
  287. # We need to keep track of whether there was a leading slash,
  288. # because we're going to drop it in order to prevent our list of
  289. # segments from having an ambiguous empty first item when we call
  290. # split().
  291. leading_slash = 0
  292. if path[:1] == '/':
  293. path = path[1:]
  294. leading_slash = 1
  295. # replace a trailing "/." with just "/"
  296. if path[-2:] == '/.':
  297. path = path[:-1]
  298. # convert the segments into a list and process each segment in
  299. # order from left to right.
  300. segments = path.split('/')
  301. keepers = []
  302. segments.reverse()
  303. while segments:
  304. seg = segments.pop()
  305. # '..' means drop the previous kept segment, if any.
  306. # If none, and if the path is relative, then keep the '..'.
  307. # If the '..' was the last segment, ensure
  308. # that the result ends with '/'.
  309. if seg == '..':
  310. if keepers:
  311. keepers.pop()
  312. elif not leading_slash:
  313. keepers.append(seg)
  314. if not segments:
  315. keepers.append('')
  316. # ignore '.' segments and keep all others, even empty ones
  317. elif seg != '.':
  318. keepers.append(seg)
  319. # reassemble the kept segments
  320. return leading_slash * '/' + '/'.join(keepers)
  321. SCHEME_PATTERN = re.compile(r'([a-zA-Z][a-zA-Z0-9+\-.]*):')
  322. def GetScheme(uriRef):
  323. """
  324. Obtains, with optimum efficiency, just the scheme from a URI reference.
  325. Returns a string, or if no scheme could be found, returns None.
  326. """
  327. # Using a regex seems to be the best option. Called 50,000 times on
  328. # different URIs, on a 1.0-GHz PIII with FreeBSD 4.7 and Python
  329. # 2.2.1, this method completed in 0.95s, and 0.05s if there was no
  330. # scheme to find. By comparison,
  331. # urllib.splittype()[0] took 1.5s always;
  332. # Ft.Lib.Uri.SplitUriRef()[0] took 2.5s always;
  333. # urlparse.urlparse()[0] took 3.5s always.
  334. m = SCHEME_PATTERN.match(uriRef)
  335. if m is None:
  336. return None
  337. else:
  338. return m.group(1)
  339. def IsAbsolute(identifier):
  340. """
  341. Given a string believed to be a URI or URI reference, tests that it is
  342. absolute (as per RFC 2396), not relative -- i.e., that it has a scheme.
  343. """
  344. # We do it this way to avoid compiling another massive regex.
  345. return GetScheme(identifier) is not None