
/External.LCA_RESTRICTED/Languages/IronPython/27/Lib/urlparse.py

http://github.com/IronLanguages/main
Python | 428 lines | 405 code | 5 blank | 18 comment | 28 complexity
Possible License(s): CPL-1.0, BSD-3-Clause, ISC, GPL-2.0, MPL-2.0-no-copyleft-exception
  1. """Parse (absolute and relative) URLs.
  2. urlparse module is based upon the following RFC specifications.
  3. RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
  4. and L. Masinter, January 2005.
  5. RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
  6. and L.Masinter, December 1999.
  7. RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
  8. Berners-Lee, R. Fielding, and L. Masinter, August 1998.
  9. RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
  10. RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
  11. 1995.
  12. RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
  13. McCahill, December 1994
  14. RFC 3986 is considered the current standard and any future changes to
  15. urlparse module should conform with it. The urlparse module is
  16. currently not entirely compliant with this RFC due to defacto
  17. scenarios for parsing, and for backward compatibility purposes, some
  18. parsing quirks from older RFCs are retained. The testcases in
  19. test_urlparse.py provides a good indicator of parsing behavior.
  20. """
import re

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()
class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                port = int(port, 10)
                # verify legal port
                if (0 <= port <= 65535):
                    return port
        return None
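# Illustrative note (not part of the original module): the properties above
# only inspect self.netloc, so they behave the same on SplitResult and
# ParseResult.  A hedged sketch of the expected values for a made-up URL with
# userinfo, an IPv6 literal and an explicit port:
#
#   >>> u = urlsplit('http://alice:secret@[2001:db8::1]:8080/index')
#   >>> u.username, u.password
#   ('alice', 'secret')
#   >>> u.hostname, u.port
#   ('2001:db8::1', 8080)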
from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)

class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
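# Illustrative usage (not part of the original module; the URL below is a
# made-up example).  A sketch of how urlparse() splits out the six
# components, assuming the implementation above:
#
#   >>> urlparse('http://netloc/path;params?query=arg#frag')
#   ParseResult(scheme='http', netloc='netloc', path='/path',
#               params='params', query='query=arg', fragment='frag')
#
# (repr wrapped here for readability; it prints on one line)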
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':                    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
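# Illustrative usage (not part of the original module; the URL is a made-up
# example).  urlsplit() is like urlparse() but never splits off the params
# component:
#
#   >>> urlsplit('http://www.example.com/index.html?q=python#top')
#   SplitResult(scheme='http', netloc='www.example.com',
#               path='/index.html', query='q=python', fragment='top')
#
# (repr wrapped here for readability; it prints on one line)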
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.  The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
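# Illustrative usage (not part of the original module).  A hedged sketch of
# the round-trip behaviour described in the docstrings above, using made-up
# component values:
#
#   >>> urlunsplit(('https', 'www.example.com', '/a/b', 'q=1', 'frag'))
#   'https://www.example.com/a/b?q=1#frag'
#   >>> urlunparse(urlparse('http://example.com/a;p?q#f'))
#   'http://example.com/a;p?q#f'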
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
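# Illustrative usage (not part of the original module; base and relative
# URLs are made-up examples).  Expected results for this implementation:
#
#   >>> urljoin('http://www.example.com/dir/page.html', 'other.html')
#   'http://www.example.com/dir/other.html'
#   >>> urljoin('http://a/b/c/d;p?q', '../g')
#   'http://a/b/g'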
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
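# Illustrative usage (not part of the original module; example URLs are
# made up):
#
#   >>> urldefrag('http://example.com/page.html#section2')
#   ('http://example.com/page.html', 'section2')
#   >>> urldefrag('http://example.com/page.html')
#   ('http://example.com/page.html', '')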
try:
    unicode
except NameError:
    def _is_unicode(x):
        return 0
else:
    def _is_unicode(x):
        return isinstance(x, unicode)

# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.

_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)
_asciire = re.compile('([\x00-\x7f]+)')
def unquote(s, recurse=False):
    """unquote('abc%20def') -> 'abc def'."""
    if _is_unicode(s) and not recurse:
        if '%' not in s:
            return s
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i]), True).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            append('%')
            append(item)
    return ''.join(res)
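# Illustrative usage (not part of the original module).  Percent escapes that
# are not followed by two hex digits are passed through unchanged rather than
# raising an error:
#
#   >>> unquote('abc%20def')
#   'abc def'
#   >>> unquote('100%')
#   '100%'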
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict
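# Illustrative usage (not part of the original module; the query strings are
# made-up examples).  Repeated names accumulate into a list, and blank values
# are only kept when keep_blank_values is true:
#
#   >>> parse_qs('a=1&a=2&b=3')['a']
#   ['1', '2']
#   >>> parse_qs('a=&b=2', keep_blank_values=1)['a']
#   ['']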
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError, "bad query field: %r" % (name_value,)
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))
    return r
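# Illustrative usage (not part of the original module; the query string is a
# made-up example).  Both '&' and ';' are accepted as pair separators:
#
#   >>> parse_qsl('a=1&a=2;b=3')
#   [('a', '1'), ('a', '2'), ('b', '3')]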