PageRenderTime 40ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/feedvalidator/uri.py

https://github.com/dh-benamor/restful-openerp
Python | 238 lines | 198 code | 29 blank | 11 comment | 23 complexity | e8fe005314960803609050d918e6c005 MD5 | raw file
  1. """$Id$"""
  2. """
  3. Code to test URI references for validity, and give their normalized
  4. form, according to RFC 3986.
  5. """
  6. __author__ = "Joseph Walton <http://www.kafsemo.org/>"
  7. __version__ = "$Revision$"
  8. __copyright__ = "Copyright (c) 2004, 2007 Joseph Walton"
  9. from urlparse import urljoin
  10. from urllib import quote, quote_plus, unquote, unquote_plus
  11. from unicodedata import normalize
  12. from codecs import lookup
  13. import re
  14. (enc, dec) = lookup('UTF-8')[:2]
  15. SUBDELIMS='!$&\'()*+,;='
  16. PCHAR='-._~' + SUBDELIMS + ':@'
  17. GENDELIMS=':/?#[]@'
  18. RESERVED=GENDELIMS + SUBDELIMS
  19. default_port = {
  20. 'ftp': 21,
  21. 'telnet': 23,
  22. 'http': 80,
  23. 'gopher': 70,
  24. 'news': 119,
  25. 'nntp': 119,
  26. 'prospero': 191,
  27. 'https': 443,
  28. 'snews': 563,
  29. 'snntp': 563,
  30. }
  31. class BadUri(Exception):
  32. pass
  33. def _n(s):
  34. return enc(normalize('NFC', dec(s)[0]))[0]
  35. octetRe = re.compile('([^%]|%[a-fA-F0-9]{2})')
  36. def asOctets(s):
  37. while (s):
  38. m = octetRe.match(s)
  39. if not(m):
  40. raise BadUri()
  41. c = m.group(1)
  42. if (c[0] == '%'):
  43. yield(c.upper(), chr(int(c[1:], 0x10)))
  44. else:
  45. yield(c, c)
  46. s = s[m.end(1):]
  47. def _qnu(s,safe=''):
  48. if s == None:
  49. return None
  50. # unquote{,_plus} leave high-bit octets unconverted in Unicode strings
  51. # This conversion will, correctly, cause UnicodeEncodeError if there are
  52. # non-ASCII characters present in the string
  53. s = str(s)
  54. res = ''
  55. b = ''
  56. for (c,x) in asOctets(s):
  57. if x in RESERVED and x in safe:
  58. res += quote(_n(unquote(b)), safe)
  59. b = ''
  60. res += c
  61. else:
  62. b += x
  63. res += quote(_n(unquote(b)), safe)
  64. return res
  65. # Match an optional port specification
  66. portRe = re.compile(':(\d*)$')
  67. def _normPort(netloc,defPort):
  68. nl = netloc.lower()
  69. p = defPort
  70. m = portRe.search(nl)
  71. if m:
  72. if m.group(1) != '':
  73. p = int(m.group(1))
  74. nl = nl[:m.start(1) - 1]
  75. if nl and nl[-1] == '.' and nl.rfind('.', 0, -2) >= 0:
  76. nl = nl[:-1]
  77. # Square brackets are allowed, and only allowed, delimiting IPv6 addresses
  78. if nl.startswith('[') != nl.endswith(']'):
  79. raise BadUri()
  80. if p != defPort:
  81. nl = nl + ':' + str(p)
  82. return nl
  83. def _normAuth(auth,port):
  84. i = auth.rfind('@')
  85. if i >= 0:
  86. c = auth[:i]
  87. if c == ':':
  88. c = ''
  89. h = auth[i + 1:]
  90. else:
  91. c = None
  92. h = auth
  93. if c:
  94. return c + '@' + _normPort(h,port)
  95. else:
  96. return _normPort(h,port)
  97. def _normPath(p):
  98. l = p.split(u'/')
  99. i = 0
  100. if l and l[0]:
  101. i = len(l)
  102. while i < len(l):
  103. c = l[i]
  104. if (c == '.'):
  105. if i < len(l) - 1:
  106. del l[i]
  107. else:
  108. l[i] = ''
  109. elif (c == '..'):
  110. if i < len(l) - 1:
  111. del l[i]
  112. else:
  113. l[i] = ''
  114. if i > 1 or (i > 0 and l[0]):
  115. i -= 1
  116. del l[i]
  117. else:
  118. i += 1
  119. if l == ['']:
  120. l = ['', '']
  121. return u'/'.join([_qnu(c, PCHAR) for c in l])
  122. # From RFC 2396bis, with added end-of-string marker
  123. uriRe = re.compile('^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$')
  124. def _canonical(s):
  125. m = uriRe.match(s)
  126. if not(m):
  127. raise BadUri()
  128. # Check for a relative URI
  129. if m.group(2) is None:
  130. scheme = None
  131. else:
  132. scheme = m.group(2).lower()
  133. if m.group(4) is None:
  134. authority = None
  135. p = m.group(5)
  136. # Don't try to normalise URI references with relative paths
  137. if scheme is None and not p.startswith('/'):
  138. return None
  139. if scheme == 'mailto':
  140. # XXX From RFC 2368, mailto equivalence needs to be subtler than this
  141. i = p.find('@')
  142. if i > 0:
  143. j = p.find('?')
  144. if j < 0:
  145. j = len(p)
  146. p = _qnu(p[:i]) + '@' + _qnu(p[i + 1:].lower()) + _qnu(p[j:])
  147. path = p
  148. else:
  149. if scheme is None or p.startswith('/'):
  150. path = _normPath(p)
  151. else:
  152. path = _qnu(p, PCHAR + '/')
  153. else:
  154. a = m.group(4)
  155. p = m.group(5)
  156. if scheme in default_port:
  157. a = _normAuth(a, default_port[scheme])
  158. else:
  159. a = _normAuth(a, None)
  160. authority = a
  161. path = _normPath(p)
  162. query = _qnu(m.group(7), PCHAR + "/?")
  163. fragment = _qnu(m.group(9), PCHAR + "/?")
  164. s = u''
  165. if scheme != None:
  166. s += scheme + ':'
  167. if authority != None:
  168. s += '//' + authority
  169. s += path
  170. if query != None:
  171. s += '?' + query
  172. if fragment != None:
  173. s += '#' + fragment
  174. return s
  175. class Uri:
  176. """A Uri wraps a string and performs equality testing according to the
  177. rules for URI equivalence. """
  178. def __init__(self,s):
  179. self.s = s
  180. self.n = _canonical(s)
  181. def __str__(self):
  182. return self.s
  183. def __repr__(self):
  184. return repr(self.s)
  185. def __eq__(self, a):
  186. return self.n == a.n
  187. def canonicalForm(u):
  188. """Give the canonical form for a URI, so char-by-char comparisons become valid tests for equivalence."""
  189. try:
  190. return _canonical(u)
  191. except BadUri:
  192. return None
  193. except UnicodeError:
  194. return None