PageRenderTime 40ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/feeds/urlnorm.py

https://github.com/e1ven/Lonava
Python | 256 lines | 228 code | 12 blank | 16 comment | 6 complexity | 778ceac246bf63cd0af5dc0d9d876569 MD5 | raw file
"""
URI Normalization function:
 * Always provide the URI scheme in lowercase characters.
 * Always provide the host, if any, in lowercase characters.
 * Only perform percent-encoding where it is essential.
 * Always use uppercase A-through-F characters when percent-encoding.
 * Prevent dot-segments appearing in non-relative URI paths.
 * For schemes that define a default authority, use an empty authority if the
   default is desired.
 * For schemes that define an empty path to be equivalent to a path of "/",
   use "/".
 * For schemes that define a port, use an empty port if the default is desired
 * All portions of the URI must be utf-8 encoded NFC from Unicode strings

implements:
  http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
  http://www.intertwingly.net/wiki/pie/PaceCanonicalIds

inspired by:
  Tony J. Ibbs,    http://starship.python.net/crew/tibs/python/tji_url.py
  Mark Nottingham, http://www.mnot.net/python/urlnorm.py
"""
  21. __license__ = "Python"
  22. import re, unicodedata, urlparse
  23. from urllib import quote, unquote
  24. import urllib2
  25. from BeautifulSoup import BeautifulSoup
  26. import socket
  27. default_port = {
  28. 'ftp': 21,
  29. 'telnet': 23,
  30. 'http': 80,
  31. 'gopher': 70,
  32. 'news': 119,
  33. 'nntp': 119,
  34. 'prospero': 191,
  35. 'https': 443,
  36. 'snews': 563,
  37. 'snntp': 563,
  38. }
  39. def normalize(url):
  40. # timeout in seconds
  41. timeout = 10
  42. socket.setdefaulttimeout(timeout)
  43. """Normalize a URL."""
  44. hasprotocol = False
  45. for i in default_port.keys():
  46. if url.startswith(i):
  47. hasprotocol = True
  48. if hasprotocol == False:
  49. url = "http://" + url
  50. # Add TinyURL lookup, plus fix FeedburnerURLs
  51. try:
  52. headers = { 'User-Agent' : 'LonBot/1.0 +http://Lonava.com/' }
  53. o = urllib2.build_opener( urllib2.HTTPCookieProcessor() )
  54. urllib2.install_opener( o )
  55. req = urllib2.Request(url,None, headers)
  56. fp = urllib2.urlopen(req)
  57. url = fp.geturl()
  58. html = fp.read()
  59. try:
  60. soup = BeautifulSoup(html)
  61. links = soup.findAll('link')
  62. for a in links:
  63. for attr, value in a.attrs:
  64. if attr == "rel" and value == "canonical":
  65. print "Found Canonical URL: " + a['href']
  66. # Youtube is returning stupid canonical links
  67. for i in default_port.keys():
  68. if a['href'].startswith(i):
  69. hasprotocol = True
  70. return a['href']
  71. parsed = urlparse(url)
  72. if hasprotocol == False:
  73. print "Bad URL. Youtube likely. Appending hostname."
  74. return "http://" + parsed.hostname + a['href']
  75. except:
  76. url = fp.geturl()
  77. except:
  78. print "Possibly bad url; Doing my best."
  79. scheme,auth,path,query,fragment = urlparse.urlsplit(url.strip())
  80. (userinfo,host,port)=re.search('([^@]*@)?([^:]*):?(.*)',auth).groups()
  81. # Always provide the URI scheme in lowercase characters.
  82. scheme = scheme.lower()
  83. # Always provide the host, if any, in lowercase characters.
  84. host = host.lower()
  85. if host and host[-1] == '.': host = host[:-1]
  86. # Only perform percent-encoding where it is essential.
  87. # Always use uppercase A-through-F characters when percent-encoding.
  88. # All portions of the URI must be utf-8 encoded NFC from Unicode strings
  89. def clean(string):
  90. string=unicode(unquote(string),'utf-8','replace')
  91. return unicodedata.normalize('NFC',string).encode('utf-8')
  92. path=quote(clean(path),"~:/?#[]@!$&'()*+,;=")
  93. fragment=quote(clean(fragment),"~")
  94. # note care must be taken to only encode & and = characters as values
  95. query="&".join(["=".join([quote(clean(t) ,"~:/?#[]@!$'()*+,;=")
  96. for t in q.split("=",1)]) for q in query.split("&")])
  97. # Prevent dot-segments appearing in non-relative URI paths.
  98. if scheme in ["","http","https","ftp","file"]:
  99. output=[]
  100. for input in path.split('/'):
  101. if input=="":
  102. if not output: output.append(input)
  103. elif input==".":
  104. pass
  105. elif input=="..":
  106. if len(output)>1: output.pop()
  107. else:
  108. output.append(input)
  109. if input in ["",".",".."]: output.append("")
  110. path='/'.join(output)
  111. # For schemes that define a default authority, use an empty authority if
  112. # the default is desired.
  113. if userinfo in ["@",":@"]: userinfo=""
  114. # For schemes that define an empty path to be equivalent to a path of "/",
  115. # use "/".
  116. if path=="" and scheme in ["http","https","ftp","file"]:
  117. path="/"
  118. # For schemes that define a port, use an empty port if the default is
  119. # desired
  120. if port and scheme in default_port.keys():
  121. if port.isdigit():
  122. port=str(int(port))
  123. if int(port)==default_port[scheme]:
  124. port = ''
  125. # Put it all back together again
  126. auth=(userinfo or "") + host
  127. if port: auth+=":"+port
  128. if url.endswith("#") and query=="" and fragment=="": path+="#"
  129. return urlparse.urlunsplit((scheme,auth,path,query,fragment))
  130. if __name__ == "__main__":
  131. import unittest
  132. suite = unittest.TestSuite()
  133. """ from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """
  134. tests= [
  135. (False, "http://:@example.com/"),
  136. (False, "http://@example.com/"),
  137. (False, "http://example.com"),
  138. (False, "HTTP://example.com/"),
  139. (False, "http://EXAMPLE.COM/"),
  140. (False, "http://example.com/%7Ejane"),
  141. (False, "http://example.com/?q=%C7"),
  142. (False, "http://example.com/?q=%5c"),
  143. (False, "http://example.com/?q=C%CC%A7"),
  144. (False, "http://example.com/a/../a/b"),
  145. (False, "http://example.com/a/./b"),
  146. (False, "http://example.com:80/"),
  147. (True, "http://example.com/"),
  148. (True, "http://example.com/?q=%C3%87"),
  149. (True, "http://example.com/?q=%E2%85%A0"),
  150. (True, "http://example.com/?q=%5C"),
  151. (True, "http://example.com/~jane"),
  152. (True, "http://example.com/a/b"),
  153. (True, "http://example.com:8080/"),
  154. (True, "http://user:password@example.com/"),
  155. # from rfc2396bis
  156. (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
  157. (True, "http://www.ietf.org/rfc/rfc2396.txt"),
  158. (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
  159. (True, "mailto:John.Doe@example.com"),
  160. (True, "news:comp.infosystems.www.servers.unix"),
  161. (True, "tel:+1-816-555-1212"),
  162. (True, "telnet://192.0.2.16:80/"),
  163. (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),
  164. # other
  165. (True, "http://127.0.0.1/"),
  166. (False, "http://127.0.0.1:80/"),
  167. (True, "http://www.w3.org/2000/01/rdf-schema#"),
  168. (False, "http://example.com:081/"),
  169. ]
  170. def testcase(expected,value):
  171. class test(unittest.TestCase):
  172. def runTest(self):
  173. assert (normalize(value)==value)==expected, \
  174. (expected, value, normalize(value))
  175. return test()
  176. for (expected,value) in tests:
  177. suite.addTest(testcase(expected,value))
  178. """ mnot test suite; three tests updated for rfc2396bis. """
  179. tests = {
  180. '/foo/bar/.': '/foo/bar/',
  181. '/foo/bar/./': '/foo/bar/',
  182. '/foo/bar/..': '/foo/',
  183. '/foo/bar/../': '/foo/',
  184. '/foo/bar/../baz': '/foo/baz',
  185. '/foo/bar/../..': '/',
  186. '/foo/bar/../../': '/',
  187. '/foo/bar/../../baz': '/baz',
  188. '/foo/bar/../../../baz': '/baz', #was: '/../baz',
  189. '/foo/bar/../../../../baz': '/baz',
  190. '/./foo': '/foo',
  191. '/../foo': '/foo', #was: '/../foo',
  192. '/foo.': '/foo.',
  193. '/.foo': '/.foo',
  194. '/foo..': '/foo..',
  195. '/..foo': '/..foo',
  196. '/./../foo': '/foo', #was: '/../foo',
  197. '/./foo/.': '/foo/',
  198. '/foo/./bar': '/foo/bar',
  199. '/foo/../bar': '/bar',
  200. '/foo//': '/foo/',
  201. '/foo///bar//': '/foo/bar/',
  202. 'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
  203. 'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
  204. 'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
  205. 'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
  206. 'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
  207. 'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
  208. 'ftp://user:pass@ftp.foo.net/foo/bar':
  209. 'ftp://user:pass@ftp.foo.net/foo/bar',
  210. 'http://USER:pass@www.Example.COM/foo/bar':
  211. 'http://USER:pass@www.example.com/foo/bar',
  212. 'http://www.example.com./': 'http://www.example.com/',
  213. '-': '-',
  214. }
  215. def testcase(original,normalized):
  216. class test(unittest.TestCase):
  217. def runTest(self):
  218. assert normalize(original)==normalized, \
  219. (original, normalized, normalize(original))
  220. return test()
  221. for (original,normalized) in tests.items():
  222. suite.addTest(testcase(original,normalized))
  223. """ execute tests """
  224. unittest.TextTestRunner().run(suite)