/feeds/urlnorm.py
Python | 256 lines | 228 code | 12 blank | 16 comment | 6 complexity | 778ceac246bf63cd0af5dc0d9d876569 MD5 | raw file
- """
- URI Normalization function:
- * Always provide the URI scheme in lowercase characters.
- * Always provide the host, if any, in lowercase characters.
- * Only perform percent-encoding where it is essential.
- * Always use uppercase A-through-F characters when percent-encoding.
- * Prevent dot-segments appearing in non-relative URI paths.
- * For schemes that define a default authority, use an empty authority if the
- default is desired.
- * For schemes that define an empty path to be equivalent to a path of "/",
- use "/".
- * For schemes that define a port, use an empty port if the default is desired
- * All portions of the URI must be utf-8 encoded NFC from Unicode strings
- implements:
- http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
- http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
- inspired by:
- Tony J. Ibbs, http://starship.python.net/crew/tibs/python/tji_url.py
- Mark Nottingham, http://www.mnot.net/python/urlnorm.py
- """
- __license__ = "Python"
- import re, unicodedata, urlparse
- from urllib import quote, unquote
- import urllib2
- from BeautifulSoup import BeautifulSoup
- import socket
# Well-known default port for each URI scheme that defines one.  normalize()
# drops an explicit port from the authority when it equals the entry here.
default_port = dict(
    ftp=21,
    telnet=23,
    gopher=70,
    http=80,
    news=119,
    nntp=119,
    prospero=191,
    https=443,
    snews=563,
    snntp=563,
)
def _has_known_scheme(url):
    """Return True when *url* starts with ``<scheme>:`` for a scheme in default_port.

    The scheme is compared case-insensitively and must be followed by a
    colon, so host names such as ``ftp.example.com`` are not mistaken for
    URLs that already carry a scheme, and ``HTTP://...`` is recognised.
    """
    scheme, colon, _rest = url.partition(':')
    return bool(colon) and scheme.lower() in default_port


def _canonical_url(base_url, html):
    """Return the URL advertised by a ``<link rel="canonical">`` in *html*.

    *base_url* is the address the document was fetched from; it is used to
    resolve a canonical link that has no scheme of its own.  Returns None
    when the document has no canonical link with a non-empty href.
    """
    soup = BeautifulSoup(html)
    for link in soup.findAll('link'):
        # link.attrs is iterated as (name, value) pairs, as the previous
        # implementation did.
        attrs = dict(link.attrs)
        href = attrs.get('href')
        if attrs.get('rel') != "canonical" or not href:
            continue
        print("Found Canonical URL: " + href)
        if _has_known_scheme(href):
            return href
        # Youtube is returning stupid canonical links (no scheme/host), so
        # resolve them against the page they came from.
        print("Bad URL. Youtube likely. Appending hostname.")
        return urlparse.urljoin(base_url, href)
    return None


def normalize(url):
    """Normalize a URL.

    The URL is first fetched over the network so that redirects (URL
    shorteners, Feedburner, ...) are followed.  If the fetched document
    declares a ``<link rel="canonical">``, that link is returned as-is
    (resolved against the fetched URL when it is relative).  Otherwise the
    final URL is normalized syntactically: lowercase scheme and host,
    minimal percent-encoding with uppercase hex digits, dot-segments
    removed, default port and empty userinfo dropped, empty path replaced
    by "/".

    Fetching is best effort: any failure is reported on stdout and the
    syntactic normalization is applied to whatever URL is known so far.

    Side effects: sets the process-wide socket default timeout to 10
    seconds, installs a cookie-aware urllib2 opener globally, and prints
    progress messages to stdout.

    url -- the URL to normalize; "http://" is prepended when it does not
           start with one of the schemes listed in default_port.

    Returns the canonical or normalized URL as a string.
    """
    # timeout in seconds
    timeout = 10
    socket.setdefaulttimeout(timeout)

    url = url.strip()
    if not _has_known_scheme(url):
        url = "http://" + url

    # Add TinyURL lookup, plus fix FeedburnerURLs
    try:
        headers = {'User-Agent': 'LonBot/1.0 +http://Lonava.com/'}
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        urllib2.install_opener(opener)
        response = urllib2.urlopen(urllib2.Request(url, None, headers))
        try:
            # Keep the post-redirect URL even if reading the body fails.
            url = response.geturl()
            html = response.read()
        finally:
            response.close()
        try:
            canonical = _canonical_url(url, html)
        except Exception:
            # Unparseable markup must not abort normalization; fall back to
            # the fetched URL.
            canonical = None
        if canonical is not None:
            return canonical
    except Exception:
        print("Possibly bad url; Doing my best.")

    scheme, auth, path, query, fragment = urlparse.urlsplit(url.strip())
    userinfo, host, port = re.search(r'([^@]*@)?([^:]*):?(.*)', auth).groups()

    # Always provide the URI scheme in lowercase characters.
    scheme = scheme.lower()

    # Always provide the host, if any, in lowercase characters.
    host = host.lower()
    if host and host[-1] == '.':
        host = host[:-1]

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
    def clean(text):
        text = unicode(unquote(text), 'utf-8', 'replace')
        return unicodedata.normalize('NFC', text).encode('utf-8')

    path = quote(clean(path), "~:/?#[]@!$&'()*+,;=")
    fragment = quote(clean(fragment), "~")

    # note care must be taken to only encode & and = characters as values
    query = "&".join(
        "=".join(quote(clean(part), "~:/?#[]@!$'()*+,;=")
                 for part in pair.split("=", 1))
        for pair in query.split("&"))

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ["", "http", "https", "ftp", "file"]:
        segments = path.split('/')
        output = []
        for segment in segments:
            if segment == "":
                # Keep only the leading empty segment (the initial "/").
                if not output:
                    output.append(segment)
            elif segment == ".":
                pass
            elif segment == "..":
                # Never pop the leading empty segment: ".." cannot climb
                # above the root.
                if len(output) > 1:
                    output.pop()
            else:
                output.append(segment)
        # A path ending in "/", "." or ".." denotes a directory: keep the
        # trailing slash.
        if segments[-1] in ["", ".", ".."]:
            output.append("")
        path = '/'.join(output)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ["@", ":@"]:
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ["http", "https", "ftp", "file"]:
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired
    if port and scheme in default_port:
        if port.isdigit():
            # Strip leading zeros before comparing with the default.
            port = str(int(port))
            if int(port) == default_port[scheme]:
                port = ''

    # Put it all back together again
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    # urlunsplit drops an empty fragment; keep a bare trailing "#".
    if url.endswith("#") and query == "" and fragment == "":
        path += "#"
    return urlparse.urlunsplit((scheme, auth, path, query, fragment))
if __name__ == "__main__":
    # Self-test: run with `python urlnorm.py`.  Every case calls normalize(),
    # which performs an HTTP request, so the suite needs network access.
    import unittest

    suite = unittest.TestSuite()

    # from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
    # Each entry is (expected, url): `expected` states whether normalize(url)
    # should return url unchanged.
    idempotence_tests = [
        (False, "http://:@example.com/"),
        (False, "http://@example.com/"),
        (False, "http://example.com"),
        (False, "HTTP://example.com/"),
        (False, "http://EXAMPLE.COM/"),
        (False, "http://example.com/%7Ejane"),
        (False, "http://example.com/?q=%C7"),
        (False, "http://example.com/?q=%5c"),
        (False, "http://example.com/?q=C%CC%A7"),
        (False, "http://example.com/a/../a/b"),
        (False, "http://example.com/a/./b"),
        (False, "http://example.com:80/"),
        (True, "http://example.com/"),
        (True, "http://example.com/?q=%C3%87"),
        (True, "http://example.com/?q=%E2%85%A0"),
        (True, "http://example.com/?q=%5C"),
        (True, "http://example.com/~jane"),
        (True, "http://example.com/a/b"),
        (True, "http://example.com:8080/"),
        (True, "http://user:password@example.com/"),
        # from rfc2396bis
        (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
        (True, "http://www.ietf.org/rfc/rfc2396.txt"),
        (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
        (True, "mailto:John.Doe@example.com"),
        (True, "news:comp.infosystems.www.servers.unix"),
        (True, "tel:+1-816-555-1212"),
        (True, "telnet://192.0.2.16:80/"),
        (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),
        # other
        (True, "http://127.0.0.1/"),
        (False, "http://127.0.0.1:80/"),
        (True, "http://www.w3.org/2000/01/rdf-schema#"),
        (False, "http://example.com:081/"),
    ]

    def idempotence_case(expected, value):
        """Build a test checking whether normalize(value) leaves value unchanged."""
        class test(unittest.TestCase):
            def runTest(self):
                # Normalize once and reuse the result.  assertEqual is used
                # instead of a bare assert so the check survives `python -O`.
                result = normalize(value)
                self.assertEqual(result == value, expected,
                                 (expected, value, result))
        return test()

    for (expected, value) in idempotence_tests:
        suite.addTest(idempotence_case(expected, value))

    # mnot test suite; three tests updated for rfc2396bis.
    # Maps an input URL to the exact string normalize() should return.
    rewrite_tests = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/baz',  # was: '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/foo',  # was: '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/foo',  # was: '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        'ftp://user:pass@ftp.foo.net/foo/bar':
            'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar':
            'http://USER:pass@www.example.com/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }

    def rewrite_case(original, normalized):
        """Build a test checking that normalize(original) == normalized."""
        class test(unittest.TestCase):
            def runTest(self):
                result = normalize(original)
                self.assertEqual(result, normalized,
                                 (original, normalized, result))
        return test()

    for (original, normalized) in rewrite_tests.items():
        suite.addTest(rewrite_case(original, normalized))

    # execute tests
    unittest.TextTestRunner().run(suite)