PageRenderTime 71ms CodeModel.GetById 35ms RepoModel.GetById 0ms app.codeStats 0ms

/extract_tags/lib/tldextract.py

https://bitbucket.org/pombredanne/unstructured-data-extractor
Python | 232 lines | 207 code | 2 blank | 23 comment | 4 complexity | 92c7886b5326743987ea26cb671ca717 MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. """`tldextract` accurately separates the gTLD or ccTLD (generic or country code
  3. top-level domain) from the registered domain and subdomains of a URL.
  4. >>> import tldextract
  5. >>> tldextract.extract('http://forums.news.cnn.com/')
  6. ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
  7. >>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
  8. ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
  9. >>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
  10. ExtractResult(subdomain='www', domain='worldbank', tld='org.kg')
  11. `ExtractResult` is a namedtuple, so it's simple to access the parts you want.
  12. >>> ext = tldextract.extract('http://forums.bbc.co.uk')
  13. >>> ext.domain
  14. 'bbc'
  15. >>> '.'.join(ext[:2]) # rejoin subdomain and domain
  16. 'forums.bbc'
  17. """
  18. from __future__ import with_statement
  19. try:
  20. import cPickle as pickle
  21. except ImportError:
  22. import pickle
  23. import errno
  24. from functools import wraps
  25. import logging
  26. from operator import itemgetter
  27. import os
  28. import sys
try:
    import pkg_resources
except ImportError:
    class pkg_resources(object):
        """Fake pkg_resources interface which falls back to getting resources
        inside `tldextract`'s directory.
        """
        @classmethod
        def resource_stream(cls, package, resource_name):
            # Resolve the resource relative to this module's own directory;
            # `package` is accepted only for API compatibility and is unused.
            moddir = os.path.dirname(__file__)
            f = os.path.join(moddir, resource_name)
            # NOTE(review): opens in text mode, whereas the real
            # pkg_resources.resource_stream returns a binary stream — the
            # callers unpickle from it, so confirm this stays safe if the
            # snapshot's pickle protocol ever changes.
            return open(f)
  41. import re
  42. import socket
  43. import urllib2
  44. import urlparse
# Module-level logger used for cache/fetch diagnostics throughout this file.
LOG = logging.getLogger("tldextract")

# Matches an optional leading URL scheme (e.g. "http:" or a bare "//") so it
# can be stripped before isolating the host.
SCHEME_RE = re.compile(r'^([' + urlparse.scheme_chars + ']+:)?//')
# Matches a full dotted-quad IPv4 address, each octet restricted to 0-255.
IP_RE = re.compile(r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')
  48. class ExtractResult(tuple):
  49. 'ExtractResult(subdomain, domain, tld)'
  50. __slots__ = ()
  51. _fields = ('subdomain', 'domain', 'tld')
  52. def __new__(_cls, subdomain, domain, tld):
  53. 'Create new instance of ExtractResult(subdomain, domain, tld)'
  54. return tuple.__new__(_cls, (subdomain, domain, tld))
  55. @classmethod
  56. def _make(cls, iterable, new=tuple.__new__, len=len):
  57. 'Make a new ExtractResult object from a sequence or iterable'
  58. result = new(cls, iterable)
  59. if len(result) != 3:
  60. raise TypeError('Expected 3 arguments, got %d' % len(result))
  61. return result
  62. def __repr__(self):
  63. 'Return a nicely formatted representation string'
  64. return 'ExtractResult(subdomain=%r, domain=%r, tld=%r)' % self
  65. def _asdict(self):
  66. 'Return a new dict which maps field names to their values'
  67. return dict(zip(self._fields, self))
  68. def _replace(_self, **kwds):
  69. 'Return a new ExtractResult object replacing specified fields with new values'
  70. result = _self._make(map(kwds.pop, ('subdomain', 'domain', 'tld'), _self))
  71. if kwds:
  72. raise ValueError('Got unexpected field names: %r' % kwds.keys())
  73. return result
  74. def __getnewargs__(self):
  75. 'Return self as a plain tuple. Used by copy and pickle.'
  76. return tuple(self)
  77. subdomain = property(itemgetter(0), doc='Alias for field number 0')
  78. domain = property(itemgetter(1), doc='Alias for field number 1')
  79. tld = property(itemgetter(2), doc='Alias for field number 2')
class TLDExtract(object):
    """Callable that splits a URL into subdomain, domain, and TLD parts,
    backed by a lazily built, cached Public Suffix List of TLDs.
    """

    def __init__(self, fetch=True, cache_file=''):
        """
        Constructs a callable for extracting subdomain, domain, and TLD
        components from a URL.
        If fetch is True (the default) and no cached TLD set is found, this
        extractor will fetch TLD sources live over HTTP on first use. Set to
        False to not make HTTP requests. Either way, if the TLD set can't be
        read, the module will fall back to the included TLD set snapshot.
        Specifying cache_file will override the location of the TLD set.
        Defaults to /path/to/tldextract/.tld_set.
        """
        self.fetch = fetch
        self.cache_file = cache_file or os.path.join(os.path.dirname(__file__), '.tld_set')
        # Lazily built _PublicSuffixListTLDExtractor; see _get_tld_extractor.
        self._extractor = None

    def __call__(self, url):
        """
        Takes a string URL and splits it into its subdomain, domain, and
        gTLD/ccTLD component.
        >>> extract = TLDExtract()
        >>> extract('http://forums.news.cnn.com/')
        ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
        >>> extract('http://forums.bbc.co.uk/')
        ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
        """
        # Reduce the URL to its bare host: strip scheme, then path, query,
        # fragment, userinfo, and port, in that order.
        netloc = SCHEME_RE.sub("", url) \
            .partition("/")[0] \
            .partition("?")[0] \
            .partition("#")[0] \
            .split("@")[-1] \
            .partition(":")[0]
        registered_domain, tld = self._get_tld_extractor().extract(netloc)
        if not tld and netloc and netloc[0].isdigit():
            # No suffix matched and the host starts with a digit: it may be a
            # bare IPv4 address, which is returned whole as the "domain".
            try:
                is_ip = socket.inet_aton(netloc)
                return ExtractResult('', netloc, '')
            except AttributeError:
                # Platform lacks inet_aton: fall back to the regex check.
                if IP_RE.match(netloc):
                    return ExtractResult('', netloc, '')
            except socket.error:
                # Not a valid IPv4 address; treat it as an ordinary host.
                pass
        # Everything left of the TLD: the last label is the registered
        # domain, anything before it is the subdomain.
        subdomain, _, domain = registered_domain.rpartition('.')
        return ExtractResult(subdomain, domain, tld)

    def _get_tld_extractor(self):
        """Return (building on first use) the suffix-list extractor.

        Resolution order: in-memory instance -> pickled cache file ->
        live HTTP fetch (when self.fetch) -> bundled snapshot. A freshly
        fetched TLD set is written back to the cache file, best-effort.
        """
        if self._extractor:
            return self._extractor

        cached_file = self.cache_file
        try:
            # NOTE(review): the cache is unpickled, so the cache file
            # location must be trusted — pickle is not safe on hostile data.
            with open(cached_file) as f:
                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(f))
                return self._extractor
        except IOError, ioe:
            # A missing cache file is the normal first-run case; only log
            # other I/O failures.
            file_not_found = ioe.errno == errno.ENOENT
            if not file_not_found:
                LOG.error("error reading TLD cache file %s: %s", cached_file, ioe)
        except Exception, ex:
            # e.g. a corrupt pickle; fall through to rebuilding the set.
            LOG.error("error reading TLD cache file %s: %s", cached_file, ex)

        tlds = frozenset()
        if self.fetch:
            # Union the suffix rules from every configured live source.
            tld_sources = (_PublicSuffixListSource,)
            tlds = frozenset(tld for tld_source in tld_sources for tld in tld_source())

        if not tlds:
            # Fetch disabled or failed: fall back to the bundled snapshot.
            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(snapshot_file))
                return self._extractor

        LOG.info("computed TLDs: [%s, ...]", ', '.join(list(tlds)[:10]))
        if LOG.isEnabledFor(logging.DEBUG):
            # In debug mode, show how the fresh list differs from the snapshot.
            import difflib
            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
                snapshot = sorted(pickle.load(snapshot_file))
            new = sorted(tlds)
            for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=cached_file):
                print >> sys.stderr, line.encode('utf-8')

        try:
            # Best-effort cache write: failure only logs a warning.
            with open(cached_file, 'wb') as f:
                pickle.dump(tlds, f)
        except IOError, e:
            LOG.warn("unable to cache TLDs in file %s: %s", cached_file, e)

        self._extractor = _PublicSuffixListTLDExtractor(tlds)
        return self._extractor
# Shared module-level extractor backing the convenience `extract` function,
# so all callers reuse one TLD set.
TLD_EXTRACTOR = TLDExtract()

@wraps(TLD_EXTRACTOR.__call__)
def extract(url):
    # Delegate to the singleton; see TLDExtract.__call__ for semantics.
    return TLD_EXTRACTOR(url)
def _fetch_page(url):
    """Fetch `url` over HTTP and return its body decoded as UTF-8 text.

    On any URLError the error is logged and an empty unicode string is
    returned, so callers can treat a failed fetch as "no data".
    """
    try:
        return unicode(urllib2.urlopen(url).read(), 'utf-8')
    except urllib2.URLError, e:
        LOG.error(e)
        return u''
  170. def _PublicSuffixListSource():
  171. page = _fetch_page('http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1')
  172. tld_finder = re.compile(r'^(?P<tld>[.*!]*\w[\S]*)', re.UNICODE | re.MULTILINE)
  173. tlds = [m.group('tld') for m in tld_finder.finditer(page)]
  174. return tlds
  175. class _PublicSuffixListTLDExtractor(object):
  176. def __init__(self, tlds):
  177. self.tlds = tlds
  178. def extract(self, netloc):
  179. spl = netloc.split('.')
  180. for i in range(len(spl)):
  181. maybe_tld = '.'.join(spl[i:])
  182. exception_tld = '!' + maybe_tld
  183. if exception_tld in self.tlds:
  184. return '.'.join(spl[:i+1]), '.'.join(spl[i+1:])
  185. wildcard_tld = '*.' + '.'.join(spl[i+1:])
  186. if wildcard_tld in self.tlds or maybe_tld in self.tlds:
  187. return '.'.join(spl[:i]), maybe_tld
  188. return netloc, ''
if __name__ == "__main__":
    # CLI usage: python tldextract.py <url>
    # Prints the three parts space-separated, e.g. "forums bbc co.uk".
    url = sys.argv[1]
    print ' '.join(extract(url))