
/parsers/baseparser.py

https://gitlab.com/andyblaesus/newsdiffs
import cookielib
import logging
import re
import socket
import sys
import time
import urllib2

# Define a logger

# This formatter is like the default but uses a period rather than a comma
# to separate the milliseconds
class MyFormatter(logging.Formatter):
    def formatTime(self, record, datefmt=None):
        return logging.Formatter.formatTime(self, record,
                                            datefmt).replace(',', '.')

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

formatter = MyFormatter('%(asctime)s:%(levelname)s:%(message)s')

ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)
ch.setFormatter(formatter)
logger.addHandler(ch)

# Utility functions

def grab_url(url, max_depth=5, opener=None):
    if opener is None:
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    retry = False
    try:
        text = opener.open(url, timeout=5).read()
        if '<title>NY Times Advertisement</title>' in text:
            retry = True
    except socket.timeout:
        retry = True
    if retry:
        if max_depth == 0:
            raise Exception('Too many attempts to download %s' % url)
        time.sleep(0.5)
        return grab_url(url, max_depth-1, opener)
    return text
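# Example (not part of the original file): a minimal sketch of calling
# grab_url on its own, assuming network access and that the target URL
# responds within the 5-second timeout.  It returns the raw page bytes and
# retries (up to max_depth times) on socket timeouts or the NY Times
# advertisement interstitial before raising.  The URL is hypothetical.
#
#     >>> html = grab_url('http://www.example.com/')
#     >>> '<html' in html.lower()
#     True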
# Begin hot patch for https://bugs.launchpad.net/bugs/788986
# Ick.
from BeautifulSoup import BeautifulSoup
def bs_fixed_getText(self, separator=u""):
    bsmod = sys.modules[BeautifulSoup.__module__]
    if not len(self.contents):
        return u""
    stopNode = self._lastRecursiveChild().next
    strings = []
    current = self.contents[0]
    while current is not stopNode:
        if isinstance(current, bsmod.NavigableString):
            strings.append(current)
        current = current.next
    return separator.join(strings)
sys.modules[BeautifulSoup.__module__].Tag.getText = bs_fixed_getText
# End fix

def strip_whitespace(text):
    lines = text.split('\n')
    return '\n'.join(x.strip().rstrip(u'\xa0') for x in lines).strip() + '\n'

# from http://stackoverflow.com/questions/5842115/converting-a-string-which-contains-both-utf-8-encoded-bytestrings-and-codepoints
# Translate a unicode string containing utf8
def parse_double_utf8(txt):
    def parse(m):
        try:
            return m.group(0).encode('latin1').decode('utf8')
        except UnicodeDecodeError:
            return m.group(0)
    return re.sub(ur'[\xc2-\xf4][\x80-\xbf]+', parse, txt)

def canonicalize(text):
    return strip_whitespace(parse_double_utf8(text))

def concat(domain, url):
    return domain + url if url.startswith('/') else domain + '/' + url

# End utility functions
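# Example (not part of the original file): a quick sketch of what
# canonicalize does, assuming Python 2 string semantics.  Doubly-encoded
# text such as u'caf\xc3\xa9' (UTF-8 bytes mis-read as Latin-1) collapses
# back to u'caf\xe9', and the result is whitespace-stripped with a single
# trailing newline:
#
#     >>> canonicalize(u'  caf\xc3\xa9  ')
#     u'caf\xe9\n'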
# Base Parser
# To create a new parser, subclass and define _parse(html).
class BaseParser(object):
    url = None
    domains = []  # List of domains this should parse

    # These should be filled in by self._parse(html)
    date = None
    title = None
    byline = None
    body = None

    real_article = True  # If set to False, ignore this article
    SUFFIX = ''           # append suffix, like '?fullpage=yes', to urls
    meta = []             # Currently unused.

    # Used when finding articles to parse
    feeder_pat = None   # Look for links matching this regular expression
    feeder_pages = []   # on these pages
    feeder_bs = BeautifulSoup  # use this version of beautifulsoup for feed

    def __init__(self, url):
        self.url = url
        try:
            self.html = grab_url(self._printableurl())
        except urllib2.HTTPError as e:
            if e.code == 404:
                self.real_article = False
                return
            raise
        logger.debug('got html')
        self._parse(self.html)

    def _printableurl(self):
        return self.url + self.SUFFIX

    def _parse(self, html):
        """Should take html and populate self.(date, title, byline, body)

        If the article isn't valid, set self.real_article to False and return.
        """
        raise NotImplementedError()

    def __unicode__(self):
        return canonicalize(u'\n'.join((self.date, self.title, self.byline,
                                        self.body,)))

    @classmethod
    def feed_urls(cls):
        all_urls = []
        for feeder_url in cls.feeder_pages:
            html = grab_url(feeder_url)
            soup = cls.feeder_bs(html)

            # "or ''" to make None into str
            urls = [a.get('href') or '' for a in soup.findAll('a')]

            # If no http://, prepend domain name
            domain = '/'.join(feeder_url.split('/')[:3])
            urls = [url if '://' in url else concat(domain, url) for url in urls]

            all_urls = all_urls + [url for url in urls if
                                   re.search(cls.feeder_pat, url)]
        return all_urls
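# Hypothetical example (not part of the original module): a minimal sketch of
# a site-specific parser, showing the pattern described above: subclass
# BaseParser and define _parse(html).  The domain, feeder URL, regex, and tag
# lookups below are invented for illustration and would need to match a real
# site's markup.
class ExampleParser(BaseParser):
    domains = ['www.example.com']

    # Articles are discovered by scanning feeder_pages for links that match
    # feeder_pat (see feed_urls above).
    feeder_pat = r'^http://www\.example\.com/news/'
    feeder_pages = ['http://www.example.com/news/']

    def _parse(self, html):
        soup = BeautifulSoup(html)
        title_tag = soup.find('h1')
        if title_tag is None:
            # Doesn't look like an article page; tell the scraper to skip it.
            self.real_article = False
            return
        self.title = title_tag.getText()
        self.byline = ''
        self.date = ''
        # Join all paragraph text; a real parser would restrict this to the
        # article body container.
        self.body = '\n'.join(p.getText() for p in soup.findAll('p'))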