PageRenderTime 55ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/apps/nebula/utilities.py

https://bitbucket.org/resplin/byteflow
Python | 179 lines | 165 code | 7 blank | 7 comment | 6 complexity | 5b7b069043b1fd1558df80870ae86e3e MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import re
  2. import feedparser
  3. import urllib2
  4. import datetime
  5. from BeautifulSoup import BeautifulSoup
  6. from nebula.debugging import logging
  7. from nebula.time_utilities import time_to_datetime
  8. from pytils.translit import slugify
  9. VERSION = '0.1'
  10. URL = 'http://pyobject.ru/about/nebula/'
  11. USER_AGENT = 'nebula %s - %s' % (VERSION, URL)
  12. def shorten_url(url):
  13. quoted = urllib2.quote(url)
  14. req = 'http://is.gd/api.php?longurl=%s' % quoted
  15. res = urllib2.urlopen(req)
  16. retcode = res.code
  17. if retcode != 200:
  18. raise RuntimeError("Cannot shorten url because is.gd returns non-ok status code: %d" % retcode)
  19. return res.read()
  20. def clean_body(body):
  21. headings_start = re.compile(r'(<[h|H]\d{1}>)')
  22. headings_end = re.compile(r'(</?[h|H]\d{1}>)')
  23. divs = re.compile(r'(<[/]?div.*?>)')
  24. comments = re.compile(r'(<!--.*?-->)')
  25. body = divs.sub('', body)
  26. body = headings_start.sub('<p class="heading">', body)
  27. body = headings_end.sub('</p>', body)
  28. body = comments.sub('', body)
  29. # Remove junky feedburner links:
  30. # Note, we don't remove all links that reference feedburner,
  31. # only those which contain image elements that reference
  32. # feedburner.
  33. # You cannot simply remove all links that point to feedburner
  34. # because some publishers use a feature that rewrites all links
  35. # in the content to proxy through FB for tracking purposes.
  36. if 'feedburner' in body:
  37. soup = BeautifulSoup(body)
  38. images = soup.findAll('img', src=re.compile('feedburner'))
  39. for i in images:
  40. # Remove the parent link (and by association, the image)
  41. i.parent.extract()
  42. body = unicode(soup) # Using unicode to be nice, I guess. str()
  43. # might work just as well.
  44. return body.strip()
  45. def clean_title(title):
  46. if title:
  47. bracketed_text = re.compile(r'\[(.*?)\]')
  48. title = bracketed_text.sub('', title)
  49. return title.strip()
  50. else:
  51. return ''
  52. def fetch_single_feed(blog, callback_filter=None):
  53. num_with_tags = 0
  54. from nebula.models import AggregatedBlog, AggregatedPost
  55. logging.debug('Fetching feed %s from blog %r' % (blog.feed, blog.name))
  56. assert isinstance(blog, AggregatedBlog)
  57. FeedPostClass = getattr(blog, '_post_class', AggregatedPost)
  58. assert issubclass(FeedPostClass, AggregatedPost), "blog._post_class is %s class instead of subclass of AggregatedBlog" % FeedPostClass.__name__
  59. if not blog.feed:
  60. logging.info('Blog %r have no feed' % blog.name)
  61. return
  62. try:
  63. d = feedparser.parse(blog.feed, agent=USER_AGENT, etag=blog.etag)
  64. except Exception, e:
  65. logging.error("Fail to fetch feed %s from blog %r: %s" % (blog.feed, blog.name, e))
  66. status = d.get('status')
  67. if status:
  68. if status == 304:
  69. logging.debug('Feed %s has not changed since our last attempt' % blog.feed)
  70. elif status >= 400:
  71. logging.error('HTTP error while trying to grab the feed %s: %s' % (blog.feed, status))
  72. return
  73. blog.etag = d.get('etag') or ''
  74. for entry in d.entries:
  75. created = False
  76. active = True
  77. guid = entry.get('guid', entry.get('link'))
  78. if not guid:
  79. logging.warning('Entry %r from feed have %s no guid' % (entry.title, blog.feed))
  80. continue
  81. try:
  82. existing_post = FeedPostClass.objects.get(guid__iexact=guid)
  83. continue
  84. except FeedPostClass.DoesNotExist:
  85. logging.debug('Post %r from feed %s does not already exist in DB' % (guid, blog.feed))
  86. pass
  87. date_posted = entry.get('modified_parsed', None)
  88. if date_posted:
  89. date_posted = time_to_datetime(date_posted)
  90. else:
  91. logging.warning('Blog %r has bad dates' % (blog.name,))
  92. blog.bad_dates = True
  93. date_posted = datetime.datetime.now()
  94. title = entry.get('title', None)
  95. body = entry.get('summary', None)
  96. if not body:
  97. body = getattr(entry, 'content', [{}])[0].get('value', '')
  98. if body != '':
  99. body = clean_body(body)
  100. if title == body:
  101. body = ''
  102. if title != '':
  103. title = clean_title(title)
  104. link = entry.get('feedburner_origlink', entry.get('link', None))
  105. #title = title.encode('ascii', 'xmlcharrefreplace')
  106. #if body:
  107. # body = body.encode('ascii', 'xmlcharrefreplace')
  108. #author = None
  109. author = entry.get('author_detail')
  110. if not author:
  111. author = entry.get('author', '')
  112. else:
  113. author = author.get('name', '')
  114. #if author:
  115. # author = author.encode('ascii', 'xmlcharrefreplace')
  116. #else:
  117. # author = ''
  118. # Process tags if they exist
  119. tags = entry.get('tags', '')
  120. if tags != '':
  121. num_with_tags += 1
  122. tags = ' '.join([tag.term.lower() for tag in tags])
  123. logging.debug('Found tags for entry %r from feed %s: %s' % (guid, blog.feed, tags,))
  124. # shorten url if length bigger than 255
  125. if len(link) >= 255:
  126. link = shorten_url(link)
  127. # calls callback filter for entry
  128. defaults = {
  129. 'blog' : blog,
  130. 'title' : title,
  131. 'slug' : slugify(title)[:50],
  132. 'body' : body,
  133. 'link' : link,
  134. 'guid' : guid,
  135. 'author': author,
  136. 'posted': date_posted.replace(tzinfo=None),
  137. 'tags' : tags,
  138. 'active': not blog.bad_dates,
  139. }
  140. post_extra_defaults = getattr(blog, '_post_extra_defaults', {})
  141. defaults.update(post_extra_defaults)
  142. if callable(callback_filter):
  143. # callback filter may return None if this post must be skipped
  144. appropriate_defaults = callback_filter(defaults)
  145. else:
  146. appropriate_defaults = defaults
  147. if appropriate_defaults:
  148. post, created = FeedPostClass.objects.get_or_create(
  149. guid__iexact=guid,
  150. defaults=appropriate_defaults,
  151. )
  152. if num_with_tags == 0:
  153. logging.debug('Blog %r has no tags' % (blog,))
  154. blog.bad_tags = True
  155. else:
  156. blog.bad_tags = False
  157. blog.save()
  158. def fetch_feeds(blogs, callback_filter=None):
  159. for blog in blogs:
  160. fetch_single_feed(blog, callback_filter)