/apps/nebula/utilities.py
Python | 179 lines | 165 code | 7 blank | 7 comment | 6 complexity | 5b7b069043b1fd1558df80870ae86e3e MD5 | raw file
Possible License(s): BSD-3-Clause
- import re
- import feedparser
- import urllib2
- import datetime
- from BeautifulSoup import BeautifulSoup
- from nebula.debugging import logging
- from nebula.time_utilities import time_to_datetime
- from pytils.translit import slugify
- VERSION = '0.1'
- URL = 'http://pyobject.ru/about/nebula/'
- USER_AGENT = 'nebula %s - %s' % (VERSION, URL)
- def shorten_url(url):
- quoted = urllib2.quote(url)
- req = 'http://is.gd/api.php?longurl=%s' % quoted
- res = urllib2.urlopen(req)
- retcode = res.code
- if retcode != 200:
- raise RuntimeError("Cannot shorten url because is.gd returns non-ok status code: %d" % retcode)
- return res.read()
- def clean_body(body):
- headings_start = re.compile(r'(<[h|H]\d{1}>)')
- headings_end = re.compile(r'(</?[h|H]\d{1}>)')
- divs = re.compile(r'(<[/]?div.*?>)')
- comments = re.compile(r'(<!--.*?-->)')
- body = divs.sub('', body)
- body = headings_start.sub('<p class="heading">', body)
- body = headings_end.sub('</p>', body)
- body = comments.sub('', body)
- # Remove junky feedburner links:
- # Note, we don't remove all links that reference feedburner,
- # only those which contain image elements that reference
- # feedburner.
- # You cannot simply remove all links that point to feedburner
- # because some publishers use a feature that rewrites all links
- # in the content to proxy through FB for tracking purposes.
- if 'feedburner' in body:
- soup = BeautifulSoup(body)
- images = soup.findAll('img', src=re.compile('feedburner'))
- for i in images:
- # Remove the parent link (and by association, the image)
- i.parent.extract()
- body = unicode(soup) # Using unicode to be nice, I guess. str()
- # might work just as well.
- return body.strip()
- def clean_title(title):
- if title:
- bracketed_text = re.compile(r'\[(.*?)\]')
- title = bracketed_text.sub('', title)
- return title.strip()
- else:
- return ''
- def fetch_single_feed(blog, callback_filter=None):
- num_with_tags = 0
- from nebula.models import AggregatedBlog, AggregatedPost
- logging.debug('Fetching feed %s from blog %r' % (blog.feed, blog.name))
- assert isinstance(blog, AggregatedBlog)
- FeedPostClass = getattr(blog, '_post_class', AggregatedPost)
- assert issubclass(FeedPostClass, AggregatedPost), "blog._post_class is %s class instead of subclass of AggregatedBlog" % FeedPostClass.__name__
- if not blog.feed:
- logging.info('Blog %r have no feed' % blog.name)
- return
- try:
- d = feedparser.parse(blog.feed, agent=USER_AGENT, etag=blog.etag)
- except Exception, e:
- logging.error("Fail to fetch feed %s from blog %r: %s" % (blog.feed, blog.name, e))
- status = d.get('status')
- if status:
- if status == 304:
- logging.debug('Feed %s has not changed since our last attempt' % blog.feed)
- elif status >= 400:
- logging.error('HTTP error while trying to grab the feed %s: %s' % (blog.feed, status))
- return
- blog.etag = d.get('etag') or ''
- for entry in d.entries:
- created = False
- active = True
- guid = entry.get('guid', entry.get('link'))
- if not guid:
- logging.warning('Entry %r from feed have %s no guid' % (entry.title, blog.feed))
- continue
- try:
- existing_post = FeedPostClass.objects.get(guid__iexact=guid)
- continue
- except FeedPostClass.DoesNotExist:
- logging.debug('Post %r from feed %s does not already exist in DB' % (guid, blog.feed))
- pass
- date_posted = entry.get('modified_parsed', None)
- if date_posted:
- date_posted = time_to_datetime(date_posted)
- else:
- logging.warning('Blog %r has bad dates' % (blog.name,))
- blog.bad_dates = True
- date_posted = datetime.datetime.now()
- title = entry.get('title', None)
- body = entry.get('summary', None)
- if not body:
- body = getattr(entry, 'content', [{}])[0].get('value', '')
- if body != '':
- body = clean_body(body)
- if title == body:
- body = ''
- if title != '':
- title = clean_title(title)
- link = entry.get('feedburner_origlink', entry.get('link', None))
- #title = title.encode('ascii', 'xmlcharrefreplace')
- #if body:
- # body = body.encode('ascii', 'xmlcharrefreplace')
- #author = None
- author = entry.get('author_detail')
- if not author:
- author = entry.get('author', '')
- else:
- author = author.get('name', '')
- #if author:
- # author = author.encode('ascii', 'xmlcharrefreplace')
- #else:
- # author = ''
- # Process tags if they exist
- tags = entry.get('tags', '')
- if tags != '':
- num_with_tags += 1
- tags = ' '.join([tag.term.lower() for tag in tags])
- logging.debug('Found tags for entry %r from feed %s: %s' % (guid, blog.feed, tags,))
- # shorten url if length bigger than 255
- if len(link) >= 255:
- link = shorten_url(link)
- # calls callback filter for entry
- defaults = {
- 'blog' : blog,
- 'title' : title,
- 'slug' : slugify(title)[:50],
- 'body' : body,
- 'link' : link,
- 'guid' : guid,
- 'author': author,
- 'posted': date_posted.replace(tzinfo=None),
- 'tags' : tags,
- 'active': not blog.bad_dates,
- }
- post_extra_defaults = getattr(blog, '_post_extra_defaults', {})
- defaults.update(post_extra_defaults)
- if callable(callback_filter):
- # callback filter may return None if this post must be skipped
- appropriate_defaults = callback_filter(defaults)
- else:
- appropriate_defaults = defaults
- if appropriate_defaults:
- post, created = FeedPostClass.objects.get_or_create(
- guid__iexact=guid,
- defaults=appropriate_defaults,
- )
- if num_with_tags == 0:
- logging.debug('Blog %r has no tags' % (blog,))
- blog.bad_tags = True
- else:
- blog.bad_tags = False
- blog.save()
- def fetch_feeds(blogs, callback_filter=None):
- for blog in blogs:
- fetch_single_feed(blog, callback_filter)