PageRenderTime 28ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/feedzilla/management/commands/feedzilla_analyze.py

https://bitbucket.org/lorien/feedzilla/
Python | 60 lines | 36 code | 13 blank | 11 comment | 8 complexity | f51ffa00cd065afa34197df496a03a82 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. """
  2. Script that searches for invalid or obsolete feeds.
  3. """
  4. # Copyright: 2011, Grigoriy Petukhov
  5. # Author: Grigoriy Petukhov (http://lorien.name)
  6. # License: BSD
  7. from grab import Grab
  8. import re
  9. from urlparse import urlsplit
  10. from django.core.management.base import BaseCommand
  11. from feedzilla.models import Feed
  12. REX_FEED_URL = re.compile(r'feed|rss|atom', re.I)
  13. class Command(BaseCommand):
  14. help = u'Search for invalid and obsolete feeds'
  15. def handle(self, *args, **kwargs):
  16. g = Grab()
  17. for feed in Feed.objects.all():
  18. host = urlsplit(feed.site_url).hostname
  19. ok = True
  20. resp = g.go(feed.site_url)
  21. # Check that request to Feed.site_url is not redirect elsewhere
  22. if resp.url != feed.site_url:
  23. print 'Site %s redirects to %s' % (feed.site_url, resp.url)
  24. ok = False
  25. tree = g.tree
  26. tree.make_links_absolute(resp.url)
  27. # Search for `Feed.feed_url` in HTML source of the page
  28. # fetched from `Feed.site_url` URL.
  29. found = False
  30. candidates = set()
  31. for elem, attr, url, pos in tree.iterlinks():
  32. if REX_FEED_URL.search(url):
  33. candidates.add(url)
  34. if url == feed.feed_url:
  35. found = True
  36. break
  37. # If `Feed.feed_url` was not found then
  38. # display found link which probably is correct variant of the `feed_url`.
  39. if not found:
  40. print 'Feed url %s not found on Site %s' % (feed.feed_url, host)
  41. print 'Candidates:'
  42. print '\n'.join(candidates)
  43. ok = False
  44. if not ok:
  45. print
  46. if __name__ == '__main__':
  47. main()