
/compat/feedfinder.py

https://bitbucket.org/resplin/byteflow
Possible License(s): BSD-3-Clause
  1. """feedfinder: Find the Web feed for a Web page
  2. http://www.aaronsw.com/2002/feedfinder/
  3. Usage:
  4. feed(uri) - returns feed found for a URI
  5. feeds(uri) - returns all feeds found for a URI
  6. >>> import feedfinder
  7. >>> feedfinder.feed('scripting.com')
  8. 'http://scripting.com/rss.xml'
  9. >>>
  10. >>> feedfinder.feeds('scripting.com')
  11. ['http://delong.typepad.com/sdj/atom.xml',
  12. 'http://delong.typepad.com/sdj/index.rdf',
  13. 'http://delong.typepad.com/sdj/rss.xml']
  14. >>>
  15. Can also use from the command line. Feeds are returned one per line:
  16. $ python feedfinder.py diveintomark.org
  17. http://diveintomark.org/xml/atom.xml
  18. How it works:
  19. 0. At every step, feeds are minimally verified to make sure they are really feeds.
  20. 1. If the URI points to a feed, it is simply returned; otherwise
  21. the page is downloaded and the real fun begins.
  22. 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
  23. 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
  24. ".atom"
  25. 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
  26. 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
  27. ".atom"
  28. 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
  29. 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
  30. 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
  31. """
__version__ = "1.371"
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""

_debug = 0

import sgmllib, urllib, urlparse, re, sys, robotparser
import threading
class TimeoutError(Exception): pass

def timelimit(timeout):
    """borrowed from web.py"""
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None

                    self.setDaemon(True)
                    self.start()

                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except:
                        self.error = sys.exc_info()

            c = Dispatch()
            c.join(timeout)
            if c.isAlive():
                raise TimeoutError, 'took too long'
            if c.error:
                raise c.error[0], c.error[1]
            return c.result
        return _2
    return _1
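# Illustrative sketch (not part of the original module): timelimit wraps a
# function so the call runs in a daemon thread and raises TimeoutError if it
# has not finished within `timeout` seconds; the names below are examples only.
#
#     @timelimit(5)
#     def fetch(url):
#         return urllib.urlopen(url).read()
#
# The decorated call returns the function's result, or re-raises whatever
# exception the worker thread hit.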
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpclib = None

if not dict:
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

def _debuglog(message):
    if _debug: print message

class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""
    def __init__(self):
        self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
        self.urlopener = urllib.FancyURLopener()
        self.urlopener.version = "feedfinder/" + __version__ + " " + self.urlopener.version + " +http://www.aaronsw.com/2002/feedfinder/"
        _debuglog(self.urlopener.version)
        self.urlopener.addheaders = [('User-agent', self.urlopener.version)]
        robotparser.URLopener.version = self.urlopener.version
        robotparser.URLopener.addheaders = self.urlopener.addheaders

    def _getrp(self, url):
        protocol, domain = urlparse.urlparse(url)[:2]
        if self.rpcache.has_key(domain):
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            pass
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow

    @timelimit(30)
    def get(self, url, check=True):
        if check and not self.can_fetch(url): return ''
        try:
            return self.urlopener.open(url).read()
        except:
            return ''

_gatekeeper = URLGatekeeper()
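# Illustrative sketch (not part of the original module): every download goes
# through the shared gatekeeper, which caches one RobotFileParser per domain.
# The URL below is an example only.
#
#     if _gatekeeper.can_fetch('http://example.com/index.rdf'):
#         data = _gatekeeper.get('http://example.com/index.rdf')
#
# get() returns '' when robots.txt disallows the fetch or the request fails,
# and raises TimeoutError (via @timelimit(30)) if the download takes too long.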
class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
            v = v.strip()
            v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
            return v
        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs]
        attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.baseuri = attrsD['href']

    def error(self, *a, **kw): pass # we're not picky

class LinkParser(BaseParser):
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')
    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('rel'): return
        rels = attrsD['rel'].split()
        if 'alternate' not in rels: return
        if attrsD.get('type') not in self.FEED_TYPES: return
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
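# Illustrative sketch (not part of the original module): LinkParser implements
# step 2 (autodiscovery) and collects hrefs from header elements such as
#
#     <link rel="alternate" type="application/atom+xml" href="/feed/atom.xml">
#
# resolving each href against the page's base URI.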
class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

def makeFullURI(uri):
    uri = uri.strip()
    if uri.startswith('feed://'):
        uri = 'http://' + uri.split('feed://', 1).pop()
    for x in ['http', 'https']:
        if uri.startswith('%s://' % x):
            return uri
    return 'http://%s' % uri
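# Illustrative examples (not part of the original module; example.com is a
# placeholder host):
#
#     >>> makeFullURI('scripting.com')
#     'http://scripting.com'
#     >>> makeFullURI('feed://example.com/atom.xml')
#     'http://example.com/atom.xml'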
def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    p.feed(data)
    return p.links

def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    p.feed(data)
    return p.links

def getLocalLinks(links, baseuri):
    baseuri = baseuri.lower()
    urilen = len(baseuri)
    return [l for l in links if l.lower().startswith(baseuri)]
def isFeedLink(link):
    # note: '.atom' is five characters, so it needs its own check
    return link[-4:].lower() in ('.rss', '.rdf', '.xml') or link[-5:].lower() == '.atom'
def isXMLRelatedLink(link):
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
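# Illustrative examples (not part of the original module): isFeedLink matches
# links that end in a feed-style extension, while isXMLRelatedLink merely
# counts feed-related substrings anywhere in the URL (example.com is a
# placeholder host):
#
#     >>> isFeedLink('http://example.com/index.rdf')
#     True
#     >>> isXMLRelatedLink('http://example.com/rss2/')
#     1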
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)
def tryBrokenRedirect(data):
    if '<newLocation' in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris: return newuris[0].strip()

def couldBeFeedData(data):
    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
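# Illustrative examples (not part of the original module): the minimal feed
# check rejects anything that looks like HTML and otherwise counts feed root
# elements:
#
#     >>> couldBeFeedData('<rss version="2.0"><channel>...</channel></rss>')
#     1
#     >>> couldBeFeedData('<html><body>not a feed</body></html>')
#     0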
def isFeed(uri):
    _debuglog('seeing if %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'): return 0
    data = _gatekeeper.get(uri)
    return couldBeFeedData(data)

def sortFeeds(feed1Info, feed2Info):
    return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])

def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except:
        pass
    return feeds
def feeds(uri, all=False, querySyndic8=False, _recurs=None):
    if _recurs is None: _recurs = [uri]
    fulluri = makeFullURI(uri)
    try:
        data = _gatekeeper.get(fulluri, check=False)
    except:
        return []
    # is this already a feed?
    if couldBeFeedData(data):
        return [fulluri]
    newuri = tryBrokenRedirect(data)
    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
    # nope, it's a page, try LINK tags first
    _debuglog('looking for LINK tags')
    try:
        outfeeds = getLinks(data, fulluri)
    except:
        outfeeds = []
    _debuglog('found %s feeds through LINK tags' % len(outfeeds))
    outfeeds = filter(isFeed, outfeeds)
    if all or not outfeeds:
        # no LINK tags, look for regular <A> links that point to feeds
        _debuglog('no LINK tags, looking at A tags')
        try:
            links = getALinks(data, fulluri)
        except:
            links = []
        locallinks = getLocalLinks(links, fulluri)
        # look for obvious feed links on the same server
        outfeeds.extend(filter(isFeed, filter(isFeedLink, locallinks)))
        if all or not outfeeds:
            # look harder for feed links on the same server
            outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, locallinks)))
        if all or not outfeeds:
            # look for obvious feed links on another server
            outfeeds.extend(filter(isFeed, filter(isFeedLink, links)))
        if all or not outfeeds:
            # look harder for feed links on another server
            outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, links)))
    if all or not outfeeds:
        _debuglog('no A tags, guessing')
        suffixes = [ # filenames used by popular software:
            'atom.xml',   # blogger, TypePad
            'index.atom', # MT, apparently
            'index.rdf',  # MT
            'rss.xml',    # Dave Winer/Manila
            'index.xml',  # MT
            'index.rss'   # Slash
        ]
        outfeeds.extend(filter(isFeed, [urlparse.urljoin(fulluri, x) for x in suffixes]))
    if (all or not outfeeds) and querySyndic8:
        # still no luck, search Syndic8 for feeds (requires xmlrpclib)
        _debuglog('still no luck, searching Syndic8')
        outfeeds.extend(getFeedsFromSyndic8(uri))
    if hasattr(__builtins__, 'set') or __builtins__.has_key('set'):
        outfeeds = list(set(outfeeds))
    return outfeeds
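# Illustrative sketch (not part of the original module): typical calls, using
# the keyword arguments defined above:
#
#     feeds('scripting.com')                       # stop at the first strategy that finds feeds
#     feeds('scripting.com', all=True)             # run every strategy and merge the results
#     feeds('scripting.com', querySyndic8=True)    # also fall back to the Syndic8 XML-RPC search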
getFeeds = feeds # backwards-compatibility

def feed(uri):
    #todo: give preference to certain feed formats
    feedlist = feeds(uri)
    if feedlist:
        return feedlist[0]
    else:
        return None
##### test harness ######

def test():
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find('Atom autodiscovery test') == -1: break
        sys.stdout.write('.')
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print '\n*** FAILED ***', uri, 'could not find link'
            failed.append(uri)
        elif len(links) > 1:
            print '\n*** FAILED ***', uri, 'found too many links'
            failed.append(uri)
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print '\n*** FAILED ***', uri, 'retrieved wrong feed'
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1: break
        uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print
    print count, 'tests executed,', len(failed), 'failed'

if __name__ == '__main__':
    args = sys.argv[1:]
    if args and args[0] == '--debug':
        _debug = 1
        args.pop(0)
    if args:
        uri = args[0]
    else:
        uri = 'http://diveintomark.org/'
    if uri == 'test':
        test()
    else:
        print "\n".join(getFeeds(uri))