PageRenderTime 44ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/blogmaker/util/fetch.py

http://blogmaker.googlecode.com/
Python | 111 lines | 94 code | 8 blank | 9 comment | 3 complexity | d5903645d4a00922a1192a4e5ffe3fa5 MD5 | raw file
  1. ''' Utilities for fetching data from URLs.
  2. Copyright (c) 2006-2007, PreFab Software Inc.
  3. '''
  4. import datetime, gzip, re, urllib, urllib2, urlparse
  5. import cStringIO as StringIO
  6. from blogmaker.util.feedparser import _parse_date, _FeedURLHandler
# Debug level handed to urllib2.HTTPHandler in open_resource();
# set > 0 to dump HTTP traffic for troubleshooting.
debuglevel=0
# This finds meta-refresh with time < 10 seconds: matches a
# <meta http-equiv="refresh" content="N;url=..."> tag whose delay N is a
# single digit; group(1) captures the redirect target URL.
refreshRe = re.compile(r'''http-equiv=['"]refresh['"]\s+content=['"]\d;\s*url=\s*([^'"\s]+)['"]''', re.IGNORECASE)
  10. def read_html(url, etag=None, modified=None, agent=''):
  11. ''' Open and read data from a URL.
  12. Returns both the connection object and the data.
  13. Will follow some <meta> tag redirects.
  14. '''
  15. tried = set([url])
  16. f, data = read_resource(url, etag, modified, agent)
  17. while True:
  18. m = refreshRe.search(data)
  19. if not m:
  20. break
  21. redirectUrl = m.group(1)
  22. if redirectUrl == '/nojs.htm': # yuck; special case for jacklewis.net
  23. break
  24. url = urlparse.urljoin(f.url, redirectUrl)
  25. if url in tried:
  26. break
  27. tried.add(url)
  28. f, data = read_resource(url, etag, modified, agent)
  29. if not hasattr(f, 'status'):
  30. f.status = 301 # Treat a <meta> redirect as permanent
  31. return f, data
  32. def read_resource(url, etag=None, modified=None, agent='', isFeed=False):
  33. ''' Open and read data from a URL.
  34. Returns both the connection object and the data.
  35. Arguments are the same as for open_resource().
  36. '''
  37. f = open_resource(url, etag, modified, agent, isFeed)
  38. try:
  39. data = f.read()
  40. f.close()
  41. except ValueError, e:
  42. # This is a workaround for a very specific problem
  43. # Some web sites do not correctly chunk HTTP 1.1 data
  44. # urllib2 can't deal with them; it raises a ValueError
  45. # Catch the error and try urllib instead
  46. # The returned file object is not as rich as the one returned
  47. # by open_resource() but it does have info()
  48. if e.message != "invalid literal for int() with base 16: ''":
  49. raise
  50. f = urllib.urlopen(url)
  51. data = f.read()
  52. f.close()
  53. # Even though we ask for no encoding, some sites still return gzip
  54. # http://hughhewitt.townhall.com/blog is one
  55. if f.info().get('content-encoding', None) == 'gzip':
  56. data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
  57. return f, data
  58. def open_resource(url, etag=None, modified=None, agent='', isFeed=False):
  59. """URL --> stream
  60. If the etag argument is supplied, it will be used as the value of an
  61. If-None-Match request header.
  62. If the modified argument is supplied, it will be used
  63. as the value of an If-Modified-Since request header.
  64. If the agent argument is supplied, it will be used as the value of a
  65. User-Agent request header.
  66. Loosely based on feedparser._open_resource()
  67. """
  68. # try to open with urllib2 (to use optional headers)
  69. request = urllib2.Request(url)
  70. request.add_header('User-Agent', agent)
  71. if etag:
  72. request.add_header('If-None-Match', etag)
  73. if modified:
  74. request.add_header('If-Modified-Since', modified)
  75. request.add_header('Accept-encoding', '')
  76. if isFeed:
  77. request.add_header('A-IM', 'feed') # RFC 3229 support
  78. # Use _FeedURLHandler to capture redirect codes in f.status
  79. opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=debuglevel), _FeedURLHandler())
  80. opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
  81. try:
  82. f = opener.open(request)
  83. return f
  84. finally:
  85. opener.close() # JohnD