PageRenderTime 36ms CodeModel.GetById 11ms app.highlight 19ms RepoModel.GetById 1ms app.codeStats 0ms

/blogmaker/util/fetch.py

http://blogmaker.googlecode.com/
Python | 111 lines | 94 code | 8 blank | 9 comment | 2 complexity | d5903645d4a00922a1192a4e5ffe3fa5 MD5 | raw file
  1''' Utilities for fetching data from URLs.
  2    Copyright (c) 2006-2007, PreFab Software Inc.
  3'''
  4
  5import datetime, gzip, re, urllib, urllib2, urlparse
  6import cStringIO as StringIO
  7
  8from blogmaker.util.feedparser import _parse_date, _FeedURLHandler
  9
# Debug level handed to urllib2.HTTPHandler in open_resource(); 0 = no HTTP debug output
debuglevel=0

# This finds meta-refresh with time < 10 seconds
# (the single \d deliberately matches only one digit, so refresh delays of
# 10+ seconds are NOT treated as redirects by read_html)
refreshRe = re.compile(r'''http-equiv=['"]refresh['"]\s+content=['"]\d;\s*url=\s*([^'"\s]+)['"]''', re.IGNORECASE)
 14
 15def read_html(url, etag=None, modified=None, agent=''):
 16    ''' Open and read data from a URL.
 17        Returns both the connection object and the data.
 18        Will follow some <meta> tag redirects.
 19    '''
 20    tried = set([url])
 21    f, data = read_resource(url, etag, modified, agent)
 22    while True:
 23        m = refreshRe.search(data)
 24        if not m:
 25            break
 26            
 27        redirectUrl = m.group(1)
 28        if redirectUrl == '/nojs.htm':  # yuck; special case for jacklewis.net
 29            break
 30            
 31        url = urlparse.urljoin(f.url, redirectUrl)
 32        if url in tried:
 33            break
 34        tried.add(url)
 35        
 36        f, data = read_resource(url, etag, modified, agent)
 37        if not hasattr(f, 'status'):
 38            f.status = 301  # Treat a <meta> redirect as permanent
 39
 40    return f, data
 41    
 42
 43def read_resource(url, etag=None, modified=None, agent='', isFeed=False):
 44    ''' Open and read data from a URL.
 45        Returns both the connection object and the data.
 46        Arguments are the same as for open_resource().
 47    '''
 48    f = open_resource(url, etag, modified, agent, isFeed)
 49    try:
 50        data = f.read()
 51        f.close()
 52    except ValueError, e:
 53        # This is a workaround for a very specific problem
 54        # Some web sites do not correctly chunk HTTP 1.1 data
 55        # urllib2 can't deal with them; it raises a ValueError
 56        # Catch the error and try urllib instead
 57        # The returned file object is not as rich as the one returned
 58        # by open_resource() but it does have info()
 59        if e.message != "invalid literal for int() with base 16: ''":
 60            raise
 61        f = urllib.urlopen(url)
 62        data = f.read()
 63        f.close()
 64    
 65    # Even though we ask for no encoding, some sites still return gzip
 66    # http://hughhewitt.townhall.com/blog is one
 67    if f.info().get('content-encoding', None) == 'gzip':
 68        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
 69    
 70    return f, data
 71        
 72
 73def open_resource(url, etag=None, modified=None, agent='', isFeed=False):
 74    """URL --> stream
 75
 76    If the etag argument is supplied, it will be used as the value of an
 77    If-None-Match request header.
 78
 79    If the modified argument is supplied, it will be used
 80    as the value of an If-Modified-Since request header.
 81
 82    If the agent argument is supplied, it will be used as the value of a
 83    User-Agent request header.
 84
 85    Loosely based on feedparser._open_resource()
 86    """
 87
 88    # try to open with urllib2 (to use optional headers)
 89    request = urllib2.Request(url)
 90    request.add_header('User-Agent', agent)
 91    if etag:
 92        request.add_header('If-None-Match', etag)
 93
 94    if modified:
 95        request.add_header('If-Modified-Since', modified)
 96    
 97    request.add_header('Accept-encoding', '')
 98
 99    if isFeed:
100        request.add_header('A-IM', 'feed') # RFC 3229 support
101
102    # Use _FeedURLHandler to capture redirect codes in f.status
103    opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=debuglevel), _FeedURLHandler())
104
105    opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
106    try:
107        f = opener.open(request)
108        return f
109    finally:
110        opener.close() # JohnD
111