/blogmaker/util/fetch.py
Python | 111 lines | 94 code | 8 blank | 9 comment | 3 complexity | d5903645d4a00922a1192a4e5ffe3fa5 MD5 | raw file
- ''' Utilities for fetching data from URLs.
- Copyright (c) 2006-2007, PreFab Software Inc.
- '''
- import datetime, gzip, re, urllib, urllib2, urlparse
- import cStringIO as StringIO
- from blogmaker.util.feedparser import _parse_date, _FeedURLHandler
# Debug level handed to urllib2.HTTPHandler whenever open_resource() builds
# an opener.
debuglevel = 0

# Matches a <meta> refresh whose delay is a single digit (i.e. less than
# 10 seconds) and captures the redirect target in group 1.  The http-equiv
# attribute must be followed directly (whitespace only) by the content
# attribute; matching ignores case.
refreshRe = re.compile(
    r'''http-equiv=['"]refresh['"]\s+content=['"]\d;\s*url=\s*([^'"\s]+)['"]''',
    re.I,
)
def read_html(url, etag=None, modified=None, agent=''):
    ''' Fetch a page, following short <meta> refresh redirects.

        The page is fetched with read_resource().  While the fetched data
        matches refreshRe, the captured target is resolved against the URL of
        the current response and fetched in turn, passing the same etag,
        modified and agent values.  Following stops when no refresh tag is
        found, when the target is the special-cased '/nojs.htm', or when the
        resolved URL has already been requested during this call.

        Returns a (connection object, data) pair for the last resource read.
    '''
    response, body = read_resource(url, etag, modified, agent)

    # Every URL requested so far; guards against redirect cycles.
    visited = set()
    visited.add(url)

    match = refreshRe.search(body)
    while match is not None:
        target = match.group(1)
        if target == '/nojs.htm':
            # yuck; special case for jacklewis.net
            break

        # The target may be relative, so resolve it against the current page.
        url = urlparse.urljoin(response.url, target)
        if url in visited:
            break
        visited.add(url)

        response, body = read_resource(url, etag, modified, agent)
        if not hasattr(response, 'status'):
            # Treat a <meta> redirect as permanent
            response.status = 301

        match = refreshRe.search(body)

    return response, body
-
def read_resource(url, etag=None, modified=None, agent='', isFeed=False):
    ''' Open and read data from a URL.

        Arguments are the same as for open_resource().

        Returns a (connection object, data) pair.  The connection object has
        already been closed when it is returned; it is handed back so callers
        can inspect it (this function itself uses its info()).

        Raises whatever open_resource() or the read raises, except for the
        one ValueError described below, which triggers a retry with urllib.
    '''
    f = open_resource(url, etag, modified, agent, isFeed)
    try:
        try:
            data = f.read()
        finally:
            # Always release the connection, including when read() fails.
            f.close()
    except ValueError as e:
        # This is a workaround for a very specific problem:
        # some web sites do not correctly chunk HTTP 1.1 data and urllib2
        # can't deal with them; it raises a ValueError.
        # Catch that error and try urllib instead.  The file object urllib
        # returns is not as rich as the one returned by open_resource()
        # but it does have info().
        # Note the retry sends none of etag / modified / agent.
        if str(e) != "invalid literal for int() with base 16: ''":
            raise
        f = urllib.urlopen(url)
        try:
            data = f.read()
        finally:
            f.close()

    # Even though we ask for no encoding, some sites still return gzip
    # (http://hughhewitt.townhall.com/blog is one).  Content-coding names are
    # case-insensitive and 'x-gzip' is an alias for 'gzip'.
    encoding = f.info().get('content-encoding', None) or ''
    if encoding.strip().lower() in ('gzip', 'x-gzip'):
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()

    return f, data
-
def open_resource(url, etag=None, modified=None, agent='', isFeed=False):
    """URL --> stream

    Builds a urllib2 request for url and opens it with a dedicated opener.

    url      -- address to open.
    etag     -- if supplied (truthy), sent as the If-None-Match header.
    modified -- if supplied (truthy), sent unchanged as the
                If-Modified-Since header.
    agent    -- always sent as the User-Agent header, even when empty.
    isFeed   -- if true, also send 'A-IM: feed' (RFC 3229 support).

    An empty Accept-encoding header is always sent.

    Returns the object produced by the opener's open() call.

    Loosely based on feedparser._open_resource()
    """
    # urllib2 is used so that the optional headers can be attached.
    req = urllib2.Request(url)

    outgoing = [('User-Agent', agent)]
    if etag:
        outgoing.append(('If-None-Match', etag))
    if modified:
        outgoing.append(('If-Modified-Since', modified))
    outgoing.append(('Accept-encoding', ''))
    if isFeed:
        outgoing.append(('A-IM', 'feed'))  # RFC 3229 support
    for header_name, header_value in outgoing:
        req.add_header(header_name, header_value)

    # _FeedURLHandler is used to capture redirect codes in f.status
    http_handler = urllib2.HTTPHandler(debuglevel=debuglevel)
    opener = urllib2.build_opener(http_handler, _FeedURLHandler())
    # RMK - must clear so we only send our custom User-Agent
    opener.addheaders = []
    try:
        return opener.open(req)
    finally:
        opener.close()  # JohnD