client.py | searchcode

/Sources/wwwclient/client.py

https://github.com/netconstructor/wwwclient
Python | 374 lines | 353 code | 3 blank | 18 comment | 2 complexity | e2325cf036969773561641efb5904dc9 MD5 | raw file
Possible License(s): LGPL-3.0

#!/usr/bin/env python
# Encoding: iso-8859-1
# -----------------------------------------------------------------------------
# Project   : WWWClient
# -----------------------------------------------------------------------------
# Author    : Sebastien Pierre                               <sebastien@ivy.fr>
# -----------------------------------------------------------------------------
# License   : GNU Lesser General Public License
# Credits   : Xprima.com
# -----------------------------------------------------------------------------
# Creation  : 04-Jun-2006
# Last mod  : 27-Sep-2006
# -----------------------------------------------------------------------------

import re, mimetypes, urllib, zlib

__doc__ = """\
This module defines an abstract class for HTTP clients, which creates a simple,
easy to understand, low-level wrapper for existing HTTP implementations. It
expects to have simple datatypes as input for building the request, and expects
to have the response as a string.

The HTTPClient class has a fast response parser that is able to update
important information within the client.

HTTPClient subclasses are instantiated and bound to every session. As HTTPClient
instances are stateful (they aggregate session state), they are not meant to be
shared among different sessions.
"""

# TODO: Find more use cases for chunked mode
# TODO: Add cookie encode/decode functions

# Attachment kinds understood by 'HTTPClient.encode': the attachment value is
# either the path to a file (FILE_ATTACHMENT), or an inline
# (filename, mimetype, value) triple (CONTENT_ATTACHMENT).
FILE_ATTACHMENT    = 0
CONTENT_ATTACHMENT = 1

# Patterns applied to the raw header text of a response. They are written as
# raw strings so that regex escapes such as `\s` are not (invalid) Python
# string escapes; `\r\n` is then interpreted by the regex engine and still
# matches a literal CRLF, so the matching behaviour is unchanged.
RE_CONTENT_LENGTH  = re.compile(r"^\s*Content-Length\s*:\s*([0-9]+)", re.I|re.MULTILINE)
RE_CONTENT_ENCODING= re.compile(r"^\s*Content-Encoding\s*:(.*)\r\n", re.I|re.MULTILINE)
RE_CONTENT_TYPE    = re.compile(r"^\s*Content-Type\s*:(.*)\r\n",   re.I|re.MULTILINE)
RE_CHARSET         = re.compile(r"\s*charset=([\w\d_-]+)",           re.I|re.MULTILINE)
RE_LOCATION        = re.compile(r"^\s*Location\s*:(.*)\r\n",          re.I|re.MULTILINE)
RE_SET_COOKIE      = re.compile(r"^\s*Set-Cookie\s*:(.*)\r\n",        re.I|re.MULTILINE)
RE_CHUNKED         = re.compile(r"^\s*Transfer-Encoding\s*:\s*chunked\s*\r\n", re.I|re.MULTILINE)
# Line separator used when splitting responses and joining multipart bodies
CRLF               = "\r\n"
# Separator placed between the parts of a multipart/form-data body
BOUNDARY           = '----------fbb6cc131b52e5a980ac702bedde498032a88158$'
# Content type returned by 'encode' when there is nothing to encode
DEFAULT_MIMETYPE   = 'text/plain'
# Content type used for file attachments whose type cannot be guessed
DEFAULT_ATTACH_MIMETYPE = 'application/octet-stream'

# NOTE: A useful reference for understanding HTTP is the following website
# <http://www.jmarshall.com/easy/http>
class HTTPClient:
	"""Abstract class for an 'HTTPClient'. As explained in the module
	documentation, the 'HTTPClient' is an object-oriented interface to
	low-level HTTP communication infrastructure. The 'HTTPClient' is stateful,
	in the sense that it aggregates the status resulting from requests and
	responses."""

	def __init__( self, encoding="latin-1" ):
		"""Creates a new HTTPClient with the given 'encoding' as default
		encofing ('latin-1' is the default)."""
		self._method     = "GET"
		self._url        = None
		self._host       = None
		self._protocol   = None
		self._status     = None
		self._redirect   = None
		self._newCookies = None
		self._responses  = None
		self._onLog      = None
		self._cache      = None
		self.verbose     = 0
		self.encoding    = encoding
		self.retryDelay  = 0.100
		self.retryCount  = 5

	def _log( self, *args ):
		"""Logs data to stdout or forwards it to self._onLog"""
		if self._onLog:
			self._onLog(*args)
		else:
			print " ".join(map(str,args))

	def setCache( self, cache ):
		"""Set a cache"""
		self._cache = cache
	
	def method( self ):
		"""Returns the method of the last request by this HTTP client."""
		return self._method

	def url( self ):
		"""Returns the last URL processed by this HTTP client."""
		return self._url
	
	def host( self ):
		"""Returns the current host"""
		return self._host
	
	def protocol( self ):
		"""Returns the current protocol."""
		return self._protocol

	def status( self ):
		"""Returns the last response status."""
		return self._status
	
	def redirect( self ):
		"""Returns the redirection URL (if any)."""
		if self._redirect == None or self._redirect.find("://") != -1:
			return self._redirect
		if self._redirect[0] == "/":
			return "%s://%s%s" % (self.protocol(), self.host(), self._redirect)
		else:
			return "%s://%s/%s" % (self.protocol(), self.host(), self._redirect)
	
	def newCookies( self ):
		"""Returns the cookies added by the last response."""
		return self._newCookies
	
	def responses( self ):
		"""Returns the list of responses to the last request. The list is
		composed of triples (firstline, headers, body)."""
		return self._responses

	def data( self ):
		"""Returns the last response data."""
		if not self._responses:
			return ""
		elif len(self._responses) == 1:
			return self._responses[0][-1]
		else:
			return "".join(r[-1] for r in self._responses)

	def dataSize( self ):
		"""Returns the total size of the responses."""
		total = 0
		for r in self._responses:
			total += len(r)
		return total

	def info( self, level=1 ):
		return "%s %s (%s)" % (self.method(), self.url(), self.status())
		# return "\n".join((
		# 	"URL           : %s" % (self.url()),
		# 	"- status      : %s" % (self.status()),
		# 	"- redirect    : %s" % (self.redirect()),
		# 	"- cookies(new): %s" % (self.newCookies()),
		# 	"- responses   : #%s (%sbytes)" % (len(self.responses()),self.dataSize()),
		# ))

	def encode( self, fields=(), attach=() ):
		"""Encodes the given fields and attachments (as given to POST) as a
		'multipart/form-data' body, and returns a '(body, content_type)' pair.
		This method can be used to bypass Curl own form encoding techniques.

		'fields' is a list of (name, value) pairs. 'attach' is a single
		(name, value, type) triple or a list of them, where the value is a
		file path when type is FILE_ATTACHMENT, or a (filename, mimetype,
		value) triple when type is CONTENT_ATTACHMENT.

		When there is neither field nor attachment, an empty body is returned
		with the default mime type. A 'ValueError' is raised when an
		attachment has an unknown type."""
		if not fields and not attach: return "", DEFAULT_MIMETYPE
		content = []
		for name, value in fields or ():
			content.append("--" + BOUNDARY)
			content.append('Content-Disposition: form-data; name="%s"' % name)
			content.append('')
			content.append(self._valueToString(value))
		if attach:
			for name, filename, atype in self._ensureAttachment(attach):
				if atype == FILE_ATTACHMENT:
					# The attachment is sent as binary, so it is read as
					# binary too, and the file is closed even if reading fails
					f = open(filename, 'rb')
					try:
						value = f.read()
					finally:
						f.close()
					mime_type = mimetypes.guess_type(filename)[0] or DEFAULT_ATTACH_MIMETYPE
				elif atype == CONTENT_ATTACHMENT:
					# Here 'filename' is actually the (filename, mimetype,
					# value) triple
					filename, mime_type, value = filename
				else:
					# Without this, 'value' and 'mime_type' would be unbound,
					# or silently reused from the previous attachment
					raise ValueError("Unsupported attachment type: %r" % (atype,))
				content.append("--" + BOUNDARY)
				content.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (name, filename))
				content.append('Content-Type: %s' % (mime_type))
				content.append('Content-Transfer-Encoding: binary')
				content.append('')
				content.append(self._valueToString(value))
		# Closing boundary, followed by an empty element so that the body ends
		# with a CRLF
		content.append('--' + BOUNDARY + '--')
		content.append('')
		body         = CRLF.join(content)
		content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
		return body, content_type

	def GET( self, url, headers=None ):
		"""Gets the given URL, setting the given headers (as a list of
		strings)."""
		raise Exception("GET method must be implemented by HTTPClient subclasses.")

	def POST( self, url, data=None, mimetype=None, fields=None, attach=None, headers=None ):
		"""Posts the given data (as urlencoded string), or fields as list of
		(name, value) pairs and/or attachments as list of (name, value, type)
		triples. Headers attributes are the same as for the @GET
		method.
		
		The @attach parameter is quite special, as the value will depend on the
		type: if type is @FILE_ATTACHMENT, then value is simply the path to the
		file, but if the type is @CONTENT_ATTACHMENT, the value is expected to
		be a triple (filename, mimetype, value).
		"""
		raise Exception("GET method must be implemented by HTTPClient subclasses.")
	
	def _ensureAttachment( self, attach ):
		"""Ensures that the given attachment is a list of attachments. For
		instance if attach is a single attachment, it will be returned as
		`[attach]`."""
		if attach is None: return attach
		if len(attach) == 3:
			for a in attach:
				if type(a) in (tuple,list) and len(a) == 3:
					continue
				return [attach]
		return attach

	def _valueToString( self, value ):
		"""Ensures that the given value will be an encoded string, encoded in
		this HTTPClient default encoding (set it with the @encoding
		attribute)."""
		if   type(value) == unicode: value = value.encode(self.encoding)
		elif value == None: value = ""
		else: value = str(value)
		return value

	def _valueToPostData( self, value ):
		"""Encodes the given value as an url-encoded string suitable for
		post-data. If the value is a string, it will be left as-s (only
		converted to the default encoding)"""
		if   type(value) == str:
			return value
		elif type(value) == unicode:
			return value
		elif type(value) in (list,tuple):
			return urllib.urlencode(value)
		elif type(value) == dict:
			return urllib.urlencode(value)
		else:
			# It should be a Pair... but we cannot check it because of circular
			# imports
			return value.asURL()

	def _absoluteURL( self, url ):
		"""Returns the absolute URL for the given url"""
		if self.host() == None or url == None or url.find("://") != -1:
			res = url
		elif url[0] == "/":
			res = "%s://%s%s" % (self.protocol(), self.host(), url)
		else:
			res = "%s://%s/%s" % (self.protocol(), self.host(), url)
		return str(res)

	def _parseResponse( self, message):
		"""Parses the raw 'message' and returns a list of responses.

		A message may hold more than one response, which might occur when
		there is a provisional response in between, or when locations are
		followed. Each element of the result is a list
		'[firstline, headers, body]', all as unparsed strings.

		As a side effect, this method resets 'self._newCookies' and extends it
		with the cookies found in each chunk parsed, sets 'self._redirect' to
		the Location found in the last chunk parsed (None if it has none), and
		stores the result in 'self._responses'."""
		res     = []
		# Offset in 'message' where the next chunk to parse starts
		off     = 0
		self._newCookies = []
		# FIXME: I don't get why we need to iterate here
		# (it's probably when you have multiple responses)
		while off < len(message):
			body = ""
			# End of the first line, and end of the headers (blank line)
			eol  = message.find(CRLF, off)
			eoh  = message.find(CRLF + CRLF, off)
			# No line separator left: what remains is ignored
			if eol == -1: break
			# No blank line: everything left is considered to be headers
			if eoh == -1: eoh = len(message)
			first_line       = message[off:eol]
			headers          = message[eol+2:eoh]
			# FIXME: This is not very efficient, we should parse all headers
			# into a structure, rahter than searching
			charset          = RE_CHARSET.search(headers)
			is_chunked       = RE_CHUNKED.search(headers)
			content_length   = RE_CONTENT_LENGTH.search(headers)
			content_encoding = RE_CONTENT_ENCODING.search(headers)
			content_type     = RE_CONTENT_TYPE.search(headers)
			# The matches are replaced by the raw header values (not stripped)
			if content_encoding:
				content_encoding = content_encoding.group(1)
			if content_type:
				content_type     = content_type.group(1)
			# The charset found in the headers takes precedence over the
			# default encoding of this client
			if charset:
				encoding   = charset.group(1)
			else:
				encoding   = self.encoding
			# If there is a content-length specified, we use it
			# (the `+ 4` skips the CRLF + CRLF that ends the headers)
			if content_length:
				content_length = int(content_length.group(1))
				off        = eoh + 4 + content_length
				body       = self._decodeBody(message[eoh+4:off], content_encoding, encoding)
			# Otherwise, the transfer type may be chunks
			elif is_chunked:
				# FIXME: For the moment, chunks are supposed to be separated by
				# CRLF + CRLF only (this is what google.com returns)
				off        = message.find(CRLF + CRLF, eoh + 4)
				if off == -1: off = len(message) 
				body       = self._decodeBody(message[eoh+4:off], content_encoding, encoding)
			# Otherwise the body is simply what's left after the headers
			else:
				if len(message) > eoh+4:
					body = self._decodeBody(message[eoh+4:], content_encoding, encoding)
				# The whole message is consumed, which ends the loop
				off = len(message)
			location, cookies = self._parseStatefulHeaders(headers)
			# NOTE: The redirect is overwritten at each iteration, so only the
			# location of the last chunk parsed is kept
			self._redirect    = location
			self._newCookies.extend(self._parseCookies(cookies))
			# FIXME: I don't know if it works properly, but at least it handles
			# responses from <http://www.contactor.se/~dast/postit.cgi> properly.
			if first_line and first_line.startswith("HTTP"):
				res.append([first_line, headers, body])
			# If the first line does not start with HTTP, then this may be
			# the rest of the body from a previous response
			else:
				# NOTE: This fails when the message itself does not start with
				# an HTTP line, as there is no previous response to append to
				assert res, "There must be a first line"
				# The chunk is appended to the body of the previous response,
				# after the CRLF + CRLF that separated them
				res[-1][-1] = res[-1][-1] + CRLF + CRLF + first_line
				if headers: res[-1][-1] = res[-1][-1] + headers
				if body: res[-1][-1] = res[-1][-1] + body 
		# TODO: It would be good to communicate headers and first_line back
		self._responses = res
		return res

	def _decodeBody( self, body, contentEncoding=None, encoding=None ):
		if contentEncoding:
			if contentEncoding.lower().strip() == "gzip":
				body = zlib.decompress(body)
				#if encoding: return body.decode(encoding)
				#else: return body
				return body
			else:
				raise Exception("Unsupported content encoding: " + contentEncoding)
		else:
			# FIXME: Should not force encoding, only if it's a string
			#if encoding: return body.decode(encoding)
			return body

	def _parseStatefulHeaders( self, headers ):
		"""Extracts the Location and the Set-Cookie headers from the given
		header string.

		Returns a '(location, cookies)' pair, where the location is the
		stripped value of the Location header (None when there is none), and
		cookies is the values of all the Set-Cookie headers joined with ';'
		(the empty string when there is none)."""
		# The regexes expect every header to end with a CRLF, which the last
		# header may be missing
		text  = headers + "\r\n"
		found = RE_LOCATION.search(text)
		if found:
			location = found.group(1).strip()
		else:
			location = None
		set_cookie = ";".join(RE_SET_COOKIE.findall(text))
		return location, set_cookie
	
	def _parseCookies( self, cookies ):
		"""Returns a pair (name, value) for the given cookies, given as text."""
		_cookies   = {}
		res        = []
		if not cookies: return res
		for cookie in cookies.split(";"):
			equal = cookie.find("=")
			if equal > 0:
				key           = cookie[:equal].strip()
				value         = cookie[equal+1:].strip()
				_cookies[key] = value
		for key, value in _cookies.items():
			res.append((key, value))
		return res

	def _parseHeaders( self, headers ):
		"""Parses all headers and returns a list of (key, value) representing
		them."""
		res = []
		for header in headers.split("\n"):
			colon = header.find(":")
			name  = header[:colon].strip()
			value = header[colon+1:-1]
			if not name: continue
			res.append((name,value))
		return res

# EOF - vim: tw=80 ts=4 sw=4 noet