PageRenderTime 48ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/Sources/wwwclient/client.py

https://github.com/netconstructor/wwwclient
Python | 374 lines | 353 code | 3 blank | 18 comment | 2 complexity | e2325cf036969773561641efb5904dc9 MD5 | raw file
Possible License(s): LGPL-3.0
  1. #!/usr/bin/env python
  2. # Encoding: iso-8859-1
  3. # -----------------------------------------------------------------------------
  4. # Project : WWWClient
  5. # -----------------------------------------------------------------------------
  6. # Author : Sebastien Pierre <sebastien@ivy.fr>
  7. # -----------------------------------------------------------------------------
  8. # License : GNU Lesser General Public License
  9. # Credits : Xprima.com
  10. # -----------------------------------------------------------------------------
  11. # Creation : 04-Jun-2006
  12. # Last mod : 27-Sep-2006
  13. # -----------------------------------------------------------------------------
  14. import re, mimetypes, urllib, zlib
  15. __doc__ = """\
  16. This modules defines an abstract class for HTTP clients, that creates a simple,
  17. easy to understand, low-level wrapper for existing HTTP implementation. It
  18. expects to have simple datatypes as input for building the request, and expects
  19. to have the response a string.
  20. The HTTPClient class has a fast response parser that is able to update
  21. important information withing the client.
  22. HTTPClient subclasses are instanciated and bound to every session. As HTTPClient
  23. are stateful (they aggregate session state), they are not meant to be shared
  24. among different sessions.
  25. """
  26. # TODO: Find more use cases for chunked mode
  27. # TODO: Add cookie encode/decode functions
  28. FILE_ATTACHMENT = 0
  29. CONTENT_ATTACHMENT = 1
  30. RE_CONTENT_LENGTH = re.compile("^\s*Content-Length\s*:\s*([0-9]+)", re.I|re.MULTILINE)
  31. RE_CONTENT_ENCODING= re.compile("^\s*Content-Encoding\s*:(.*)\r\n", re.I|re.MULTILINE)
  32. RE_CONTENT_TYPE = re.compile("^\s*Content-Type\s*:(.*)\r\n", re.I|re.MULTILINE)
  33. RE_CHARSET = re.compile("\s*charset=([\w\d_-]+)", re.I|re.MULTILINE)
  34. RE_LOCATION = re.compile("^\s*Location\s*:(.*)\r\n", re.I|re.MULTILINE)
  35. RE_SET_COOKIE = re.compile("^\s*Set-Cookie\s*:(.*)\r\n", re.I|re.MULTILINE)
  36. RE_CHUNKED = re.compile("^\s*Transfer-Encoding\s*:\s*chunked\s*\r\n", re.I|re.MULTILINE)
  37. CRLF = "\r\n"
  38. BOUNDARY = '----------fbb6cc131b52e5a980ac702bedde498032a88158$'
  39. DEFAULT_MIMETYPE = 'text/plain'
  40. DEFAULT_ATTACH_MIMETYPE = 'application/octet-stream'
  41. # NOTE: A useful reference for understanding HTTP is the following website
  42. # <http://www.jmarshall.com/easy/http>
  43. class HTTPClient:
  44. """Abstract class for an 'HTTPClient'. As explained in the module
  45. documentation, the 'HTTPClient' is a an object-oriented interface to
  46. low-level HTTP communication infrastructure. The 'HTTPClient' is stateful,
  47. in the sense that it aggregates the status resulting from requests and
  48. responses."""
  49. def __init__( self, encoding="latin-1" ):
  50. """Creates a new HTTPClient with the given 'encoding' as default
  51. encofing ('latin-1' is the default)."""
  52. self._method = "GET"
  53. self._url = None
  54. self._host = None
  55. self._protocol = None
  56. self._status = None
  57. self._redirect = None
  58. self._newCookies = None
  59. self._responses = None
  60. self._onLog = None
  61. self._cache = None
  62. self.verbose = 0
  63. self.encoding = encoding
  64. self.retryDelay = 0.100
  65. self.retryCount = 5
  66. def _log( self, *args ):
  67. """Logs data to stdout or forwards it to self._onLog"""
  68. if self._onLog:
  69. self._onLog(*args)
  70. else:
  71. print " ".join(map(str,args))
  72. def setCache( self, cache ):
  73. """Set a cache"""
  74. self._cache = cache
  75. def method( self ):
  76. """Returns the method of the last request by this HTTP client."""
  77. return self._method
  78. def url( self ):
  79. """Returns the last URL processed by this HTTP client."""
  80. return self._url
  81. def host( self ):
  82. """Returns the current host"""
  83. return self._host
  84. def protocol( self ):
  85. """Returns the current protocol."""
  86. return self._protocol
  87. def status( self ):
  88. """Returns the last response status."""
  89. return self._status
  90. def redirect( self ):
  91. """Returns the redirection URL (if any)."""
  92. if self._redirect == None or self._redirect.find("://") != -1:
  93. return self._redirect
  94. if self._redirect[0] == "/":
  95. return "%s://%s%s" % (self.protocol(), self.host(), self._redirect)
  96. else:
  97. return "%s://%s/%s" % (self.protocol(), self.host(), self._redirect)
  98. def newCookies( self ):
  99. """Returns the cookies added by the last response."""
  100. return self._newCookies
  101. def responses( self ):
  102. """Returns the list of responses to the last request. The list is
  103. composed of triples (firstline, headers, body)."""
  104. return self._responses
  105. def data( self ):
  106. """Returns the last response data."""
  107. if not self._responses:
  108. return ""
  109. elif len(self._responses) == 1:
  110. return self._responses[0][-1]
  111. else:
  112. return "".join(r[-1] for r in self._responses)
  113. def dataSize( self ):
  114. """Returns the total size of the responses."""
  115. total = 0
  116. for r in self._responses:
  117. total += len(r)
  118. return total
  119. def info( self, level=1 ):
  120. return "%s %s (%s)" % (self.method(), self.url(), self.status())
  121. # return "\n".join((
  122. # "URL : %s" % (self.url()),
  123. # "- status : %s" % (self.status()),
  124. # "- redirect : %s" % (self.redirect()),
  125. # "- cookies(new): %s" % (self.newCookies()),
  126. # "- responses : #%s (%sbytes)" % (len(self.responses()),self.dataSize()),
  127. # ))
  128. def encode( self, fields=(), attach=() ):
  129. """Encodes the given fields and attachments (as given to POST) and
  130. returns the request body and content type for sending the encoded
  131. data. This method can be used to bypass Curl own form encoding
  132. techniques."""
  133. content = []
  134. if not fields and not attach: return "", DEFAULT_MIMETYPE
  135. if fields:
  136. for name, value in fields:
  137. content.append("--" + BOUNDARY)
  138. content.append('Content-Disposition: form-data; name="%s"' % name)
  139. content.append('')
  140. content.append(self._valueToString(value))
  141. if attach:
  142. attach = self._ensureAttachment(attach)
  143. for name, filename, atype in attach:
  144. content.append("--" + BOUNDARY)
  145. if atype == FILE_ATTACHMENT:
  146. f = file(filename, 'r')
  147. value = f.read()
  148. f.close()
  149. mime_type = mimetypes.guess_type(filename)[0] or DEFAULT_ATTACH_MIMETYPE
  150. elif atype == CONTENT_ATTACHMENT:
  151. filename, mime_type, value = filename
  152. content.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (name, filename))
  153. content.append('Content-Type: %s' % (mime_type))
  154. content.append('Content-Transfer-Encoding: binary')
  155. content.append('')
  156. content.append(self._valueToString(value))
  157. content.append('--' + BOUNDARY + '--')
  158. content.append('')
  159. body = CRLF.join(content)
  160. content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
  161. return body, content_type
  162. def GET( self, url, headers=None ):
  163. """Gets the given URL, setting the given headers (as a list of
  164. strings)."""
  165. raise Exception("GET method must be implemented by HTTPClient subclasses.")
  166. def POST( self, url, data=None, mimetype=None, fields=None, attach=None, headers=None ):
  167. """Posts the given data (as urlencoded string), or fields as list of
  168. (name, value) pairs and/or attachments as list of (name, value, type)
  169. triples. Headers attributes are the same as for the @GET
  170. method.
  171. The @attach parameter is quite special, as the value will depend on the
  172. type: if type is @FILE_ATTACHMENT, then value is simply the path to the
  173. file, but if the type is @CONTENT_ATTACHMENT, the value is expected to
  174. be a triple (filename, mimetype, value).
  175. """
  176. raise Exception("GET method must be implemented by HTTPClient subclasses.")
  177. def _ensureAttachment( self, attach ):
  178. """Ensures that the given attachment is a list of attachments. For
  179. instance if attach is a single attachment, it will be returned as
  180. `[attach]`."""
  181. if attach is None: return attach
  182. if len(attach) == 3:
  183. for a in attach:
  184. if type(a) in (tuple,list) and len(a) == 3:
  185. continue
  186. return [attach]
  187. return attach
  188. def _valueToString( self, value ):
  189. """Ensures that the given value will be an encoded string, encoded in
  190. this HTTPClient default encoding (set it with the @encoding
  191. attribute)."""
  192. if type(value) == unicode: value = value.encode(self.encoding)
  193. elif value == None: value = ""
  194. else: value = str(value)
  195. return value
  196. def _valueToPostData( self, value ):
  197. """Encodes the given value as an url-encoded string suitable for
  198. post-data. If the value is a string, it will be left as-s (only
  199. converted to the default encoding)"""
  200. if type(value) == str:
  201. return value
  202. elif type(value) == unicode:
  203. return value
  204. elif type(value) in (list,tuple):
  205. return urllib.urlencode(value)
  206. elif type(value) == dict:
  207. return urllib.urlencode(value)
  208. else:
  209. # It should be a Pair... but we cannot check it because of circular
  210. # imports
  211. return value.asURL()
  212. def _absoluteURL( self, url ):
  213. """Returns the absolute URL for the given url"""
  214. if self.host() == None or url == None or url.find("://") != -1:
  215. res = url
  216. elif url[0] == "/":
  217. res = "%s://%s%s" % (self.protocol(), self.host(), url)
  218. else:
  219. res = "%s://%s/%s" % (self.protocol(), self.host(), url)
  220. return str(res)
  221. def _parseResponse( self, message):
  222. """Parse the message, and return a list of responses and headers. This
  223. might occur when there is a provisional response in between, or when
  224. location are followed. The result is a list of (firstline, headers,
  225. body), all as unparsed stings."""
  226. res = []
  227. off = 0
  228. self._newCookies = []
  229. # FIXME: I don't get why we need to iterate here
  230. # (it's probably when you have multiple responses)
  231. while off < len(message):
  232. body = ""
  233. eol = message.find(CRLF, off)
  234. eoh = message.find(CRLF + CRLF, off)
  235. if eol == -1: break
  236. if eoh == -1: eoh = len(message)
  237. first_line = message[off:eol]
  238. headers = message[eol+2:eoh]
  239. # FIXME: This is not very efficient, we should parse all headers
  240. # into a structure, rahter than searching
  241. charset = RE_CHARSET.search(headers)
  242. is_chunked = RE_CHUNKED.search(headers)
  243. content_length = RE_CONTENT_LENGTH.search(headers)
  244. content_encoding = RE_CONTENT_ENCODING.search(headers)
  245. content_type = RE_CONTENT_TYPE.search(headers)
  246. if content_encoding:
  247. content_encoding = content_encoding.group(1)
  248. if content_type:
  249. content_type = content_type.group(1)
  250. if charset:
  251. encoding = charset.group(1)
  252. else:
  253. encoding = self.encoding
  254. # If there is a content-length specified, we use it
  255. if content_length:
  256. content_length = int(content_length.group(1))
  257. off = eoh + 4 + content_length
  258. body = self._decodeBody(message[eoh+4:off], content_encoding, encoding)
  259. # Otherwise, the transfer type may be chunks
  260. elif is_chunked:
  261. # FIXME: For the moment, chunks are supposed to be separated by
  262. # CRLF + CRLF only (this is what google.com returns)
  263. off = message.find(CRLF + CRLF, eoh + 4)
  264. if off == -1: off = len(message)
  265. body = self._decodeBody(message[eoh+4:off], content_encoding, encoding)
  266. # Otherwise the body is simply what's left after the headers
  267. else:
  268. if len(message) > eoh+4:
  269. body = self._decodeBody(message[eoh+4:], content_encoding, encoding)
  270. off = len(message)
  271. location, cookies = self._parseStatefulHeaders(headers)
  272. # WTF:
  273. self._redirect = location
  274. self._newCookies.extend(self._parseCookies(cookies))
  275. # FIXME: I don't know if it works properly, but at least it handles
  276. # responses from <http://www.contactor.se/~dast/postit.cgi> properly.
  277. if first_line and first_line.startswith("HTTP"):
  278. res.append([first_line, headers, body])
  279. # If the first line does not start with HTTP, then this may be
  280. # the rest of the body from a previous response
  281. else:
  282. assert res, "There must be a first line"
  283. res[-1][-1] = res[-1][-1] + CRLF + CRLF + first_line
  284. if headers: res[-1][-1] = res[-1][-1] + headers
  285. if body: res[-1][-1] = res[-1][-1] + body
  286. # TODO: It would be good to communicate headers and first_line back
  287. self._responses = res
  288. return res
  289. def _decodeBody( self, body, contentEncoding=None, encoding=None ):
  290. if contentEncoding:
  291. if contentEncoding.lower().strip() == "gzip":
  292. body = zlib.decompress(body)
  293. #if encoding: return body.decode(encoding)
  294. #else: return body
  295. return body
  296. else:
  297. raise Exception("Unsupported content encoding: " + contentEncoding)
  298. else:
  299. # FIXME: Should not force encoding, only if it's a string
  300. #if encoding: return body.decode(encoding)
  301. return body
  302. def _parseStatefulHeaders( self, headers ):
  303. """Return the Location and Set-Cookie headers from the given header
  304. string."""
  305. # We add an extra carriage, because some regexes will expect a carriage
  306. # return at the end
  307. headers += "\r\n"
  308. location = RE_LOCATION.search(headers)
  309. if location: location = location.group(1).strip()
  310. cookies = RE_SET_COOKIE.findall(headers)
  311. set_cookie = ";".join(cookies)
  312. return location, set_cookie
  313. def _parseCookies( self, cookies ):
  314. """Returns a pair (name, value) for the given cookies, given as text."""
  315. _cookies = {}
  316. res = []
  317. if not cookies: return res
  318. for cookie in cookies.split(";"):
  319. equal = cookie.find("=")
  320. if equal > 0:
  321. key = cookie[:equal].strip()
  322. value = cookie[equal+1:].strip()
  323. _cookies[key] = value
  324. for key, value in _cookies.items():
  325. res.append((key, value))
  326. return res
  327. def _parseHeaders( self, headers ):
  328. """Parses all headers and returns a list of (key, value) representing
  329. them."""
  330. res = []
  331. for header in headers.split("\n"):
  332. colon = header.find(":")
  333. name = header[:colon].strip()
  334. value = header[colon+1:-1]
  335. if not name: continue
  336. res.append((name,value))
  337. return res
  338. # EOF - vim: tw=80 ts=4 sw=4 noet