PageRenderTime 55ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/samples-and-tests/i-am-a-developer/mechanize/_opener.py

http://github.com/playframework/play
Python | 436 lines | 363 code | 31 blank | 42 comment | 41 complexity | b40a142d9ec0696b92fd3545b30cee57 MD5 | raw file
Possible License(s): Apache-2.0
  1. """Integration with Python standard library module urllib2: OpenerDirector
  2. class.
  3. Copyright 2004-2006 John J Lee <jjl@pobox.com>
  4. This code is free software; you can redistribute it and/or modify it
  5. under the terms of the BSD or ZPL 2.1 licenses (see the file
  6. COPYING.txt included with the distribution).
  7. """
  8. import os, urllib2, bisect, httplib, types, tempfile
  9. try:
  10. import threading as _threading
  11. except ImportError:
  12. import dummy_threading as _threading
  13. try:
  14. set
  15. except NameError:
  16. import sets
  17. set = sets.Set
  18. import _file
  19. import _http
  20. from _request import Request
  21. import _response
  22. import _rfc3986
  23. import _sockettimeout
  24. import _upgrade
  25. from _util import isstringlike
  26. class ContentTooShortError(urllib2.URLError):
  27. def __init__(self, reason, result):
  28. urllib2.URLError.__init__(self, reason)
  29. self.result = result
  30. def set_request_attr(req, name, value, default):
  31. try:
  32. getattr(req, name)
  33. except AttributeError:
  34. setattr(req, name, default)
  35. if value is not default:
  36. setattr(req, name, value)
class OpenerDirector(urllib2.OpenerDirector):
    """urllib2.OpenerDirector extended with handler indexing, pre/post
    "processor" handlers, and a .retrieve() method.

    Handlers whose methods are named ``<scheme>_request`` /
    ``<scheme>_response`` (or ``any_request`` / ``any_response``) are run
    before/after every .open() call; the usual ``<scheme>_open`` and
    ``<scheme>_error_<code>`` handler methods are indexed lazily by
    _maybe_reindex_handlers().
    """

    def __init__(self):
        urllib2.OpenerDirector.__init__(self)
        # really none of these are (sanely) public -- the lack of initial
        # underscore on some is just due to following urllib2
        self.process_response = {}
        self.process_request = {}
        # these two start as dicts but are rebuilt as sets of handlers
        # by _maybe_reindex_handlers()
        self._any_request = {}
        self._any_response = {}
        self._handler_index_valid = True
        # temp file names created by .retrieve(); unlinked in .close()
        self._tempfiles = []

    def add_handler(self, handler):
        # Adding the same handler twice is a no-op.
        if handler in self.handlers:
            return
        # XXX why does self.handlers need to be sorted?
        bisect.insort(self.handlers, handler)
        handler.add_parent(self)
        # force a rebuild of the method-name indexes on next .open()
        self._handler_index_valid = False

    def _maybe_reindex_handlers(self):
        """Rebuild the scheme/condition handler indexes if stale.

        Scans dir(handler) for every registered handler and buckets each
        method name of the form ``<scheme>_<condition>[_<kind>]`` into the
        appropriate lookup table.  Handlers contributing no recognised
        method are removed from self.handlers.
        """
        if self._handler_index_valid:
            return

        handle_error = {}
        handle_open = {}
        process_request = {}
        process_response = {}
        any_request = set()
        any_response = set()
        unwanted = []

        for handler in self.handlers:
            added = False
            for meth in dir(handler):
                if meth in ["redirect_request", "do_open", "proxy_open"]:
                    # oops, coincidental match: these contain "_request" /
                    # "_open" but are not scheme-dispatch methods
                    continue

                if meth == "any_request":
                    any_request.add(handler)
                    added = True
                    continue
                elif meth == "any_response":
                    any_response.add(handler)
                    added = True
                    continue

                # split "<scheme>_<condition...>" at the first underscore
                ii = meth.find("_")
                scheme = meth[:ii]
                condition = meth[ii+1:]

                if condition.startswith("error"):
                    # "<scheme>_error_<kind>": kind is an HTTP status code
                    # (int) or a symbolic name such as "default"
                    jj = meth[ii+1:].find("_") + ii + 1
                    kind = meth[jj+1:]
                    try:
                        kind = int(kind)
                    except ValueError:
                        pass
                    lookup = handle_error.setdefault(scheme, {})
                elif condition == "open":
                    kind = scheme
                    lookup = handle_open
                elif condition == "request":
                    kind = scheme
                    lookup = process_request
                elif condition == "response":
                    kind = scheme
                    lookup = process_response
                else:
                    continue

                lookup.setdefault(kind, set()).add(handler)
                added = True

            if not added:
                unwanted.append(handler)

        # drop handlers that contributed nothing
        for handler in unwanted:
            self.handlers.remove(handler)

        # sort indexed methods
        # XXX could be cleaned up
        # NOTE: process_request/process_response are left as sets here;
        # .open() converts them to sorted lists on each call.
        for lookup in [process_request, process_response]:
            for scheme, handlers in lookup.iteritems():
                lookup[scheme] = handlers
        for scheme, lookup in handle_error.iteritems():
            for code, handlers in lookup.iteritems():
                handlers = list(handlers)
                handlers.sort()
                lookup[code] = handlers
        for scheme, handlers in handle_open.iteritems():
            handlers = list(handlers)
            handlers.sort()
            handle_open[scheme] = handlers

        # cache the indexes
        self.handle_error = handle_error
        self.handle_open = handle_open
        self.process_request = process_request
        self.process_response = process_response
        self._any_request = any_request
        self._any_response = any_response

    def _request(self, url_or_req, data, visit,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Coerce a URL string or Request object into a Request carrying
        the mechanize-specific visit/timeout attributes."""
        if isstringlike(url_or_req):
            req = Request(url_or_req, data, visit=visit, timeout=timeout)
        else:
            # already a urllib2.Request or mechanize.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
            # XXX yuck
            set_request_attr(req, "visit", visit, None)
            set_request_attr(req, "timeout", timeout,
                             _sockettimeout._GLOBAL_DEFAULT_TIMEOUT)
        return req

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Open fullurl, applying request processors before and response
        processors after the actual protocol open."""
        req = self._request(fullurl, data, None, timeout)
        req_scheme = req.get_type()

        self._maybe_reindex_handlers()

        # pre-process request
        # XXX should we allow a Processor to change the URL scheme
        #   of the request?
        request_processors = set(self.process_request.get(req_scheme, []))
        request_processors.update(self._any_request)
        request_processors = list(request_processors)
        request_processors.sort()
        for processor in request_processors:
            for meth_name in ["any_request", req_scheme+"_request"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    # each processor may replace the request object
                    req = meth(req)

        # In Python >= 2.4, .open() supports processors already, so we must
        # call ._open() instead.
        urlopen = getattr(urllib2.OpenerDirector, "_open",
                          urllib2.OpenerDirector.open)
        response = urlopen(self, req, data)

        # post-process response
        response_processors = set(self.process_response.get(req_scheme, []))
        response_processors.update(self._any_response)
        response_processors = list(response_processors)
        response_processors.sort()
        for processor in response_processors:
            for meth_name in ["any_response", req_scheme+"_response"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    # each processor may replace the response object
                    response = meth(req, response)

        return response

    def error(self, proto, *args):
        """Dispatch an error to the indexed ``<scheme>_error_<code>``
        handler chain, falling back to http_error_default for HTTP(S)."""
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = apply(self._call_chain, args)
        if result:
            return result

        if http_err:
            # no handler claimed the specific code: try the default chain
            args = (dict, 'default', 'http_error_default') + orig_args
            return apply(self._call_chain, args)

    # chunk size used by .retrieve() when copying the response body
    BLOCK_SIZE = 1024*8

    def retrieve(self, fullurl, filename=None, reporthook=None, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Returns (filename, headers).

        For remote objects, the default filename will refer to a temporary
        file.  Temporary files are removed when the OpenerDirector.close()
        method is called.

        For file: URLs, at present the returned filename is None.  This may
        change in future.

        If the actual number of bytes read is less than indicated by the
        Content-Length header, raises ContentTooShortError (a URLError
        subclass).  The exception's .result attribute contains the (filename,
        headers) that would have been returned.
        """
        req = self._request(fullurl, data, False, timeout)
        scheme = req.get_type()
        fp = self.open(req)
        headers = fp.info()
        if filename is None and scheme == 'file':
            # XXX req.get_selector() seems broken here, return None,
            #   pending sanity :-/
            return None, headers
            #return urllib.url2pathname(req.get_selector()), headers
        if filename:
            tfp = open(filename, 'wb')
        else:
            # derive the temp-file suffix from the URL path's extension
            path = _rfc3986.urlsplit(req.get_full_url())[2]
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self._tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        bs = self.BLOCK_SIZE
        size = -1
        read = 0
        blocknum = 0
        if reporthook:
            # NOTE(review): lookup mixes "content-length" / "Content-Length"
            # -- presumably headers is a case-insensitive message object;
            # verify against fp.info()'s type
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
        while 1:
            block = fp.read(bs)
            if block == "":
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: "
                "got only %i out of %i bytes" % (read, size),
                result
                )

        return result

    def close(self):
        urllib2.OpenerDirector.close(self)

        # make it very obvious this object is no longer supposed to be used
        self.open = self.error = self.retrieve = self.add_handler = None

        # remove temp files created by .retrieve(); missing files are ignored
        if self._tempfiles:
            for filename in self._tempfiles:
                try:
                    os.unlink(filename)
                except OSError:
                    pass
            del self._tempfiles[:]
  266. def wrapped_open(urlopen, process_response_object, fullurl, data=None,
  267. timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
  268. success = True
  269. try:
  270. response = urlopen(fullurl, data, timeout)
  271. except urllib2.HTTPError, error:
  272. success = False
  273. if error.fp is None: # not a response
  274. raise
  275. response = error
  276. if response is not None:
  277. response = process_response_object(response)
  278. if not success:
  279. raise response
  280. return response
  281. class ResponseProcessingOpener(OpenerDirector):
  282. def open(self, fullurl, data=None,
  283. timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
  284. def bound_open(fullurl, data=None,
  285. timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
  286. return OpenerDirector.open(self, fullurl, data, timeout)
  287. return wrapped_open(
  288. bound_open, self.process_response_object, fullurl, data, timeout)
  289. def process_response_object(self, response):
  290. return response
class SeekableResponseOpener(ResponseProcessingOpener):
    """ResponseProcessingOpener that wraps every response so it supports
    .seek() (see _response.seek_wrapped_response)."""
    def process_response_object(self, response):
        return _response.seek_wrapped_response(response)
  294. class OpenerFactory:
  295. """This class's interface is quite likely to change."""
  296. default_classes = [
  297. # handlers
  298. urllib2.ProxyHandler,
  299. urllib2.UnknownHandler,
  300. _http.HTTPHandler, # derived from new AbstractHTTPHandler
  301. _http.HTTPDefaultErrorHandler,
  302. _http.HTTPRedirectHandler, # bugfixed
  303. urllib2.FTPHandler,
  304. _file.FileHandler,
  305. # processors
  306. _upgrade.HTTPRequestUpgradeProcessor,
  307. _http.HTTPCookieProcessor,
  308. _http.HTTPErrorProcessor,
  309. ]
  310. if hasattr(httplib, 'HTTPS'):
  311. default_classes.append(_http.HTTPSHandler)
  312. handlers = []
  313. replacement_handlers = []
  314. def __init__(self, klass=OpenerDirector):
  315. self.klass = klass
  316. def build_opener(self, *handlers):
  317. """Create an opener object from a list of handlers and processors.
  318. The opener will use several default handlers and processors, including
  319. support for HTTP and FTP.
  320. If any of the handlers passed as arguments are subclasses of the
  321. default handlers, the default handlers will not be used.
  322. """
  323. opener = self.klass()
  324. default_classes = list(self.default_classes)
  325. skip = []
  326. for klass in default_classes:
  327. for check in handlers:
  328. if type(check) == types.ClassType:
  329. if issubclass(check, klass):
  330. skip.append(klass)
  331. elif type(check) == types.InstanceType:
  332. if isinstance(check, klass):
  333. skip.append(klass)
  334. for klass in skip:
  335. default_classes.remove(klass)
  336. for klass in default_classes:
  337. opener.add_handler(klass())
  338. for h in handlers:
  339. if type(h) == types.ClassType:
  340. h = h()
  341. opener.add_handler(h)
  342. return opener
# Convenience function bound to a default OpenerFactory instance.
build_opener = OpenerFactory().build_opener

# Shared module-level opener used by urlopen()/urlretrieve(); created
# lazily under urlopen_lock, replaceable via install_opener().
_opener = None
urlopen_lock = _threading.Lock()
  346. def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
  347. global _opener
  348. if _opener is None:
  349. urlopen_lock.acquire()
  350. try:
  351. if _opener is None:
  352. _opener = build_opener()
  353. finally:
  354. urlopen_lock.release()
  355. return _opener.open(url, data, timeout)
  356. def urlretrieve(url, filename=None, reporthook=None, data=None,
  357. timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
  358. global _opener
  359. if _opener is None:
  360. urlopen_lock.acquire()
  361. try:
  362. if _opener is None:
  363. _opener = build_opener()
  364. finally:
  365. urlopen_lock.release()
  366. return _opener.retrieve(url, filename, reporthook, data, timeout)
def install_opener(opener):
    """Replace the shared module-level opener used by urlopen() and
    urlretrieve()."""
    global _opener
    _opener = opener