/scrapy/utils/response.py

https://github.com/yoyo2k/scrapy · Python · 84 lines · 68 code · 7 blank · 9 comment · 11 complexity · 154b40e63da15d57e09db43dccc58da7 MD5 · raw file

  1. """
  2. This module provides some useful functions for working with
  3. scrapy.http.Response objects
  4. """
  5. import os
  6. import weakref
  7. import webbrowser
  8. import tempfile
  9. from twisted.web import http
  10. from twisted.web.http import RESPONSES
  11. from w3lib import html
  12. from scrapy.xlib.BeautifulSoup import BeautifulSoup
  13. from scrapy.http import Response, HtmlResponse
  14. def body_or_str(obj, unicode=True):
  15. assert isinstance(obj, (Response, basestring)), \
  16. "obj must be Response or basestring, not %s" % type(obj).__name__
  17. if isinstance(obj, Response):
  18. return obj.body_as_unicode() if unicode else obj.body
  19. elif isinstance(obj, str):
  20. return obj.decode('utf-8') if unicode else obj
  21. else:
  22. return obj if unicode else obj.encode('utf-8')
  23. _baseurl_cache = weakref.WeakKeyDictionary()
  24. def get_base_url(response):
  25. """Return the base url of the given response, joined with the response url"""
  26. if response not in _baseurl_cache:
  27. text = response.body_as_unicode()[0:4096]
  28. _baseurl_cache[response] = html.get_base_url(text, response.url, \
  29. response.encoding)
  30. return _baseurl_cache[response]
  31. _metaref_cache = weakref.WeakKeyDictionary()
  32. def get_meta_refresh(response):
  33. """Parse the http-equiv refrsh parameter from the given response"""
  34. if response not in _metaref_cache:
  35. text = response.body_as_unicode()[0:4096]
  36. _metaref_cache[response] = html.get_meta_refresh(text, response.url, \
  37. response.encoding)
  38. return _metaref_cache[response]
  39. def response_status_message(status):
  40. """Return status code plus status text descriptive message
  41. >>> response_status_message(200)
  42. '200 OK'
  43. >>> response_status_message(404)
  44. '404 Not Found'
  45. """
  46. return '%s %s' % (status, http.responses.get(int(status)))
  47. def response_httprepr(response):
  48. """Return raw HTTP representation (as string) of the given response. This
  49. is provided only for reference, since it's not the exact stream of bytes
  50. that was received (that's not exposed by Twisted).
  51. """
  52. s = "HTTP/1.1 %d %s\r\n" % (response.status, RESPONSES.get(response.status, ''))
  53. if response.headers:
  54. s += response.headers.to_string() + "\r\n"
  55. s += "\r\n"
  56. s += response.body
  57. return s
  58. def open_in_browser(response, _openfunc=webbrowser.open):
  59. """Open the given response in a local web browser, populating the <base>
  60. tag for external links to work
  61. """
  62. # XXX: this implementation is a bit dirty and could be improved
  63. if not isinstance(response, HtmlResponse):
  64. raise TypeError("Unsupported response type: %s" % \
  65. response.__class__.__name__)
  66. body = response.body
  67. if '<base' not in body:
  68. body = body.replace('<head>', '<head><base href="%s">' % response.url)
  69. fd, fname = tempfile.mkstemp('.html')
  70. os.write(fd, body)
  71. os.close(fd)
  72. return _openfunc("file://%s" % fname)