PageRenderTime 26ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/scrapy/utils/gz.py

http://github.com/scrapy/scrapy
Python | 59 lines | 41 code | 5 blank | 13 comment | 8 complexity | d4afc0e2b53bbb96d40c0c58cfd2717e MD5 | raw file
Possible License(s): BSD-3-Clause
  1. from gzip import GzipFile
  2. from io import BytesIO
  3. import re
  4. import struct
  5. from scrapy.utils.decorators import deprecated
  6. # - Python>=3.5 GzipFile's read() has issues returning leftover
  7. # uncompressed data when input is corrupted
  8. # (regression or bug-fix compared to Python 3.4)
  9. # - read1(), which fetches data before raising EOFError on next call
  10. # works here but is only available from Python>=3.3
  @deprecated('GzipFile.read1')
  def read1(gzf, size=-1):
      """Deprecated shim that forwards to ``gzf.read1``.

      :param gzf: object exposing a ``read1`` method.
      :param size: passed through to ``gzf.read1`` unchanged (default ``-1``).
      :return: whatever ``gzf.read1(size)`` returns.
      """
      data = gzf.read1(size)
      return data
  def gunzip(data):
      """Gunzip the given data and return as much data as possible.

      This is resilient to CRC checksum errors: if reading fails after some
      data has already been decompressed, the data recovered so far is
      returned instead of raising.

      :param data: gzip-compressed bytes.
      :return: the decompressed bytes that could be recovered from ``data``.
      :raises IOError, EOFError, struct.error: re-raised when reading fails
          before any data could be recovered.
      """
      chunks = []
      # The context manager closes the GzipFile on every exit path.
      with GzipFile(fileobj=BytesIO(data)) as f:
          chunk = b'.'  # non-empty sentinel so the loop runs at least once
          while chunk:
              try:
                  chunk = f.read1(8196)
                  chunks.append(chunk)
              except (IOError, EOFError, struct.error):
                  # complete only if there is some data, otherwise re-raise
                  # see issue 87 about catching struct.error
                  # some pages are quite small so nothing has been collected
                  # yet and f.extrabuf contains the whole page content
                  leftover = getattr(f, 'extrabuf', None)
                  if not chunks and not leftover:
                      raise
                  if leftover:
                      # extrabuf/extrasize are optional attributes, so they are
                      # looked up defensively rather than by suppressing every
                      # exception raised while reading them.
                      chunks.append(leftover[-getattr(f, 'extrasize', 0):])
                  break
      return b''.join(chunks)
  # Content-Type matchers: each name is the bound ``search`` method of a
  # case-insensitive bytes pattern anchored at the start of the header value.
  _is_gzipped = re.compile(
      br'^application/(x-)?gzip\b', flags=re.IGNORECASE
  ).search
  _is_octetstream = re.compile(
      br'^(application|binary)/octet-stream\b', flags=re.IGNORECASE
  ).search
  @deprecated
  def is_gzipped(response):
      """Return a truthy value if the response is gzipped, a falsy one otherwise.

      A response counts as gzipped when its Content-Type is a gzip type, or
      when its Content-Type is an octet-stream type and its Content-Encoding
      is ``gzip`` or ``x-gzip``.

      :param response: object whose ``headers`` mapping is queried with ``get``.
      :return: the Content-Type match object when it is a gzip type; otherwise
          the result of the octet-stream check (``None``, ``True`` or ``False``).
      """
      content_type = response.headers.get('Content-Type', b'')
      content_encoding = response.headers.get('Content-Encoding', b'').lower()
      gzip_type = _is_gzipped(content_type)
      if gzip_type:
          return gzip_type
      # ``and`` binds tighter than ``or``, so the encoding check only applies
      # to the octet-stream branch.
      return _is_octetstream(content_type) and content_encoding in (b'gzip', b'x-gzip')
  def gzip_magic_number(response):
      """Tell whether the response body starts with the bytes ``1f 8b 08``.

      :param response: object exposing a sliceable ``body`` attribute.
      :return: result of comparing the first three body bytes to the signature.
      """
      signature = b'\x1f\x8b\x08'
      head = response.body[:3]
      return head == signature