PageRenderTime 57ms CodeModel.GetById 15ms app.highlight 36ms RepoModel.GetById 1ms app.codeStats 0ms

/Lib/gzip.py

http://unladen-swallow.googlecode.com/
Python | 484 lines | 479 code | 1 blank | 4 comment | 11 complexity | 42aa5eaa9e4daba0608bb55041924d91 MD5 | raw file
  1"""Functions that read and write gzipped files.
  2
  3The user of the file doesn't have to worry about the compression,
  4but random access is not allowed."""
  5
  6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
  7
  8import struct, sys, time
  9import zlib
 10import __builtin__
 11
# Public names exported by "from gzip import *".
__all__ = ["GzipFile","open"]

# Bit masks for the flag byte of a gzip member header.  FNAME is set by
# GzipFile._write_gzip_header(); FEXTRA, FNAME, FCOMMENT and FHCRC are
# tested by GzipFile._read_gzip_header().  FTEXT is not used in this file.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object was opened
# for reading or for writing.
READ, WRITE = 1, 2
 17
 18def write32u(output, value):
 19    # The L format writes the bit pattern correctly whether signed
 20    # or unsigned.
 21    output.write(struct.pack("<L", value))
 22
 23def read32(input):
 24    return struct.unpack("<I", input.read(4))[0]
 25
 26def open(filename, mode="rb", compresslevel=9):
 27    """Shorthand for GzipFile(filename, mode, compresslevel).
 28
 29    The filename argument is required; mode defaults to 'rb'
 30    and compresslevel defaults to 9.
 31
 32    """
 33    return GzipFile(filename, mode, compresslevel)
 34
 35class GzipFile:
 36    """The GzipFile class simulates most of the methods of a file object with
 37    the exception of the readinto() and truncate() methods.
 38
 39    """
 40
 41    myfileobj = None
 42    max_read_chunk = 10 * 1024 * 1024   # 10Mb
 43
 44    def __init__(self, filename=None, mode=None,
 45                 compresslevel=9, fileobj=None):
 46        """Constructor for the GzipFile class.
 47
 48        At least one of fileobj and filename must be given a
 49        non-trivial value.
 50
 51        The new class instance is based on fileobj, which can be a regular
 52        file, a StringIO object, or any other object which simulates a file.
 53        It defaults to None, in which case filename is opened to provide
 54        a file object.
 55
 56        When fileobj is not None, the filename argument is only used to be
 57        included in the gzip file header, which may includes the original
 58        filename of the uncompressed file.  It defaults to the filename of
 59        fileobj, if discernible; otherwise, it defaults to the empty string,
 60        and in this case the original filename is not included in the header.
 61
 62        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
 63        depending on whether the file will be read or written.  The default
 64        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
 65        Be aware that only the 'rb', 'ab', and 'wb' values should be used
 66        for cross-platform portability.
 67
 68        The compresslevel argument is an integer from 1 to 9 controlling the
 69        level of compression; 1 is fastest and produces the least compression,
 70        and 9 is slowest and produces the most compression.  The default is 9.
 71
 72        """
 73
 74        # guarantee the file is opened in binary mode on platforms
 75        # that care about that sort of thing
 76        if mode and 'b' not in mode:
 77            mode += 'b'
 78        if fileobj is None:
 79            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
 80        if filename is None:
 81            if hasattr(fileobj, 'name'): filename = fileobj.name
 82            else: filename = ''
 83        if mode is None:
 84            if hasattr(fileobj, 'mode'): mode = fileobj.mode
 85            else: mode = 'rb'
 86
 87        if mode[0:1] == 'r':
 88            self.mode = READ
 89            # Set flag indicating start of a new member
 90            self._new_member = True
 91            self.extrabuf = ""
 92            self.extrasize = 0
 93            self.name = filename
 94            # Starts small, scales exponentially
 95            self.min_readsize = 100
 96
 97        elif mode[0:1] == 'w' or mode[0:1] == 'a':
 98            self.mode = WRITE
 99            self._init_write(filename)
100            self.compress = zlib.compressobj(compresslevel,
101                                             zlib.DEFLATED,
102                                             -zlib.MAX_WBITS,
103                                             zlib.DEF_MEM_LEVEL,
104                                             0)
105        else:
106            raise IOError, "Mode " + mode + " not supported"
107
108        self.fileobj = fileobj
109        self.offset = 0
110
111        if self.mode == WRITE:
112            self._write_gzip_header()
113
114    @property
115    def filename(self):
116        import warnings
117        warnings.warn("use the name attribute", DeprecationWarning, 2)
118        if self.mode == WRITE and self.name[-3:] != ".gz":
119            return self.name + ".gz"
120        return self.name
121
122    def __repr__(self):
123        s = repr(self.fileobj)
124        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
125
126    def _init_write(self, filename):
127        self.name = filename
128        self.crc = zlib.crc32("") & 0xffffffffL
129        self.size = 0
130        self.writebuf = []
131        self.bufsize = 0
132
133    def _write_gzip_header(self):
134        self.fileobj.write('\037\213')             # magic header
135        self.fileobj.write('\010')                 # compression method
136        fname = self.name
137        if fname.endswith(".gz"):
138            fname = fname[:-3]
139        flags = 0
140        if fname:
141            flags = FNAME
142        self.fileobj.write(chr(flags))
143        write32u(self.fileobj, long(time.time()))
144        self.fileobj.write('\002')
145        self.fileobj.write('\377')
146        if fname:
147            self.fileobj.write(fname + '\000')
148
149    def _init_read(self):
150        self.crc = zlib.crc32("") & 0xffffffffL
151        self.size = 0
152
153    def _read_gzip_header(self):
154        magic = self.fileobj.read(2)
155        if magic != '\037\213':
156            raise IOError, 'Not a gzipped file'
157        method = ord( self.fileobj.read(1) )
158        if method != 8:
159            raise IOError, 'Unknown compression method'
160        flag = ord( self.fileobj.read(1) )
161        # modtime = self.fileobj.read(4)
162        # extraflag = self.fileobj.read(1)
163        # os = self.fileobj.read(1)
164        self.fileobj.read(6)
165
166        if flag & FEXTRA:
167            # Read & discard the extra field, if present
168            xlen = ord(self.fileobj.read(1))
169            xlen = xlen + 256*ord(self.fileobj.read(1))
170            self.fileobj.read(xlen)
171        if flag & FNAME:
172            # Read and discard a null-terminated string containing the filename
173            while True:
174                s = self.fileobj.read(1)
175                if not s or s=='\000':
176                    break
177        if flag & FCOMMENT:
178            # Read and discard a null-terminated string containing a comment
179            while True:
180                s = self.fileobj.read(1)
181                if not s or s=='\000':
182                    break
183        if flag & FHCRC:
184            self.fileobj.read(2)     # Read & discard the 16-bit header CRC
185
186
187    def write(self,data):
188        if self.mode != WRITE:
189            import errno
190            raise IOError(errno.EBADF, "write() on read-only GzipFile object")
191
192        if self.fileobj is None:
193            raise ValueError, "write() on closed GzipFile object"
194        if len(data) > 0:
195            self.size = self.size + len(data)
196            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
197            self.fileobj.write( self.compress.compress(data) )
198            self.offset += len(data)
199
200    def read(self, size=-1):
201        if self.mode != READ:
202            import errno
203            raise IOError(errno.EBADF, "read() on write-only GzipFile object")
204
205        if self.extrasize <= 0 and self.fileobj is None:
206            return ''
207
208        readsize = 1024
209        if size < 0:        # get the whole thing
210            try:
211                while True:
212                    self._read(readsize)
213                    readsize = min(self.max_read_chunk, readsize * 2)
214            except EOFError:
215                size = self.extrasize
216        else:               # just get some more of it
217            try:
218                while size > self.extrasize:
219                    self._read(readsize)
220                    readsize = min(self.max_read_chunk, readsize * 2)
221            except EOFError:
222                if size > self.extrasize:
223                    size = self.extrasize
224
225        chunk = self.extrabuf[:size]
226        self.extrabuf = self.extrabuf[size:]
227        self.extrasize = self.extrasize - size
228
229        self.offset += size
230        return chunk
231
232    def _unread(self, buf):
233        self.extrabuf = buf + self.extrabuf
234        self.extrasize = len(buf) + self.extrasize
235        self.offset -= len(buf)
236
237    def _read(self, size=1024):
238        if self.fileobj is None:
239            raise EOFError, "Reached EOF"
240
241        if self._new_member:
242            # If the _new_member flag is set, we have to
243            # jump to the next member, if there is one.
244            #
245            # First, check if we're at the end of the file;
246            # if so, it's time to stop; no more members to read.
247            pos = self.fileobj.tell()   # Save current position
248            self.fileobj.seek(0, 2)     # Seek to end of file
249            if pos == self.fileobj.tell():
250                raise EOFError, "Reached EOF"
251            else:
252                self.fileobj.seek( pos ) # Return to original position
253
254            self._init_read()
255            self._read_gzip_header()
256            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
257            self._new_member = False
258
259        # Read a chunk of data from the file
260        buf = self.fileobj.read(size)
261
262        # If the EOF has been reached, flush the decompression object
263        # and mark this object as finished.
264
265        if buf == "":
266            uncompress = self.decompress.flush()
267            self._read_eof()
268            self._add_read_data( uncompress )
269            raise EOFError, 'Reached EOF'
270
271        uncompress = self.decompress.decompress(buf)
272        self._add_read_data( uncompress )
273
274        if self.decompress.unused_data != "":
275            # Ending case: we've come to the end of a member in the file,
276            # so seek back to the start of the unused data, finish up
277            # this member, and read a new gzip header.
278            # (The number of bytes to seek back is the length of the unused
279            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
280            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
281
282            # Check the CRC and file size, and set the flag so we read
283            # a new member on the next call
284            self._read_eof()
285            self._new_member = True
286
287    def _add_read_data(self, data):
288        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
289        self.extrabuf = self.extrabuf + data
290        self.extrasize = self.extrasize + len(data)
291        self.size = self.size + len(data)
292
293    def _read_eof(self):
294        # We've read to the end of the file, so we have to rewind in order
295        # to reread the 8 bytes containing the CRC and the file size.
296        # We check the that the computed CRC and size of the
297        # uncompressed data matches the stored values.  Note that the size
298        # stored is the true file size mod 2**32.
299        self.fileobj.seek(-8, 1)
300        crc32 = read32(self.fileobj)
301        isize = read32(self.fileobj)  # may exceed 2GB
302        if crc32 != self.crc:
303            raise IOError("CRC check failed %s != %s" % (hex(crc32),
304                                                         hex(self.crc)))
305        elif isize != (self.size & 0xffffffffL):
306            raise IOError, "Incorrect length of data produced"
307
308    def close(self):
309        if self.fileobj is None:
310            return
311        if self.mode == WRITE:
312            self.fileobj.write(self.compress.flush())
313            write32u(self.fileobj, self.crc)
314            # self.size may exceed 2GB, or even 4GB
315            write32u(self.fileobj, self.size & 0xffffffffL)
316            self.fileobj = None
317        elif self.mode == READ:
318            self.fileobj = None
319        if self.myfileobj:
320            self.myfileobj.close()
321            self.myfileobj = None
322
323    def __del__(self):
324        try:
325            if (self.myfileobj is None and
326                self.fileobj is None):
327                return
328        except AttributeError:
329            return
330        self.close()
331
332    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
333        if self.mode == WRITE:
334            # Ensure the compressor's buffer is flushed
335            self.fileobj.write(self.compress.flush(zlib_mode))
336        self.fileobj.flush()
337
338    def fileno(self):
339        """Invoke the underlying file object's fileno() method.
340
341        This will raise AttributeError if the underlying file object
342        doesn't support fileno().
343        """
344        return self.fileobj.fileno()
345
346    def isatty(self):
347        return False
348
349    def tell(self):
350        return self.offset
351
352    def rewind(self):
353        '''Return the uncompressed stream file position indicator to the
354        beginning of the file'''
355        if self.mode != READ:
356            raise IOError("Can't rewind in write mode")
357        self.fileobj.seek(0)
358        self._new_member = True
359        self.extrabuf = ""
360        self.extrasize = 0
361        self.offset = 0
362
363    def seek(self, offset, whence=0):
364        if whence:
365            if whence == 1:
366                offset = self.offset + offset
367            else:
368                raise ValueError('Seek from end not supported')
369        if self.mode == WRITE:
370            if offset < self.offset:
371                raise IOError('Negative seek in write mode')
372            count = offset - self.offset
373            for i in range(count // 1024):
374                self.write(1024 * '\0')
375            self.write((count % 1024) * '\0')
376        elif self.mode == READ:
377            if offset < self.offset:
378                # for negative seek, rewind and do positive seek
379                self.rewind()
380            count = offset - self.offset
381            for i in range(count // 1024):
382                self.read(1024)
383            self.read(count % 1024)
384
385    def readline(self, size=-1):
386        if size < 0:
387            size = sys.maxint
388            readsize = self.min_readsize
389        else:
390            readsize = size
391        bufs = []
392        while size != 0:
393            c = self.read(readsize)
394            i = c.find('\n')
395
396            # We set i=size to break out of the loop under two
397            # conditions: 1) there's no newline, and the chunk is
398            # larger than size, or 2) there is a newline, but the
399            # resulting line would be longer than 'size'.
400            if (size <= i) or (i == -1 and len(c) > size):
401                i = size - 1
402
403            if i >= 0 or c == '':
404                bufs.append(c[:i + 1])    # Add portion of last chunk
405                self._unread(c[i + 1:])   # Push back rest of chunk
406                break
407
408            # Append chunk to list, decrease 'size',
409            bufs.append(c)
410            size = size - len(c)
411            readsize = min(size, readsize * 2)
412        if readsize > self.min_readsize:
413            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
414        return ''.join(bufs) # Return resulting line
415
416    def readlines(self, sizehint=0):
417        # Negative numbers result in reading all the lines
418        if sizehint <= 0:
419            sizehint = sys.maxint
420        L = []
421        while sizehint > 0:
422            line = self.readline()
423            if line == "":
424                break
425            L.append(line)
426            sizehint = sizehint - len(line)
427
428        return L
429
430    def writelines(self, L):
431        for line in L:
432            self.write(line)
433
434    def __iter__(self):
435        return self
436
437    def next(self):
438        line = self.readline()
439        if line:
440            return line
441        else:
442            raise StopIteration
443
444
445def _test():
446    # Act like gzip; with -d, act like gunzip.
447    # The input file is not deleted, however, nor are any other gzip
448    # options or features supported.
449    args = sys.argv[1:]
450    decompress = args and args[0] == "-d"
451    if decompress:
452        args = args[1:]
453    if not args:
454        args = ["-"]
455    for arg in args:
456        if decompress:
457            if arg == "-":
458                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
459                g = sys.stdout
460            else:
461                if arg[-3:] != ".gz":
462                    print "filename doesn't end in .gz:", repr(arg)
463                    continue
464                f = open(arg, "rb")
465                g = __builtin__.open(arg[:-3], "wb")
466        else:
467            if arg == "-":
468                f = sys.stdin
469                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
470            else:
471                f = __builtin__.open(arg, "rb")
472                g = open(arg + ".gz", "wb")
473        while True:
474            chunk = f.read(1024)
475            if not chunk:
476                break
477            g.write(chunk)
478        if g is not sys.stdout:
479            g.close()
480        if f is not sys.stdin:
481            f.close()
482
# Run the command-line driver only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    _test()