iter_worksheet.py | searchcode

/tablib/packages/openpyxl/reader/iter_worksheet.py

https://github.com/timClicks/tablib
Python | 348 lines | 282 code | 23 blank | 43 comment | 12 complexity | 2882310ffcb9b734a2b8aafa0eda2410 MD5 | raw file

# file openpyxl/reader/iter_worksheet.py



# Copyright (c) 2010 openpyxl

#

# Permission is hereby granted, free of charge, to any person obtaining a copy

# of this software and associated documentation files (the "Software"), to deal

# in the Software without restriction, including without limitation the rights

# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

# copies of the Software, and to permit persons to whom the Software is

# furnished to do so, subject to the following conditions:

#

# The above copyright notice and this permission notice shall be included in

# all copies or substantial portions of the Software.

#

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN

# THE SOFTWARE.

#

# @license: http://www.opensource.org/licenses/mit-license.php

# @author: Eric Gazoni



""" Iterators-based worksheet reader

*Still very raw*

"""



from ....compat import BytesIO as StringIO

import warnings

import operator

from functools import partial

from itertools import groupby, ifilter

from ..worksheet import Worksheet

from ..cell import coordinate_from_string, get_column_letter, Cell

from ..reader.excel import get_sheet_ids

from ..reader.strings import read_string_table

from ..reader.style import read_style_table, NumberFormat

from ..shared.date_time import SharedDate

from ..reader.worksheet import read_dimension

from ..shared.ooxml import (MIN_COLUMN, MAX_COLUMN, PACKAGE_WORKSHEETS,

    MAX_ROW, MIN_ROW, ARC_SHARED_STRINGS, ARC_APP, ARC_STYLE)

try:

    from xml.etree.cElementTree import iterparse

except ImportError:

    from xml.etree.ElementTree import iterparse





from zipfile import ZipFile

from .. import cell

import re

import tempfile

import zlib

import zipfile

import struct



TYPE_NULL = Cell.TYPE_NULL

MISSING_VALUE = None



RE_COORDINATE = re.compile('^([A-Z]+)([0-9]+)$')



SHARED_DATE = SharedDate()



_COL_CONVERSION_CACHE = dict((get_column_letter(i), i) for i in xrange(1, 18279))

def column_index_from_string(str_col, _col_conversion_cache=_COL_CONVERSION_CACHE):

    # we use a function argument to get indexed name lookup

    return _col_conversion_cache[str_col]

del _COL_CONVERSION_CACHE



RAW_ATTRIBUTES = ['row', 'column', 'coordinate', 'internal_value', 'data_type', 'style_id', 'number_format']



try:

    from collections import namedtuple

    BaseRawCell = namedtuple('RawCell', RAW_ATTRIBUTES)

except ImportError:



    # warnings.warn("""Unable to import 'namedtuple' module, this may cause  memory issues when using optimized reader. Please upgrade your Python installation to 2.6+""")



    class BaseRawCell(object):



        def __init__(self, *args):

            assert len(args)==len(RAW_ATTRIBUTES)



            for attr, val in zip(RAW_ATTRIBUTES, args):

                setattr(self, attr, val)



        def _replace(self, **kwargs):



            self.__dict__.update(kwargs)



            return self





class RawCell(BaseRawCell):

    """Optimized version of the :class:`openpyxl.cell.Cell`, using named tuples.



    Useful attributes are:



    * row

    * column

    * coordinate

    * internal_value



    You can also access if needed:



    * data_type

    * number_format



    """



    @property

    def is_date(self):

        res = (self.data_type == Cell.TYPE_NUMERIC

               and self.number_format is not None

               and ('d' in self.number_format

                    or 'm' in self.number_format

                    or 'y' in self.number_format

                    or 'h' in self.number_format

                    or 's' in self.number_format

                   ))



        return res



def iter_rows(workbook_name, sheet_name, xml_source, range_string = '', row_offset = 0, column_offset = 0):



    archive = get_archive_file(workbook_name)



    source = xml_source



    if range_string:

        min_col, min_row, max_col, max_row = get_range_boundaries(range_string, row_offset, column_offset)

    else:

        min_col, min_row, max_col, max_row = read_dimension(xml_source = source)

        min_col = column_index_from_string(min_col)

        max_col = column_index_from_string(max_col) + 1

        max_row += 6



    try:

        string_table = read_string_table(archive.read(ARC_SHARED_STRINGS))

    except KeyError:

        string_table = {}



    style_table = read_style_table(archive.read(ARC_STYLE))



    source.seek(0)

    p = iterparse(source)



    return get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table)





def get_rows(p, min_column = MIN_COLUMN, min_row = MIN_ROW, max_column = MAX_COLUMN, max_row = MAX_ROW):



    return groupby(get_cells(p, min_row, min_column, max_row, max_column), operator.attrgetter('row'))



def get_cells(p, min_row, min_col, max_row, max_col, _re_coordinate=RE_COORDINATE):



    for _event, element in p:



        if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}c':

            coord = element.get('r')

            column_str, row = _re_coordinate.match(coord).groups()



            row = int(row)

            column = column_index_from_string(column_str)



            if min_col <= column <= max_col and min_row <= row <= max_row:

                data_type = element.get('t', 'n')

                style_id = element.get('s')

                value = element.findtext('{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v')

                yield RawCell(row, column_str, coord, value, data_type, style_id, None)



        if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v':

            continue

        element.clear()







def get_range_boundaries(range_string, row = 0, column = 0):



    if ':' in range_string:

        min_range, max_range = range_string.split(':')

        min_col, min_row = coordinate_from_string(min_range)

        max_col, max_row = coordinate_from_string(max_range)



        min_col = column_index_from_string(min_col) + column

        max_col = column_index_from_string(max_col) + column

        min_row += row

        max_row += row



    else:

        min_col, min_row = coordinate_from_string(range_string)

        min_col = column_index_from_string(min_col)

        max_col = min_col + 1

        max_row = min_row



    return (min_col, min_row, max_col, max_row)



def get_archive_file(archive_name):



    return ZipFile(archive_name, 'r')



def get_xml_source(archive_file, sheet_name):



    return archive_file.read('%s/%s' % (PACKAGE_WORKSHEETS, sheet_name))



def get_missing_cells(row, columns):



    return dict([(column, RawCell(row, column, '%s%s' % (column, row), MISSING_VALUE, TYPE_NULL, None, None)) for column in columns])



def get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table):



    expected_columns = [get_column_letter(ci) for ci in xrange(min_col, max_col)]



    current_row = min_row

    for row, cells in get_rows(p, min_row = min_row, max_row = max_row, min_column = min_col, max_column = max_col):

        full_row = []

        if current_row < row:



            for gap_row in xrange(current_row, row):



                dummy_cells = get_missing_cells(gap_row, expected_columns)



                yield tuple([dummy_cells[column] for column in expected_columns])



                current_row = row



        temp_cells = list(cells)



        retrieved_columns = dict([(c.column, c) for c in temp_cells])



        missing_columns = list(set(expected_columns) - set(retrieved_columns.keys()))



        replacement_columns = get_missing_cells(row, missing_columns)



        for column in expected_columns:



            if column in retrieved_columns:

                cell = retrieved_columns[column]



                if cell.style_id is not None:

                    style = style_table[int(cell.style_id)]

                    cell = cell._replace(number_format = style.number_format.format_code) #pylint: disable-msg=W0212

                if cell.internal_value is not None:

                    if cell.data_type == Cell.TYPE_STRING:

                        cell = cell._replace(internal_value = string_table[int(cell.internal_value)]) #pylint: disable-msg=W0212

                    elif cell.data_type == Cell.TYPE_BOOL:

                        cell = cell._replace(internal_value = cell.internal_value == 'True')

                    elif cell.is_date:

                        cell = cell._replace(internal_value = SHARED_DATE.from_julian(float(cell.internal_value)))

                    elif cell.data_type == Cell.TYPE_NUMERIC:

                        cell = cell._replace(internal_value = float(cell.internal_value))

                full_row.append(cell)



            else:

                full_row.append(replacement_columns[column])



        current_row = row + 1



        yield tuple(full_row)



#------------------------------------------------------------------------------



class IterableWorksheet(Worksheet):



    def __init__(self, parent_workbook, title, workbook_name,

            sheet_codename, xml_source):



        Worksheet.__init__(self, parent_workbook, title)

        self._workbook_name = workbook_name

        self._sheet_codename = sheet_codename

        self._xml_source = xml_source



    def iter_rows(self, range_string = '', row_offset = 0, column_offset = 0):

        """ Returns a squared range based on the `range_string` parameter,

        using generators.



        :param range_string: range of cells (e.g. 'A1:C4')

        :type range_string: string



        :param row: row index of the cell (e.g. 4)

        :type row: int



        :param column: column index of the cell (e.g. 3)

        :type column: int



        :rtype: generator



        """



        return iter_rows(workbook_name = self._workbook_name,

                         sheet_name = self._sheet_codename,

                         xml_source = self._xml_source,

                         range_string = range_string,

                         row_offset = row_offset,

                         column_offset = column_offset)



    def cell(self, *args, **kwargs):



        raise NotImplementedError("use 'iter_rows()' instead")



    def range(self, *args, **kwargs):



        raise NotImplementedError("use 'iter_rows()' instead")



def unpack_worksheet(archive, filename):



    temp_file = tempfile.TemporaryFile(mode='r+', prefix='openpyxl.', suffix='.unpack.temp')



    zinfo = archive.getinfo(filename)



    if zinfo.compress_type == zipfile.ZIP_STORED:

        decoder = None

    elif zinfo.compress_type == zipfile.ZIP_DEFLATED:

        decoder = zlib.decompressobj(-zlib.MAX_WBITS)

    else:

        raise zipfile.BadZipFile("Unrecognized compression method")



    archive.fp.seek(_get_file_offset(archive, zinfo))

    bytes_to_read = zinfo.compress_size



    while True:

        buff = archive.fp.read(min(bytes_to_read, 102400))

        if not buff:

            break

        bytes_to_read -= len(buff)

        if decoder:

            buff = decoder.decompress(buff)

        temp_file.write(buff)



    if decoder:

        temp_file.write(decoder.decompress('Z'))



    return temp_file



def _get_file_offset(archive, zinfo):



    try:

        return zinfo.file_offset

    except AttributeError:

        # From http://stackoverflow.com/questions/3781261/how-to-simulate-zipfile-open-in-python-2-5



        # Seek over the fixed size fields to the "file name length" field in

        # the file header (26 bytes). Unpack this and the "extra field length"

        # field ourselves as info.extra doesn't seem to be the correct length.

        archive.fp.seek(zinfo.header_offset + 26)

        file_name_len, extra_len = struct.unpack("<HH", archive.fp.read(4))

        return zinfo.header_offset + 30 + file_name_len + extra_len