iter_worksheet.py | searchcode

/tablib/packages/openpyxl3/reader/iter_worksheet.py

https://github.com/DanLipsitt/tablib
Python | 343 lines | 306 code | 10 blank | 27 comment | 3 complexity | e18e32a69c7cff3bcd37a6694ba26c74 MD5 | raw file

# file openpyxl/reader/iter_worksheet.py

# Copyright (c) 2010 openpyxl
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# @license: http://www.opensource.org/licenses/mit-license.php
# @author: Eric Gazoni

""" Iterators-based worksheet reader 
*Still very raw*
"""

from io import StringIO
import warnings
import operator
from functools import partial
from itertools import groupby
from ..worksheet import Worksheet
from ..cell import coordinate_from_string, get_column_letter, Cell
from .excel import get_sheet_ids
from .strings import read_string_table
from .style import read_style_table, NumberFormat
from ..shared.date_time import SharedDate
from .worksheet import read_dimension
from ..shared.ooxml import (MIN_COLUMN, MAX_COLUMN, PACKAGE_WORKSHEETS,
    MAX_ROW, MIN_ROW, ARC_SHARED_STRINGS, ARC_APP, ARC_STYLE)
from xml.etree.cElementTree import iterparse
from zipfile import ZipFile
from .. import cell
import re
import tempfile
import zlib
import zipfile
import struct

TYPE_NULL = Cell.TYPE_NULL
MISSING_VALUE = None 

RE_COORDINATE = re.compile('^([A-Z]+)([0-9]+)$')

SHARED_DATE = SharedDate()

_COL_CONVERSION_CACHE = dict((get_column_letter(i), i) for i in range(1, 18279))
def column_index_from_string(str_col, _col_conversion_cache=_COL_CONVERSION_CACHE):
    # we use a function argument to get indexed name lookup
    return _col_conversion_cache[str_col]
del _COL_CONVERSION_CACHE

RAW_ATTRIBUTES = ['row', 'column', 'coordinate', 'internal_value', 'data_type', 'style_id', 'number_format']

try:
    from collections import namedtuple
    BaseRawCell = namedtuple('RawCell', RAW_ATTRIBUTES)
except ImportError:

    warnings.warn("""Unable to import 'namedtuple' module, this may cause  memory issues when using optimized reader. Please upgrade your Python installation to 2.6+""")

    class BaseRawCell(object):

        def __init__(self, *args):
            assert len(args)==len(RAW_ATTRIBUTES)

            for attr, val in zip(RAW_ATTRIBUTES, args):
                setattr(self, attr, val)

        def _replace(self, **kwargs):

            self.__dict__.update(kwargs)

            return self


class RawCell(BaseRawCell):
    """Optimized version of the :class:`..cell.Cell`, using named tuples.

    Useful attributes are:

    * row
    * column
    * coordinate
    * internal_value

    You can also access if needed:

    * data_type
    * number_format

    """

    @property
    def is_date(self):
        res = (self.data_type == Cell.TYPE_NUMERIC
               and self.number_format is not None
               and ('d' in self.number_format 
                    or 'm' in self.number_format
                    or 'y' in self.number_format
                    or 'h' in self.number_format
                    or 's' in self.number_format 
                   ))

        return res

def iter_rows(workbook_name, sheet_name, xml_source, range_string = '', row_offset = 0, column_offset = 0):

    archive = get_archive_file(workbook_name)

    source = xml_source 

    if range_string:
        min_col, min_row, max_col, max_row = get_range_boundaries(range_string, row_offset, column_offset)
    else:
        min_col, min_row, max_col, max_row = read_dimension(xml_source = source)
        min_col = column_index_from_string(min_col)
        max_col = column_index_from_string(max_col) + 1
        max_row += 6

    try:
        string_table = read_string_table(archive.read(ARC_SHARED_STRINGS))
    except KeyError:
        string_table = {}

    style_table = read_style_table(archive.read(ARC_STYLE))

    source.seek(0)
    p = iterparse(source)

    return get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table)


def get_rows(p, min_column = MIN_COLUMN, min_row = MIN_ROW, max_column = MAX_COLUMN, max_row = MAX_ROW):

    return groupby(get_cells(p, min_row, min_column, max_row, max_column), operator.attrgetter('row'))

def get_cells(p, min_row, min_col, max_row, max_col, _re_coordinate=RE_COORDINATE):

    for _event, element in p:

        if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}c':
            coord = element.get('r')
            column_str, row = _re_coordinate.match(coord).groups()

            row = int(row)
            column = column_index_from_string(column_str)

            if min_col <= column <= max_col and min_row <= row <= max_row:
                data_type = element.get('t', 'n')
                style_id = element.get('s')
                value = element.findtext('{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v')
                yield RawCell(row, column_str, coord, value, data_type, style_id, None)

        if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v':
            continue
        element.clear()



def get_range_boundaries(range_string, row = 0, column = 0):

    if ':' in range_string:
        min_range, max_range = range_string.split(':')
        min_col, min_row = coordinate_from_string(min_range)
        max_col, max_row = coordinate_from_string(max_range)

        min_col = column_index_from_string(min_col) + column
        max_col = column_index_from_string(max_col) + column
        min_row += row
        max_row += row

    else:
        min_col, min_row = coordinate_from_string(range_string)
        min_col = column_index_from_string(min_col)
        max_col = min_col + 1
        max_row = min_row 

    return (min_col, min_row, max_col, max_row)

def get_archive_file(archive_name):

    return ZipFile(archive_name, 'r')

def get_xml_source(archive_file, sheet_name):

    return archive_file.read('%s/%s' % (PACKAGE_WORKSHEETS, sheet_name))

def get_missing_cells(row, columns):

    return dict([(column, RawCell(row, column, '%s%s' % (column, row), MISSING_VALUE, TYPE_NULL, None, None)) for column in columns])

def get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table):

    expected_columns = [get_column_letter(ci) for ci in range(min_col, max_col)]

    current_row = min_row
    for row, cells in get_rows(p, min_row = min_row, max_row = max_row, min_column = min_col, max_column = max_col):
        full_row = []
        if current_row < row:

            for gap_row in range(current_row, row):

                dummy_cells = get_missing_cells(gap_row, expected_columns)
                
                yield tuple([dummy_cells[column] for column in expected_columns])

                current_row = row

        temp_cells = list(cells)

        retrieved_columns = dict([(c.column, c) for c in temp_cells])

        missing_columns = list(set(expected_columns) - set(retrieved_columns.keys()))

        replacement_columns = get_missing_cells(row, missing_columns)

        for column in expected_columns:

            if column in retrieved_columns:
                cell = retrieved_columns[column]

                if cell.style_id is not None:
                    style = style_table[int(cell.style_id)]
                    cell = cell._replace(number_format = style.number_format.format_code) #pylint: disable-msg=W0212
                if cell.internal_value is not None:
                    if cell.data_type == Cell.TYPE_STRING:
                        cell = cell._replace(internal_value = string_table[int(cell.internal_value)]) #pylint: disable-msg=W0212
                    elif cell.data_type == Cell.TYPE_BOOL:
                        cell = cell._replace(internal_value = cell.internal_value == 'True')
                    elif cell.is_date:
                        cell = cell._replace(internal_value = SHARED_DATE.from_julian(float(cell.internal_value)))
                    elif cell.data_type == Cell.TYPE_NUMERIC:
                        cell = cell._replace(internal_value = float(cell.internal_value))
                full_row.append(cell)

            else:
                full_row.append(replacement_columns[column])

        current_row = row + 1

        yield tuple(full_row)

#------------------------------------------------------------------------------ 

class IterableWorksheet(Worksheet):

    def __init__(self, parent_workbook, title, workbook_name, 
            sheet_codename, xml_source):

        Worksheet.__init__(self, parent_workbook, title)
        self._workbook_name = workbook_name
        self._sheet_codename = sheet_codename
        self._xml_source = xml_source

    def iter_rows(self, range_string = '', row_offset = 0, column_offset = 0):
        """ Returns a squared range based on the `range_string` parameter, 
        using generators.
        
        :param range_string: range of cells (e.g. 'A1:C4')
        :type range_string: string
        
        :param row: row index of the cell (e.g. 4)
        :type row: int

        :param column: column index of the cell (e.g. 3)
        :type column: int
        
        :rtype: generator
        
        """

        return iter_rows(workbook_name = self._workbook_name,
                         sheet_name = self._sheet_codename,
                         xml_source = self._xml_source,
                         range_string = range_string,
                         row_offset = row_offset,
                         column_offset = column_offset)

    def cell(self, *args, **kwargs):

        raise NotImplementedError("use 'iter_rows()' instead")

    def range(self, *args, **kwargs):

        raise NotImplementedError("use 'iter_rows()' instead")

def unpack_worksheet(archive, filename):

    temp_file = tempfile.TemporaryFile(mode='r+', prefix='openpyxl.', suffix='.unpack.temp')

    zinfo = archive.getinfo(filename)

    if zinfo.compress_type == zipfile.ZIP_STORED:
        decoder = None
    elif zinfo.compress_type == zipfile.ZIP_DEFLATED:
        decoder = zlib.decompressobj(-zlib.MAX_WBITS)
    else:
        raise zipfile.BadZipFile("Unrecognized compression method")

    archive.fp.seek(_get_file_offset(archive, zinfo))
    bytes_to_read = zinfo.compress_size

    while True:
        buff = archive.fp.read(min(bytes_to_read, 102400))
        if not buff:
            break
        bytes_to_read -= len(buff)
        if decoder:
            buff = decoder.decompress(buff)
        temp_file.write(buff)

    if decoder:
        temp_file.write(decoder.decompress('Z'))

    return temp_file

def _get_file_offset(archive, zinfo):

    try:
        return zinfo.file_offset
    except AttributeError:
        # From http://stackoverflow.com/questions/3781261/how-to-simulate-zipfile-open-in-python-2-5

        # Seek over the fixed size fields to the "file name length" field in
        # the file header (26 bytes). Unpack this and the "extra field length"
        # field ourselves as info.extra doesn't seem to be the correct length.
        archive.fp.seek(zinfo.header_offset + 26)
        file_name_len, extra_len = struct.unpack("<HH", archive.fp.read(4))
        return zinfo.header_offset + 30 + file_name_len + extra_len