PageRenderTime 57ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/tablib/packages/openpyxl3/reader/iter_worksheet.py

https://github.com/DanLipsitt/tablib
Python | 343 lines | 306 code | 10 blank | 27 comment | 3 complexity | e18e32a69c7cff3bcd37a6694ba26c74 MD5 | raw file
  1. # file openpyxl/reader/iter_worksheet.py
  2. # Copyright (c) 2010 openpyxl
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20. # THE SOFTWARE.
  21. #
  22. # @license: http://www.opensource.org/licenses/mit-license.php
  23. # @author: Eric Gazoni
  24. """ Iterators-based worksheet reader
  25. *Still very raw*
  26. """
  27. from io import StringIO
  28. import warnings
  29. import operator
  30. from functools import partial
  31. from itertools import groupby
  32. from ..worksheet import Worksheet
  33. from ..cell import coordinate_from_string, get_column_letter, Cell
  34. from .excel import get_sheet_ids
  35. from .strings import read_string_table
  36. from .style import read_style_table, NumberFormat
  37. from ..shared.date_time import SharedDate
  38. from .worksheet import read_dimension
  39. from ..shared.ooxml import (MIN_COLUMN, MAX_COLUMN, PACKAGE_WORKSHEETS,
  40. MAX_ROW, MIN_ROW, ARC_SHARED_STRINGS, ARC_APP, ARC_STYLE)
  41. from xml.etree.cElementTree import iterparse
  42. from zipfile import ZipFile
  43. from .. import cell
  44. import re
  45. import tempfile
  46. import zlib
  47. import zipfile
  48. import struct
  49. TYPE_NULL = Cell.TYPE_NULL
  50. MISSING_VALUE = None
  51. RE_COORDINATE = re.compile('^([A-Z]+)([0-9]+)$')
  52. SHARED_DATE = SharedDate()
  53. _COL_CONVERSION_CACHE = dict((get_column_letter(i), i) for i in range(1, 18279))
  54. def column_index_from_string(str_col, _col_conversion_cache=_COL_CONVERSION_CACHE):
  55. # we use a function argument to get indexed name lookup
  56. return _col_conversion_cache[str_col]
  57. del _COL_CONVERSION_CACHE
  58. RAW_ATTRIBUTES = ['row', 'column', 'coordinate', 'internal_value', 'data_type', 'style_id', 'number_format']
  59. try:
  60. from collections import namedtuple
  61. BaseRawCell = namedtuple('RawCell', RAW_ATTRIBUTES)
  62. except ImportError:
  63. warnings.warn("""Unable to import 'namedtuple' module, this may cause memory issues when using optimized reader. Please upgrade your Python installation to 2.6+""")
  64. class BaseRawCell(object):
  65. def __init__(self, *args):
  66. assert len(args)==len(RAW_ATTRIBUTES)
  67. for attr, val in zip(RAW_ATTRIBUTES, args):
  68. setattr(self, attr, val)
  69. def _replace(self, **kwargs):
  70. self.__dict__.update(kwargs)
  71. return self
  72. class RawCell(BaseRawCell):
  73. """Optimized version of the :class:`..cell.Cell`, using named tuples.
  74. Useful attributes are:
  75. * row
  76. * column
  77. * coordinate
  78. * internal_value
  79. You can also access if needed:
  80. * data_type
  81. * number_format
  82. """
  83. @property
  84. def is_date(self):
  85. res = (self.data_type == Cell.TYPE_NUMERIC
  86. and self.number_format is not None
  87. and ('d' in self.number_format
  88. or 'm' in self.number_format
  89. or 'y' in self.number_format
  90. or 'h' in self.number_format
  91. or 's' in self.number_format
  92. ))
  93. return res
  94. def iter_rows(workbook_name, sheet_name, xml_source, range_string = '', row_offset = 0, column_offset = 0):
  95. archive = get_archive_file(workbook_name)
  96. source = xml_source
  97. if range_string:
  98. min_col, min_row, max_col, max_row = get_range_boundaries(range_string, row_offset, column_offset)
  99. else:
  100. min_col, min_row, max_col, max_row = read_dimension(xml_source = source)
  101. min_col = column_index_from_string(min_col)
  102. max_col = column_index_from_string(max_col) + 1
  103. max_row += 6
  104. try:
  105. string_table = read_string_table(archive.read(ARC_SHARED_STRINGS))
  106. except KeyError:
  107. string_table = {}
  108. style_table = read_style_table(archive.read(ARC_STYLE))
  109. source.seek(0)
  110. p = iterparse(source)
  111. return get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table)
  112. def get_rows(p, min_column = MIN_COLUMN, min_row = MIN_ROW, max_column = MAX_COLUMN, max_row = MAX_ROW):
  113. return groupby(get_cells(p, min_row, min_column, max_row, max_column), operator.attrgetter('row'))
  114. def get_cells(p, min_row, min_col, max_row, max_col, _re_coordinate=RE_COORDINATE):
  115. for _event, element in p:
  116. if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}c':
  117. coord = element.get('r')
  118. column_str, row = _re_coordinate.match(coord).groups()
  119. row = int(row)
  120. column = column_index_from_string(column_str)
  121. if min_col <= column <= max_col and min_row <= row <= max_row:
  122. data_type = element.get('t', 'n')
  123. style_id = element.get('s')
  124. value = element.findtext('{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v')
  125. yield RawCell(row, column_str, coord, value, data_type, style_id, None)
  126. if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v':
  127. continue
  128. element.clear()
  129. def get_range_boundaries(range_string, row = 0, column = 0):
  130. if ':' in range_string:
  131. min_range, max_range = range_string.split(':')
  132. min_col, min_row = coordinate_from_string(min_range)
  133. max_col, max_row = coordinate_from_string(max_range)
  134. min_col = column_index_from_string(min_col) + column
  135. max_col = column_index_from_string(max_col) + column
  136. min_row += row
  137. max_row += row
  138. else:
  139. min_col, min_row = coordinate_from_string(range_string)
  140. min_col = column_index_from_string(min_col)
  141. max_col = min_col + 1
  142. max_row = min_row
  143. return (min_col, min_row, max_col, max_row)
  144. def get_archive_file(archive_name):
  145. return ZipFile(archive_name, 'r')
  146. def get_xml_source(archive_file, sheet_name):
  147. return archive_file.read('%s/%s' % (PACKAGE_WORKSHEETS, sheet_name))
  148. def get_missing_cells(row, columns):
  149. return dict([(column, RawCell(row, column, '%s%s' % (column, row), MISSING_VALUE, TYPE_NULL, None, None)) for column in columns])
  150. def get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table):
  151. expected_columns = [get_column_letter(ci) for ci in range(min_col, max_col)]
  152. current_row = min_row
  153. for row, cells in get_rows(p, min_row = min_row, max_row = max_row, min_column = min_col, max_column = max_col):
  154. full_row = []
  155. if current_row < row:
  156. for gap_row in range(current_row, row):
  157. dummy_cells = get_missing_cells(gap_row, expected_columns)
  158. yield tuple([dummy_cells[column] for column in expected_columns])
  159. current_row = row
  160. temp_cells = list(cells)
  161. retrieved_columns = dict([(c.column, c) for c in temp_cells])
  162. missing_columns = list(set(expected_columns) - set(retrieved_columns.keys()))
  163. replacement_columns = get_missing_cells(row, missing_columns)
  164. for column in expected_columns:
  165. if column in retrieved_columns:
  166. cell = retrieved_columns[column]
  167. if cell.style_id is not None:
  168. style = style_table[int(cell.style_id)]
  169. cell = cell._replace(number_format = style.number_format.format_code) #pylint: disable-msg=W0212
  170. if cell.internal_value is not None:
  171. if cell.data_type == Cell.TYPE_STRING:
  172. cell = cell._replace(internal_value = string_table[int(cell.internal_value)]) #pylint: disable-msg=W0212
  173. elif cell.data_type == Cell.TYPE_BOOL:
  174. cell = cell._replace(internal_value = cell.internal_value == 'True')
  175. elif cell.is_date:
  176. cell = cell._replace(internal_value = SHARED_DATE.from_julian(float(cell.internal_value)))
  177. elif cell.data_type == Cell.TYPE_NUMERIC:
  178. cell = cell._replace(internal_value = float(cell.internal_value))
  179. full_row.append(cell)
  180. else:
  181. full_row.append(replacement_columns[column])
  182. current_row = row + 1
  183. yield tuple(full_row)
  184. #------------------------------------------------------------------------------
  185. class IterableWorksheet(Worksheet):
  186. def __init__(self, parent_workbook, title, workbook_name,
  187. sheet_codename, xml_source):
  188. Worksheet.__init__(self, parent_workbook, title)
  189. self._workbook_name = workbook_name
  190. self._sheet_codename = sheet_codename
  191. self._xml_source = xml_source
  192. def iter_rows(self, range_string = '', row_offset = 0, column_offset = 0):
  193. """ Returns a squared range based on the `range_string` parameter,
  194. using generators.
  195. :param range_string: range of cells (e.g. 'A1:C4')
  196. :type range_string: string
  197. :param row: row index of the cell (e.g. 4)
  198. :type row: int
  199. :param column: column index of the cell (e.g. 3)
  200. :type column: int
  201. :rtype: generator
  202. """
  203. return iter_rows(workbook_name = self._workbook_name,
  204. sheet_name = self._sheet_codename,
  205. xml_source = self._xml_source,
  206. range_string = range_string,
  207. row_offset = row_offset,
  208. column_offset = column_offset)
  209. def cell(self, *args, **kwargs):
  210. raise NotImplementedError("use 'iter_rows()' instead")
  211. def range(self, *args, **kwargs):
  212. raise NotImplementedError("use 'iter_rows()' instead")
  213. def unpack_worksheet(archive, filename):
  214. temp_file = tempfile.TemporaryFile(mode='r+', prefix='openpyxl.', suffix='.unpack.temp')
  215. zinfo = archive.getinfo(filename)
  216. if zinfo.compress_type == zipfile.ZIP_STORED:
  217. decoder = None
  218. elif zinfo.compress_type == zipfile.ZIP_DEFLATED:
  219. decoder = zlib.decompressobj(-zlib.MAX_WBITS)
  220. else:
  221. raise zipfile.BadZipFile("Unrecognized compression method")
  222. archive.fp.seek(_get_file_offset(archive, zinfo))
  223. bytes_to_read = zinfo.compress_size
  224. while True:
  225. buff = archive.fp.read(min(bytes_to_read, 102400))
  226. if not buff:
  227. break
  228. bytes_to_read -= len(buff)
  229. if decoder:
  230. buff = decoder.decompress(buff)
  231. temp_file.write(buff)
  232. if decoder:
  233. temp_file.write(decoder.decompress('Z'))
  234. return temp_file
  235. def _get_file_offset(archive, zinfo):
  236. try:
  237. return zinfo.file_offset
  238. except AttributeError:
  239. # From http://stackoverflow.com/questions/3781261/how-to-simulate-zipfile-open-in-python-2-5
  240. # Seek over the fixed size fields to the "file name length" field in
  241. # the file header (26 bytes). Unpack this and the "extra field length"
  242. # field ourselves as info.extra doesn't seem to be the correct length.
  243. archive.fp.seek(zinfo.header_offset + 26)
  244. file_name_len, extra_len = struct.unpack("<HH", archive.fp.read(4))
  245. return zinfo.header_offset + 30 + file_name_len + extra_len