PageRenderTime 26ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/tablib/packages/openpyxl/reader/iter_worksheet.py

https://github.com/timClicks/tablib
Python | 348 lines | 282 code | 23 blank | 43 comment | 12 complexity | 2882310ffcb9b734a2b8aafa0eda2410 MD5 | raw file
  1. # file openpyxl/reader/iter_worksheet.py
  2. # Copyright (c) 2010 openpyxl
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20. # THE SOFTWARE.
  21. #
  22. # @license: http://www.opensource.org/licenses/mit-license.php
  23. # @author: Eric Gazoni
  24. """ Iterators-based worksheet reader
  25. *Still very raw*
  26. """
  27. from ....compat import BytesIO as StringIO
  28. import warnings
  29. import operator
  30. from functools import partial
  31. from itertools import groupby, ifilter
  32. from ..worksheet import Worksheet
  33. from ..cell import coordinate_from_string, get_column_letter, Cell
  34. from ..reader.excel import get_sheet_ids
  35. from ..reader.strings import read_string_table
  36. from ..reader.style import read_style_table, NumberFormat
  37. from ..shared.date_time import SharedDate
  38. from ..reader.worksheet import read_dimension
  39. from ..shared.ooxml import (MIN_COLUMN, MAX_COLUMN, PACKAGE_WORKSHEETS,
  40. MAX_ROW, MIN_ROW, ARC_SHARED_STRINGS, ARC_APP, ARC_STYLE)
  41. try:
  42. from xml.etree.cElementTree import iterparse
  43. except ImportError:
  44. from xml.etree.ElementTree import iterparse
  45. from zipfile import ZipFile
  46. from .. import cell
  47. import re
  48. import tempfile
  49. import zlib
  50. import zipfile
  51. import struct
  52. TYPE_NULL = Cell.TYPE_NULL
  53. MISSING_VALUE = None
  54. RE_COORDINATE = re.compile('^([A-Z]+)([0-9]+)$')
  55. SHARED_DATE = SharedDate()
  56. _COL_CONVERSION_CACHE = dict((get_column_letter(i), i) for i in xrange(1, 18279))
  57. def column_index_from_string(str_col, _col_conversion_cache=_COL_CONVERSION_CACHE):
  58. # we use a function argument to get indexed name lookup
  59. return _col_conversion_cache[str_col]
  60. del _COL_CONVERSION_CACHE
  61. RAW_ATTRIBUTES = ['row', 'column', 'coordinate', 'internal_value', 'data_type', 'style_id', 'number_format']
  62. try:
  63. from collections import namedtuple
  64. BaseRawCell = namedtuple('RawCell', RAW_ATTRIBUTES)
  65. except ImportError:
  66. # warnings.warn("""Unable to import 'namedtuple' module, this may cause memory issues when using optimized reader. Please upgrade your Python installation to 2.6+""")
  67. class BaseRawCell(object):
  68. def __init__(self, *args):
  69. assert len(args)==len(RAW_ATTRIBUTES)
  70. for attr, val in zip(RAW_ATTRIBUTES, args):
  71. setattr(self, attr, val)
  72. def _replace(self, **kwargs):
  73. self.__dict__.update(kwargs)
  74. return self
  75. class RawCell(BaseRawCell):
  76. """Optimized version of the :class:`openpyxl.cell.Cell`, using named tuples.
  77. Useful attributes are:
  78. * row
  79. * column
  80. * coordinate
  81. * internal_value
  82. You can also access if needed:
  83. * data_type
  84. * number_format
  85. """
  86. @property
  87. def is_date(self):
  88. res = (self.data_type == Cell.TYPE_NUMERIC
  89. and self.number_format is not None
  90. and ('d' in self.number_format
  91. or 'm' in self.number_format
  92. or 'y' in self.number_format
  93. or 'h' in self.number_format
  94. or 's' in self.number_format
  95. ))
  96. return res
  97. def iter_rows(workbook_name, sheet_name, xml_source, range_string = '', row_offset = 0, column_offset = 0):
  98. archive = get_archive_file(workbook_name)
  99. source = xml_source
  100. if range_string:
  101. min_col, min_row, max_col, max_row = get_range_boundaries(range_string, row_offset, column_offset)
  102. else:
  103. min_col, min_row, max_col, max_row = read_dimension(xml_source = source)
  104. min_col = column_index_from_string(min_col)
  105. max_col = column_index_from_string(max_col) + 1
  106. max_row += 6
  107. try:
  108. string_table = read_string_table(archive.read(ARC_SHARED_STRINGS))
  109. except KeyError:
  110. string_table = {}
  111. style_table = read_style_table(archive.read(ARC_STYLE))
  112. source.seek(0)
  113. p = iterparse(source)
  114. return get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table)
  115. def get_rows(p, min_column = MIN_COLUMN, min_row = MIN_ROW, max_column = MAX_COLUMN, max_row = MAX_ROW):
  116. return groupby(get_cells(p, min_row, min_column, max_row, max_column), operator.attrgetter('row'))
  117. def get_cells(p, min_row, min_col, max_row, max_col, _re_coordinate=RE_COORDINATE):
  118. for _event, element in p:
  119. if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}c':
  120. coord = element.get('r')
  121. column_str, row = _re_coordinate.match(coord).groups()
  122. row = int(row)
  123. column = column_index_from_string(column_str)
  124. if min_col <= column <= max_col and min_row <= row <= max_row:
  125. data_type = element.get('t', 'n')
  126. style_id = element.get('s')
  127. value = element.findtext('{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v')
  128. yield RawCell(row, column_str, coord, value, data_type, style_id, None)
  129. if element.tag == '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}v':
  130. continue
  131. element.clear()
  132. def get_range_boundaries(range_string, row = 0, column = 0):
  133. if ':' in range_string:
  134. min_range, max_range = range_string.split(':')
  135. min_col, min_row = coordinate_from_string(min_range)
  136. max_col, max_row = coordinate_from_string(max_range)
  137. min_col = column_index_from_string(min_col) + column
  138. max_col = column_index_from_string(max_col) + column
  139. min_row += row
  140. max_row += row
  141. else:
  142. min_col, min_row = coordinate_from_string(range_string)
  143. min_col = column_index_from_string(min_col)
  144. max_col = min_col + 1
  145. max_row = min_row
  146. return (min_col, min_row, max_col, max_row)
  147. def get_archive_file(archive_name):
  148. return ZipFile(archive_name, 'r')
  149. def get_xml_source(archive_file, sheet_name):
  150. return archive_file.read('%s/%s' % (PACKAGE_WORKSHEETS, sheet_name))
  151. def get_missing_cells(row, columns):
  152. return dict([(column, RawCell(row, column, '%s%s' % (column, row), MISSING_VALUE, TYPE_NULL, None, None)) for column in columns])
  153. def get_squared_range(p, min_col, min_row, max_col, max_row, string_table, style_table):
  154. expected_columns = [get_column_letter(ci) for ci in xrange(min_col, max_col)]
  155. current_row = min_row
  156. for row, cells in get_rows(p, min_row = min_row, max_row = max_row, min_column = min_col, max_column = max_col):
  157. full_row = []
  158. if current_row < row:
  159. for gap_row in xrange(current_row, row):
  160. dummy_cells = get_missing_cells(gap_row, expected_columns)
  161. yield tuple([dummy_cells[column] for column in expected_columns])
  162. current_row = row
  163. temp_cells = list(cells)
  164. retrieved_columns = dict([(c.column, c) for c in temp_cells])
  165. missing_columns = list(set(expected_columns) - set(retrieved_columns.keys()))
  166. replacement_columns = get_missing_cells(row, missing_columns)
  167. for column in expected_columns:
  168. if column in retrieved_columns:
  169. cell = retrieved_columns[column]
  170. if cell.style_id is not None:
  171. style = style_table[int(cell.style_id)]
  172. cell = cell._replace(number_format = style.number_format.format_code) #pylint: disable-msg=W0212
  173. if cell.internal_value is not None:
  174. if cell.data_type == Cell.TYPE_STRING:
  175. cell = cell._replace(internal_value = string_table[int(cell.internal_value)]) #pylint: disable-msg=W0212
  176. elif cell.data_type == Cell.TYPE_BOOL:
  177. cell = cell._replace(internal_value = cell.internal_value == 'True')
  178. elif cell.is_date:
  179. cell = cell._replace(internal_value = SHARED_DATE.from_julian(float(cell.internal_value)))
  180. elif cell.data_type == Cell.TYPE_NUMERIC:
  181. cell = cell._replace(internal_value = float(cell.internal_value))
  182. full_row.append(cell)
  183. else:
  184. full_row.append(replacement_columns[column])
  185. current_row = row + 1
  186. yield tuple(full_row)
  187. #------------------------------------------------------------------------------
  188. class IterableWorksheet(Worksheet):
  189. def __init__(self, parent_workbook, title, workbook_name,
  190. sheet_codename, xml_source):
  191. Worksheet.__init__(self, parent_workbook, title)
  192. self._workbook_name = workbook_name
  193. self._sheet_codename = sheet_codename
  194. self._xml_source = xml_source
  195. def iter_rows(self, range_string = '', row_offset = 0, column_offset = 0):
  196. """ Returns a squared range based on the `range_string` parameter,
  197. using generators.
  198. :param range_string: range of cells (e.g. 'A1:C4')
  199. :type range_string: string
  200. :param row: row index of the cell (e.g. 4)
  201. :type row: int
  202. :param column: column index of the cell (e.g. 3)
  203. :type column: int
  204. :rtype: generator
  205. """
  206. return iter_rows(workbook_name = self._workbook_name,
  207. sheet_name = self._sheet_codename,
  208. xml_source = self._xml_source,
  209. range_string = range_string,
  210. row_offset = row_offset,
  211. column_offset = column_offset)
  212. def cell(self, *args, **kwargs):
  213. raise NotImplementedError("use 'iter_rows()' instead")
  214. def range(self, *args, **kwargs):
  215. raise NotImplementedError("use 'iter_rows()' instead")
  216. def unpack_worksheet(archive, filename):
  217. temp_file = tempfile.TemporaryFile(mode='r+', prefix='openpyxl.', suffix='.unpack.temp')
  218. zinfo = archive.getinfo(filename)
  219. if zinfo.compress_type == zipfile.ZIP_STORED:
  220. decoder = None
  221. elif zinfo.compress_type == zipfile.ZIP_DEFLATED:
  222. decoder = zlib.decompressobj(-zlib.MAX_WBITS)
  223. else:
  224. raise zipfile.BadZipFile("Unrecognized compression method")
  225. archive.fp.seek(_get_file_offset(archive, zinfo))
  226. bytes_to_read = zinfo.compress_size
  227. while True:
  228. buff = archive.fp.read(min(bytes_to_read, 102400))
  229. if not buff:
  230. break
  231. bytes_to_read -= len(buff)
  232. if decoder:
  233. buff = decoder.decompress(buff)
  234. temp_file.write(buff)
  235. if decoder:
  236. temp_file.write(decoder.decompress('Z'))
  237. return temp_file
  238. def _get_file_offset(archive, zinfo):
  239. try:
  240. return zinfo.file_offset
  241. except AttributeError:
  242. # From http://stackoverflow.com/questions/3781261/how-to-simulate-zipfile-open-in-python-2-5
  243. # Seek over the fixed size fields to the "file name length" field in
  244. # the file header (26 bytes). Unpack this and the "extra field length"
  245. # field ourselves as info.extra doesn't seem to be the correct length.
  246. archive.fp.seek(zinfo.header_offset + 26)
  247. file_name_len, extra_len = struct.unpack("<HH", archive.fp.read(4))
  248. return zinfo.header_offset + 30 + file_name_len + extra_len