/pandas/io/pytables.py

http://github.com/wesm/pandas
Possible License(s): BSD-3-Clause, Apache-2.0

# pylint: disable-msg=E1101,W0613,W0603
"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""

import copy
from datetime import date, datetime
from distutils.version import LooseVersion
import itertools
import os
import re
import time
import warnings

import numpy as np

from pandas._libs import lib, writers as libwriters
from pandas._libs.tslibs import timezones
from pandas.compat import PY3, filter, lrange, range, string_types
from pandas.errors import PerformanceWarning

from pandas.core.dtypes.common import (
    ensure_object, is_categorical_dtype, is_datetime64_dtype,
    is_datetime64tz_dtype, is_list_like, is_timedelta64_dtype)
from pandas.core.dtypes.missing import array_equivalent

from pandas import (
    DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, PeriodIndex,
    Series, SparseDataFrame, SparseSeries, TimedeltaIndex, compat, concat,
    isna, to_datetime)
from pandas.core import config
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.sparse import BlockIndex, IntIndex
from pandas.core.base import StringMixin
import pandas.core.common as com
from pandas.core.computation.pytables import Expr, maybe_expression
from pandas.core.config import get_option
from pandas.core.index import ensure_index
from pandas.core.internals import BlockManager, _block_shape, make_block

from pandas.io.common import _stringify_path
from pandas.io.formats.printing import adjoin, pprint_thing

# versioning attribute
_version = '0.15.2'

# encoding
# PY3 encoding if we don't specify
_default_encoding = 'UTF-8'


def _ensure_decoded(s):
    """ if we have bytes, decode them to unicode """
    if isinstance(s, np.bytes_):
        s = s.decode('UTF-8')
    return s


def _ensure_encoding(encoding):
    # set the encoding if we need
    if encoding is None:
        if PY3:
            encoding = _default_encoding
    return encoding


def _ensure_str(name):
    """Ensure that an index / column name is a str (python 3) or
    unicode (python 2); otherwise they may be np.string dtype.
    Non-string dtypes are passed through unchanged.

    https://github.com/pandas-dev/pandas/issues/13492
    """
    if isinstance(name, compat.string_types):
        name = compat.text_type(name)
    return name


Term = Expr


def _ensure_term(where, scope_level):
    """
    ensure that the where is a Term or a list of Term
    this makes sure that we are capturing the scope of variables
    that are passed
    create the terms here with a frame_level=2 (we are 2 levels down)
    """

    # only consider list/tuple here as an ndarray is automatically a
    # coordinate list
    level = scope_level + 1
    if isinstance(where, (list, tuple)):
        wlist = []
        for w in filter(lambda x: x is not None, where):
            if not maybe_expression(w):
                wlist.append(w)
            else:
                wlist.append(Term(w, scope_level=level))
        where = wlist
    elif maybe_expression(where):
        where = Term(where, scope_level=level)
    return where


class PossibleDataLossError(Exception):
    pass


class ClosedFileError(Exception):
    pass


class IncompatibilityWarning(Warning):
    pass


incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""


class AttributeConflictWarning(Warning):
    pass


attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""


class DuplicateWarning(Warning):
    pass


duplicate_doc = """
duplicate entries in table, taking most recently appended
"""

performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {
    u'f': 'fixed',
    u'fixed': 'fixed',
    u't': 'table',
    u'table': 'table',
}

format_deprecate_doc = """
the table keyword has been deprecated
use the format='fixed(f)|table(t)' keyword instead
  fixed(f) : specifies the Fixed format
             and is the default for put operations
  table(t) : specifies the Table format
             and is the default for append operations
"""

# map object types
_TYPE_MAP = {
    Series: u'series',
    SparseSeries: u'sparse_series',
    DataFrame: u'frame',
    SparseDataFrame: u'sparse_frame',
}

# storer class map
_STORER_MAP = {
    u'Series': 'LegacySeriesFixed',
    u'DataFrame': 'LegacyFrameFixed',
    u'DataMatrix': 'LegacyFrameFixed',
    u'series': 'SeriesFixed',
    u'sparse_series': 'SparseSeriesFixed',
    u'frame': 'FrameFixed',
    u'sparse_frame': 'SparseFrameFixed',
}

# table class map
_TABLE_MAP = {
    u'generic_table': 'GenericTable',
    u'appendable_series': 'AppendableSeriesTable',
    u'appendable_multiseries': 'AppendableMultiSeriesTable',
    u'appendable_frame': 'AppendableFrameTable',
    u'appendable_multiframe': 'AppendableMultiFrameTable',
    u'worm': 'WORMTable',
}

# axes map
_AXES_MAP = {
    DataFrame: [0],
}

# register our configuration options
dropna_doc = """
: boolean
    drop ALL nan rows when appending to a table
"""
format_doc = """
: format
    default format writing format, if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix('io.hdf'):
    config.register_option('dropna_table', False, dropna_doc,
                           validator=config.is_bool)
    config.register_option(
        'default_format', None, format_doc,
        validator=config.is_one_of_factory(['fixed', 'table', None])
    )

# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False


def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables
        _table_mod = tables

        # version requirements
        if LooseVersion(tables.__version__) < LooseVersion('3.0.0'):
            raise ImportError("PyTables version >= 3.0.0 is required")

        # set the file open policy
        # return the file open policy; this changes as of pytables 3.1
        # depending on the HDF5 version
        try:
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == 'strict')
        except AttributeError:
            pass

    return _table_mod


# interface to/from ###

def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
           append=None, **kwargs):
    """ store this object, close it if we opened it """
    if append:
        f = lambda store: store.append(key, value, **kwargs)
    else:
        f = lambda store: store.put(key, value, **kwargs)

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, string_types):
        with HDFStore(path_or_buf, mode=mode, complevel=complevel,
                      complib=complib) as store:
            f(store)
    else:
        f(path_or_buf)
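
# Usage sketch (illustrative only, not executed on import): ``to_hdf``
# dispatches to ``HDFStore.put`` or ``HDFStore.append``. The file name
# 'example.h5' and the frame ``df`` are assumptions for the example.
#
# >>> df = DataFrame({'a': [1, 2, 3]})
# >>> to_hdf('example.h5', 'df', df, mode='w')     # put (fixed format)
# >>> to_hdf('example.h5', 'df2', df, append=True) # append (table format)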
def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria

    Parameters
    ----------
    path_or_buf : string, buffer or path object
        Path to the file to open, or an open :class:`pandas.HDFStore` object.
        Supports any object implementing the ``__fspath__`` protocol.
        This includes :class:`pathlib.Path` and py._path.local.LocalPath
        objects.

        .. versionadded:: 0.19.0 support for pathlib, py.path.

        .. versionadded:: 0.21.0 support for __fspath__ protocol.

    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, optional
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of columns names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    item : object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    DataFrame.to_hdf : Write a HDF file from a DataFrame.
    HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
    >>> df.to_hdf('./store.h5', 'data')
    >>> reread = pd.read_hdf('./store.h5')
    """

    if mode not in ['r', 'r+', 'a']:
        raise ValueError('mode {0} is not allowed while performing a read. '
                         'Allowed modes are r, r+ and a.'.format(mode))
    # grab the scope
    if 'where' in kwargs:
        kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise IOError('The HDFStore must be open for reading.')

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = _stringify_path(path_or_buf)
        if not isinstance(path_or_buf, string_types):
            raise NotImplementedError('Support for generic buffers has not '
                                      'been implemented.')
        try:
            exists = os.path.exists(path_or_buf)

        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise compat.FileNotFoundError(
                'File {path} does not exist'.format(path=path_or_buf))

        store = HDFStore(path_or_buf, mode=mode, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError('No dataset in HDF5 file.')
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError('key must be provided when HDF5 file '
                                     'contains multiple datasets.')
            key = candidate_only_group._v_pathname
        return store.select(key, auto_close=auto_close, **kwargs)
    except (ValueError, TypeError):
        # if there is an error, close the store
        try:
            store.close()
        except AttributeError:
            pass

        raise
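
# Usage sketch (illustrative only): reading back with and without a key,
# and with a ``where`` selection (the latter requires the object to have
# been written in table format). File and key names are assumptions.
#
# >>> df = read_hdf('example.h5', 'df')             # explicit key
# >>> df = read_hdf('single.h5')                    # key inferred if unique
# >>> df = read_hdf('example.h5', 'df2',
# ...               where='index > 0')              # table-format query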
def _is_metadata_of(group, parent_group):
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == 'meta':
            return True
        current = current._v_parent
    return False


class HDFStore(StringMixin):

    """
    Dict-like IO interface for storing pandas objects in PyTables
    either Fixed or Table format.

    Parameters
    ----------
    path : string
        File path to HDF5 file
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available raises
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()
    """

    def __init__(self, path, mode=None, complevel=None, complib=None,
                 fletcher32=False, **kwargs):

        if 'format' in kwargs:
            raise ValueError('format is not a defined argument for HDFStore')

        try:
            import tables  # noqa
        except ImportError as ex:  # pragma: no cover
            raise ImportError('HDFStore requires PyTables, "{ex!s}" problem '
                              'importing'.format(ex=ex))

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                "complib only supports {libs} compression.".format(
                    libs=tables.filters.all_complibs))

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = _stringify_path(path)
        if mode is None:
            mode = 'a'
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self):
        return self._path

    @property
    def root(self):
        """ return the root node """
        self._check_if_open()
        return self._handle.root

    @property
    def filename(self):
        return self._path

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.put(key, value)

    def __delitem__(self, key):
        return self.remove(key)

    def __getattr__(self, name):
        """ allow attribute access to get stores """
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            "'{object}' object has no attribute '{name}'".format(
                object=type(self).__name__, name=name))

    def __contains__(self, key):
        """ check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if name == key or name[1:] == key:
                return True
        return False

    def __len__(self):
        return len(self.groups())

    def __unicode__(self):
        return '{type}\nFile path: {path}\n'.format(
            type=type(self), path=pprint_thing(self._path))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def keys(self):
        """
        Return a (potentially unordered) list of the keys corresponding to the
        objects stored in the HDFStore. These are ABSOLUTE path-names (e.g.
        have the leading '/')
        """
        return [n._v_pathname for n in self.groups()]

    def __iter__(self):
        return iter(self.keys())

    def items(self):
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    iteritems = items

    def open(self, mode='a', **kwargs):
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        """
        tables = _tables()

        if self._mode != mode:

            # if we are changing a write mode to read, ok
            if self._mode in ['a', 'w'] and mode in ['r', 'r+']:
                pass
            elif mode in ['w']:

                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        "Re-opening the file [{0}] with mode [{1}] "
                        "will delete the current file!"
                        .format(self._path, self._mode)
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(self._complevel, self._complib,
                                              fletcher32=self._fletcher32)

        try:
            self._handle = tables.open_file(self._path, self._mode, **kwargs)
        except (IOError) as e:  # pragma: no cover
            if 'can not be written' in str(e):
                print(
                    'Opening {path} in read-only mode'.format(path=self._path))
                self._handle = tables.open_file(self._path, 'r', **kwargs)
            else:
                raise

        except (ValueError) as e:

            # trap PyTables >= 3.1 FILE_OPEN_POLICY exception
            # to provide an updated message
            if 'FILE_OPEN_POLICY' in str(e):
                e = ValueError(
                    "PyTables [{version}] no longer supports opening multiple "
                    "files\n"
                    "even in read-only mode on this HDF5 version "
                    "[{hdf_version}]. You can accept this\n"
                    "and not open the same file multiple times at once,\n"
                    "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
                    "which allows\n"
                    "files to be opened multiple times at once\n"
                    .format(version=tables.__version__,
                            hdf_version=tables.get_hdf5_version()))

            raise e

        except (Exception) as e:

            # trying to read from a non-existent file causes an error which
            # is not part of IOError, make it one
            if self._mode == 'r' and 'Unable to open/create file' in str(e):
                raise IOError(str(e))
            raise

    def close(self):
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self):
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync=False):
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                try:
                    os.fsync(self._handle.fileno())
                except OSError:
                    pass

    def get(self, key):
        """
        Retrieve pandas object stored in file

        Parameters
        ----------
        key : object

        Returns
        -------
        obj : same type as object stored in file
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError('No object named {key} in the file'.format(key=key))
        return self._read_group(group)

    def select(self, key, where=None, start=None, stop=None, columns=None,
               iterator=False, chunksize=None, auto_close=False, **kwargs):
        """
        Retrieve pandas object stored in file, optionally based on where
        criteria

        Parameters
        ----------
        key : object
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        columns : a list of columns that if not None, will limit the return
            columns
        iterator : boolean, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator
        auto_close : boolean, should automatically close the store when
            finished, default is False

        Returns
        -------
        The selected object
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError('No object named {key} in the file'.format(key=key))

        # create the storer and axes
        where = _ensure_term(where, scope_level=1)
        s = self._create_storer(group)
        s.infer_axes()

        # function to call on iteration
        def func(_start, _stop, _where):
            return s.read(start=_start, stop=_stop,
                          where=_where,
                          columns=columns)

        # create the iterator
        it = TableIterator(self, s, func, where=where, nrows=s.nrows,
                           start=start, stop=stop, iterator=iterator,
                           chunksize=chunksize, auto_close=auto_close)

        return it.get_result()
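
    # Usage sketch (illustrative only): selecting subsets from a
    # table-format node; names and the 'process' helper are assumptions.
    #
    # >>> store = HDFStore('example.h5')
    # >>> store.append('df', df, data_columns=['x'])
    # >>> subset = store.select('df', where='x > 0', columns=['x', 'y'])
    # >>> for chunk in store.select('df', chunksize=10000):
    # ...     process(chunk)  # 'process' is a hypothetical consumer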
    def select_as_coordinates(
            self, key, where=None, start=None, stop=None, **kwargs):
        """
        return the selection as an Index

        Parameters
        ----------
        key : object
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        """
        where = _ensure_term(where, scope_level=1)
        return self.get_storer(key).read_coordinates(where=where, start=start,
                                                     stop=stop, **kwargs)

    def select_column(self, key, column, **kwargs):
        """
        return a single column from the table. This is generally only useful
        to select an indexable

        Parameters
        ----------
        key : object
        column : the column of interest

        Exceptions
        ----------
        raises KeyError if the column is not found (or key is not a valid
            store)
        raises ValueError if the column can not be extracted individually (it
            is part of a data block)
        """
        return self.get_storer(key).read_column(column=column, **kwargs)

    def select_as_multiple(self, keys, where=None, selector=None, columns=None,
                           start=None, stop=None, iterator=False,
                           chunksize=None, auto_close=False, **kwargs):
        """ Retrieve pandas objects from multiple tables

        Parameters
        ----------
        keys : a list of the tables
        selector : the table to apply the where criteria (defaults to keys[0]
            if not supplied)
        columns : the columns I want back
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        iterator : boolean, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator

        Exceptions
        ----------
        raises KeyError if keys or selector is not found or keys is empty
        raises TypeError if keys is not a list or tuple
        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
        """

        # default to single select
        where = _ensure_term(where, scope_level=1)
        if isinstance(keys, (list, tuple)) and len(keys) == 1:
            keys = keys[0]
        if isinstance(keys, string_types):
            return self.select(key=keys, where=where, columns=columns,
                               start=start, stop=stop, iterator=iterator,
                               chunksize=chunksize, **kwargs)

        if not isinstance(keys, (list, tuple)):
            raise TypeError("keys must be a list/tuple")

        if not len(keys):
            raise ValueError("keys must have a non-zero length")

        if selector is None:
            selector = keys[0]

        # collect the tables
        tbls = [self.get_storer(k) for k in keys]
        s = self.get_storer(selector)

        # validate rows
        nrows = None
        for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
            if t is None:
                raise KeyError("Invalid table [{key}]".format(key=k))
            if not t.is_table:
                raise TypeError(
                    "object [{obj}] is not a table, and cannot be used in all "
                    "select as multiple".format(obj=t.pathname)
                )

            if nrows is None:
                nrows = t.nrows
            elif t.nrows != nrows:
                raise ValueError(
                    "all tables must have exactly the same nrows!")

        # axis is the concatenation axis
        axis = list({t.non_index_axes[0][0] for t in tbls})[0]

        def func(_start, _stop, _where):

            # retrieve the objs, _where is always passed as a set of
            # coordinates here
            objs = [t.read(where=_where, columns=columns, start=_start,
                           stop=_stop, **kwargs) for t in tbls]

            # concat and return
            return concat(objs, axis=axis,
                          verify_integrity=False)._consolidate()

        # create the iterator
        it = TableIterator(self, s, func, where=where, nrows=nrows,
                           start=start, stop=stop, iterator=iterator,
                           chunksize=chunksize, auto_close=auto_close)

        return it.get_result(coordinates=True)
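
    # Usage sketch (illustrative only): reading back rows split across two
    # tables with ``append_to_multiple`` (defined below). The where clause
    # is evaluated against the selector table only; names are assumptions.
    #
    # >>> combined = store.select_as_multiple(['df1', 'df2'],
    # ...                                     where='a > 0',
    # ...                                     selector='df1')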
    def put(self, key, value, format=None, append=False, **kwargs):
        """
        Store object in HDFStore

        Parameters
        ----------
        key : object
        value : {Series, DataFrame}
        format : 'fixed(f)|table(t)', default is 'fixed'
            fixed(f) : Fixed format
                       Fast writing/reading. Not-appendable, nor searchable
            table(t) : Table format
                       Write as a PyTables Table structure which may perform
                       worse but allow more flexible operations like searching
                       / selecting subsets of the data
        append : boolean, default False
            This will force Table format, append the input data to the
            existing.
        data_columns : list of columns to create as data columns, or True to
            use all columns. See
            `here <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__  # noqa
        encoding : default None, provide an encoding for strings
        dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'
        """
        if format is None:
            format = get_option("io.hdf.default_format") or 'fixed'
        kwargs = self._validate_format(format, kwargs)
        self._write_to_group(key, value, append=append, **kwargs)
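
    # Usage sketch (illustrative only): 'fixed' writes fast but cannot be
    # appended to or queried; 'table' supports where-based selection.
    # Key names are assumptions.
    #
    # >>> store.put('fixed_df', df)                    # fixed format
    # >>> store.put('table_df', df, format='table')    # queryable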
    def remove(self, key, where=None, start=None, stop=None):
        """
        Remove pandas object partially by specifying the where condition

        Parameters
        ----------
        key : string
            Node to remove or delete rows from
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection

        Returns
        -------
        number of rows removed (or None if not a Table)

        Exceptions
        ----------
        raises KeyError if key is not a valid store
        """
        where = _ensure_term(where, scope_level=1)
        try:
            s = self.get_storer(key)
        except KeyError:
            # the key is not a valid store, re-raising KeyError
            raise
        except Exception:

            if where is not None:
                raise ValueError(
                    "trying to remove a node with a non-None where clause!")

            # we are actually trying to remove a node (with children)
            s = self.get_node(key)
            if s is not None:
                s._f_remove(recursive=True)
                return None

        # remove the node
        if com._all_none(where, start, stop):
            s.group._f_remove(recursive=True)

        # delete from the table
        else:
            if not s.is_table:
                raise ValueError(
                    'can only remove with where on objects written as tables')
            return s.delete(where=where, start=start, stop=stop)

    def append(self, key, value, format=None, append=True, columns=None,
               dropna=None, **kwargs):
        """
        Append to Table in file. Node must already exist and be Table
        format.

        Parameters
        ----------
        key : object
        value : {Series, DataFrame}
        format : 'table' is the default
            table(t) : table format
                       Write as a PyTables Table structure which may perform
                       worse but allow more flexible operations like searching
                       / selecting subsets of the data
        append : boolean, default True, append the input data to the
            existing
        data_columns : list of columns, or True, default None
            List of columns to create as indexed data columns for on-disk
            queries, or True to use all columns. By default only the axes
            of the object are indexed. See `here
            <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.
        min_itemsize : dict of columns that specify minimum string sizes
        nan_rep : string to use as string nan representation
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for strings
        dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'

        Notes
        -----
        Does *not* check if data being appended overlaps with existing
        data in the table, so be careful
        """
        if columns is not None:
            raise TypeError("columns is not a supported keyword in append, "
                            "try data_columns")

        if dropna is None:
            dropna = get_option("io.hdf.dropna_table")
        if format is None:
            format = get_option("io.hdf.default_format") or 'table'
        kwargs = self._validate_format(format, kwargs)
        self._write_to_group(key, value, append=append, dropna=dropna,
                             **kwargs)
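
    # Usage sketch (illustrative only): appending chunks to one table,
    # reserving string width up front with min_itemsize. The chunk frames
    # are assumptions.
    #
    # >>> store.append('log', df_chunk1, data_columns=['user'],
    # ...              min_itemsize={'user': 32})
    # >>> store.append('log', df_chunk2)   # rows accumulate in 'log'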
    def append_to_multiple(self, d, value, selector, data_columns=None,
                           axes=None, dropna=False, **kwargs):
        """
        Append to multiple tables

        Parameters
        ----------
        d : a dict of table_name to table_columns, None is acceptable as the
            values of one node (this will get all the remaining columns)
        value : a pandas object
        selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
            passed, in which case these are used
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
            row in each table has all NaN. Default False.

        Notes
        -----
        axes parameter is currently not accepted
        """
        if axes is not None:
            raise TypeError("axes is currently not accepted as a parameter to"
                            " append_to_multiple; you can create the "
                            "tables independently instead")

        if not isinstance(d, dict):
            raise ValueError(
                "append_to_multiple must have a dictionary specified as the "
                "way to split the value"
            )

        if selector not in d:
            raise ValueError(
                "append_to_multiple requires a selector that is in passed dict"
            )

        # figure out the splitting axis (the non_index_axis)
        axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]

        # figure out how to split the value
        remain_key = None
        remain_values = []
        for k, v in d.items():
            if v is None:
                if remain_key is not None:
                    raise ValueError(
                        "append_to_multiple can only have one value in d that "
                        "is None"
                    )
                remain_key = k
            else:
                remain_values.extend(v)
        if remain_key is not None:
            ordered = value.axes[axis]
            ordd = ordered.difference(Index(remain_values))
            ordd = sorted(ordered.get_indexer(ordd))
            d[remain_key] = ordered.take(ordd)

        # data_columns
        if data_columns is None:
            data_columns = d[selector]

        # ensure rows are synchronized across the tables
        if dropna:
            idxs = (value[cols].dropna(how='all').index for cols in d.values())
            valid_index = next(idxs)
            for index in idxs:
                valid_index = valid_index.intersection(index)
            value = value.loc[valid_index]

        # append
        for k, v in d.items():
            dc = data_columns if k == selector else None

            # compute the val
            val = value.reindex(v, axis=axis)

            self.append(k, val, data_columns=dc, **kwargs)
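
    # Usage sketch (illustrative only): splitting one frame's columns
    # across two tables; 'df1' holds columns a/b and is the selector,
    # 'df2' (mapped to None) receives the remaining columns.
    #
    # >>> store.append_to_multiple({'df1': ['a', 'b'], 'df2': None},
    # ...                          df, selector='df1')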
    def create_table_index(self, key, **kwargs):
        """ Create a pytables index on the table

        Parameters
        ----------
        key : object (the node to index)

        Exceptions
        ----------
        raises if the node is not a table
        """

        # version requirements
        _tables()
        s = self.get_storer(key)
        if s is None:
            return

        if not s.is_table:
            raise TypeError(
                "cannot create table index on a Fixed format store")
        s.create_index(**kwargs)

    def groups(self):
        """return a list of all the top-level nodes (that are not themselves a
        pandas storage object)
        """
        _tables()
        self._check_if_open()
        return [
            g for g in self._handle.walk_groups()
            if (not isinstance(g, _table_mod.link.Link) and
                (getattr(g._v_attrs, 'pandas_type', None) or
                 getattr(g, 'table', None) or
                 (isinstance(g, _table_mod.table.Table) and
                  g._v_name != u'table')))
        ]

    def walk(self, where="/"):
        """ Walk the pytables group hierarchy for pandas objects

        This generator will yield the group path, subgroups and pandas object
        names for each group.
        Any non-pandas PyTables objects that are not a group will be ignored.

        The `where` group itself is listed first (preorder), then each of its
        child groups (following an alphanumerical order) is also traversed,
        following the same procedure.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        where : str, optional
            Group where to start walking.
            If not supplied, the root group is used.

        Yields
        ------
        path : str
            Full path to a group (without trailing '/')
        groups : list of str
            names of the groups contained in `path`
        leaves : list of str
            names of the pandas objects contained in `path`
        """
        _tables()
        self._check_if_open()
        for g in self._handle.walk_groups(where):
            if getattr(g._v_attrs, 'pandas_type', None) is not None:
                continue

            groups = []
            leaves = []
            for child in g._v_children.values():
                pandas_type = getattr(child._v_attrs, 'pandas_type', None)
                if pandas_type is None:
                    if isinstance(child, _table_mod.group.Group):
                        groups.append(child._v_name)
                else:
                    leaves.append(child._v_name)

            yield (g._v_pathname.rstrip('/'), groups, leaves)
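
    # Usage sketch (illustrative only): os.walk-style traversal of the
    # group hierarchy.
    #
    # >>> for path, groups, leaves in store.walk():
    # ...     for leaf in leaves:
    # ...         print('{}/{}'.format(path, leaf))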
    def get_node(self, key):
        """ return the node with the key or None if it does not exist """
        self._check_if_open()
        try:
            if not key.startswith('/'):
                key = '/' + key
            return self._handle.get_node(self.root, key)
        except _table_mod.exceptions.NoSuchNodeError:
            return None

    def get_storer(self, key):
        """ return the storer object for a key, raise if not in the file """
        group = self.get_node(key)
        if group is None:
            raise KeyError('No object named {key} in the file'.format(key=key))
        s = self._create_storer(group)
        s.infer_axes()
        return s

    def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
             complevel=None, fletcher32=False, overwrite=True):
        """ copy the existing store to a new file, upgrading in place

        Parameters
        ----------
        propindexes : restore indexes in copied file (defaults to True)
        keys : list of keys to include in the copy (defaults to all)
        overwrite : overwrite (remove and replace) existing nodes in the
            new store (default is True)
        mode, complib, complevel, fletcher32 same as in HDFStore.__init__

        Returns
        -------
        open file handle of the new store
        """
        new_store = HDFStore(
            file,
            mode=mode,
            complib=complib,
            complevel=complevel,
            fletcher32=fletcher32)
        if keys is None:
            keys = list(self.keys())
        if not isinstance(keys, (tuple, list)):
            keys = [keys]
        for k in keys:
            s = self.get_storer(k)
            if s is not None:

                if k in new_store:
                    if overwrite:
                        new_store.remove(k)

                data = self.select(k)
                if s.is_table:

                    index = False
                    if propindexes:
                        index = [a.name for a in s.axes if a.is_indexed]
                    new_store.append(
                        k, data, index=index,
                        data_columns=getattr(s, 'data_columns', None),
                        encoding=s.encoding
                    )
                else:
                    new_store.put(k, data, encoding=s.encoding)

        return new_store
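
    # Usage sketch (illustrative only): rewriting a store into a new,
    # compressed file; the caller is responsible for closing the returned
    # handle. The target file name is an assumption.
    #
    # >>> new = store.copy('compressed.h5', complib='blosc', complevel=9)
    # >>> new.close()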
    def info(self):
        """
        Print detailed information on the store.

        .. versionadded:: 0.21.0
        """
        output = '{type}\nFile path: {path}\n'.format(
            type=type(self), path=pprint_thing(self._path))

        if self.is_open:
            lkeys = sorted(list(self.keys()))
            if len(lkeys):
                keys = []
                values = []

                for k in lkeys:
                    try:
                        s = self.get_storer(k)
                        if s is not None:
                            keys.append(pprint_thing(s.pathname or k))
                            values.append(
                                pprint_thing(s or 'invalid_HDFStore node'))
                    except Exception as detail:
                        keys.append(k)
                        values.append(
                            "[invalid_HDFStore node: {detail}]".format(
                                detail=pprint_thing(detail)))

                output += adjoin(12, keys, values)
            else:
                output += 'Empty'
        else:
            output += "File is CLOSED"

        return output

    # private methods ######
    def _check_if_open(self):
        if not self.is_open:
            raise ClosedFileError("{0} file is not open!".format(self._path))

    def _validate_format(self, format, kwargs):
        """ validate / deprecate formats; return the new kwargs """
        kwargs = kwargs.copy()

        # validate
        try:
            kwargs['format'] = _FORMAT_MAP[format.lower()]
        except KeyError:
            raise TypeError("invalid HDFStore format specified [{0}]"
                            .format(format))

        return kwargs

    def _create_storer(self, group, format=None, value=None, append=False,
                       **kwargs):
        """ return a suitable class to operate """

        def error(t):
            raise TypeError(
                "cannot properly create the storer for: [{t}] [group->"
                "{group},value->{value},format->{format},append->{append},"
                "kwargs->{kwargs}]".format(t=t, group=group,
                                           value=type(value), format=format,
                                           append=append, kwargs=kwargs))

        pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))
        tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))

        # infer the pt from the passed value
        if pt is None:
            if value is None:

                _tables()
                if (getattr(group, 'table', None) or
                        isinstance(group, _table_mod.table.Table)):
                    pt = u'frame_table'
                    tt = u'generic_table'
                else:
                    raise TypeError(
                        "cannot create a storer if the object does not exist "
                        "and no value is passed")
            else:

                try:
                    pt = _TYPE_MAP[type(value)]
                except KeyError:
                    error('_TYPE_MAP')

                # we are actually a table
                if format == 'table':
                    pt += u'_table'

        # a storer node
        if u'table' not in pt:
            try:
                return globals()[_STORER_MAP[pt]](self, group, **kwargs)
            except KeyError:
                error('_STORER_MAP')

        # existing node (and must be a table)
        if tt is None:

            # if we are a writer, determine the tt
            if value is not None:

                if pt == u'series_table':
                    index = getattr(value, 'index', None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = u'appendable_series'
                        elif index.nlevels > 1:
                            tt = u'appendable_multiseries'
                elif pt == u'frame_table':
                    index = getattr(value, 'index', None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = u'appendable_frame'
                        elif index.nlevels > 1:
                            tt = u'appendable_multiframe'
                elif pt == u'wide_table':
                    tt = u'appendable_panel'
                elif pt == u'ndim_table':
                    tt = u'appendable_ndim'

            else:

                # distinguish between a frame/table
                tt = u'legacy_panel'
                try:
                    fields = group.table._v_attrs.fields
                    if len(fields) == 1 and fields[0] == u'value':
                        tt = u'legacy_frame'
                except IndexError:
                    pass

        try:
            return globals()[_TABLE_MAP[tt]](self, group, **kwargs)
        except KeyError:
            error('_TABLE_MAP')

    def _write_to_group(self, key, value, format, index=True, append=False,
                        complib=None, encoding=None, **kwargs):
        group = self.get_node(key)

        # remove the node if we are not appending
        if group is not None and not append:
            self._handle.remove_node(group, recursive=True)
            group = None

        # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
        if getattr(value, 'empty', None) and (format == 'table' or append):
            return

        if group is None:
            paths = key.split('/')

            # recursively create the groups
            path = '/'
            for p in paths:
                if not len(p):
                    continue
                new_path = path
                if not path.endswith('/'):
                    new_path += '/'
                new_path += p
                group = self.get_node(new_path)
                if group is None:
                    group = self._handle.create_group(path, p)
                path = new_path

        s = self._create_storer(group, format, value, append=append,
                                encoding=encoding, **kwargs)
        if append:
            # raise if we are trying to append to a Fixed format,
            # or a table that exists (and we are putting)
            if (not s.is_table or
                    (s.is_table and format == 'fixed' and s.is_exists)):
                raise ValueError('Can only append to Tables')
            if not s.is_exists:
                s.set_object_info()
        else:
            s.set_object_info()

        if not s.is_table and complib:
            raise ValueError(
                'Compression not supported on Fixed format stores'
            )

        # write the object
        s.write(obj=value, append=append, complib=complib, **kwargs)

        if s.is_table and index:
            s.create_index(columns=index)

    def _read_group(self, group, **kwargs):
        s = self._create_storer(group)
        s.infer_axes()
        return s.read(**kwargs)


class TableIterator(object):

    """ define the iteration interface on a table

    Parameters
    ----------
    store : the reference store
    s : the referred storer
    func : the function to execute the query
    where : the where of the query
    nrows : the rows to iterate on
    start : the passed start value (default is None)
    stop : the passed stop value (default is None)
    iterator : boolean, whether to use the default iterator
    chunksize : the passed chunking value (default is 100000)
    auto_close : boolean, automatically close the store at the end of
        iteration, default is False
    kwargs : the passed kwargs
    """

    def __init__(self, store, s, func, where, nrows, start=None, stop=None,
                 iterator=False, chunksize=None, auto_close=False):
        self.store = store
        self.s = s
        self.func = func
        self.where = where

        # set start/stop if they are not set if we are a table
        if self.s.is_table:
            if nrows is None:
                nrows = 0
            if start is None:
                start = 0
            if stop is None:
                stop = nrows
            stop = min(nrows, stop)

        self.nrows = nrows
        self.start = start
        self.stop = stop

        self.coordinates = None
        if iterator or chunksize is not None:
            if chunksize is None:
                chunksize = 100000
            self.chunksize = int(chunksize)
        else:
            self.chunksize = None

        self.auto_close = auto_close

    def __iter__(self):

        # iterate
        current = self.start
        while current < self.stop:

            stop = min(current + self.chunksize, self.stop)
            value = self.func(None, None, self.coordinates[current:stop])
            current = stop
            if value is None or not len(value):
                continue

            yield value

        self.close()

    def close(self):
        if self.auto_close:
            self.store.close()

    def get_result(self, coordinates=False):

        # return the actual iterator
        if self.chunksize is not None:
            if not self.s.is_table:
                raise TypeError(
                    "can only use an iterator or chunksize on a table")

            self.coordinates = self.s.read_coordinates(where=self.where)

            return self

        # if specified, read via coordinates (necessary for multiple
        # selections)
        if coordinates:
            where = self.s.read_coordinates(where=self.where, start=self.start,
                                            stop=self.stop)
        else:
            where = self.where

        # directly return the result
        results = self.func(self.start, self.stop, where)
        self.close()
        return results
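
# Usage sketch (illustrative only): TableIterator is what backs
# ``iterator=True`` / ``chunksize=`` in HDFStore.select; it is usually
# consumed indirectly. Key name is an assumption.
#
# >>> it = store.select('df', iterator=True, chunksize=50000)
# >>> total = sum(len(chunk) for chunk in it)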
class IndexCol(StringMixin):

    """ an index column description class

    Parameters
    ----------
    axis : axis which I reference
    values : the ndarray like converted values
    kind : a string description of this type
    typ : the pytables type
    pos : the position in the pytables
    """
    is_an_indexable = True
    is_data_indexable = True
    _info_fields = ['freq', 'tz', 'index_name']

    def __init__(self, values=None, kind=None, typ=None, cname=None,
                 itemsize=None, name=None, axis=None, kind_attr=None,
                 pos=None, freq=None, tz=None, index_name=None, **kwargs):
        self.values = values
        self.kind = kind
        self.typ = typ
        self.itemsize = itemsize
        self.name = name
        self.cname = cname
        self.kind_attr = kind_attr
        self.axis = axis
        self.pos = pos
        self.freq = freq
        self.tz = tz
        self.index_name = index_name
        self.table = None
        self.meta = None
        self.metadata = None

        if name is not None:
            self.set_name(name, kind_attr)
        if pos is not None:
            self.set_pos(pos)

    def set_name(self, name, kind_attr=None):
        """ set the name of this indexer """
        self.name = name
        self.kind_attr = kind_attr or "{name}_kind".format(name=name)
        if self.cname is None:
            self.cname = name

        return self

    def set_axis(self, axis):
        """ set the axis over which I index """
        self.axis = axis

        return self

    def set_pos(self, pos):
        """ set the position of this column in the Table """
        self.pos = pos
        if pos is not None and self.typ is not None:
            self.typ._v_pos = pos
        return self

    def set_table(self, table):
        self.table = table
        return self

    def __unicode__(self):
        temp = tuple(
            map(pprint_thing,
                (self.name,
                 self.cname,
                 self.axis,
                 self.pos,
                 self.kind)))
        return ','.join(("{key}->{value}".format(key=key, value=value)
                         for key, value in zip(
                             ['name', 'cname', 'axis', 'pos', 'kind'], temp)))

    def __eq__(self, other):
        """ compare 2 col items """
        return all(getattr(self, a, None) == getattr(other, a, None)
                   for a in ['name', 'cname', 'axis', 'pos'])

    def __ne__(self, other):
        return not self.__eq__(other)
    @property
    def is_indexed(self):
        """ return whether I am an indexed column """
        try:
            return getattr(self.table.cols, self.cname).is_indexed
        except AttributeError:
            # no table set yet (or no such column); treat as not indexed
            return False
  1324. def copy(self):
  1325. new_self = copy.copy(self)
  1326. return new_self
  1327. def infer(self, handler):
  1328. """infer this column from the table: create and return a new object"""
  1329. table = handler.table
  1330. new_self = self.copy()
  1331. new_self.set_table(table)
  1332. new_self.get_attr()
  1333. new_self.read_metadata(handler)
  1334. return new_self
  1335. def convert(self, values, nan_rep, encoding, errors):
  1336. """ set the values from this selection: take = take ownership """
  1337. # values is a recarray
  1338. if values.dtype.fields is not None:
  1339. values = values[self.cname]
  1340. values = _maybe_convert(values, self.kind, encoding, errors)
  1341. kwargs = dict()
  1342. if self.freq is not None:
  1343. kwargs['freq'] = _ensure_decoded(self.freq)
  1344. if self.index_name is not None:
  1345. kwargs['name'] = _ensure_decoded(self.index_name)
  1346. # making an Index instance could throw a number of different errors
  1347. try:
  1348. self.values = Index(values, **kwargs)
  1349. except Exception: # noqa: E722
  1350. # if the output freq is different that what we recorded,
  1351. # it should be None (see also 'doc example part 2')
  1352. if 'freq' in kwargs:
  1353. kwargs['freq'] = None
  1354. self.values = Index(values, **kwargs)
  1355. self.values = _set_tz(self.values, self.tz)
  1356. return self
  1357. def take_data(self):
  1358. """ return the values & release the memory """
  1359. self.values, values = None, self.values
  1360. return values
  1361. @property
  1362. def attrs(self):
  1363. return self.table._v_attrs
  1364. @property
  1365. def description(self):
  1366. return self.table.description
  1367. @property
  1368. def col(self):
  1369. """ return my current col description """
  1370. return getattr(self.description, self.cname, None)
  1371. @property
  1372. def cvalues(self):
  1373. """ return my cython values """
  1374. return self.values
  1375. def __iter__(self):
  1376. return iter(self.values)
  1377. def maybe_set_size(self, min_itemsize=None):
  1378. """ maybe set a string col itemsize:
  1379. min_itemsize can be an integer or a dict with this columns name
  1380. with an integer size """
  1381. if _ensure_decoded(self.kind) == u'string':
  1382. if isinstance(min_itemsize, dict):
  1383. min_itemsize = min_itemsize.get(self.name)
  1384. if min_itemsize is not None and self.typ.itemsize < min_itemsize:
  1385. self.typ = _tables(
  1386. ).StringCol(itemsize=min_itemsize, pos=self.pos)
  1387. def validate(self, handler, append):
  1388. self.validate_names()
  1389. def validate_names(self):
  1390. pass
  1391. def validate_and_set(self, handler, append):
  1392. self.set_table(handler.table)
  1393. self.validate_col()
  1394. self.validate_attr(append)
  1395. self.validate_metadata(handler)
  1396. self.write_metadata(handler)
  1397. self.set_attr()
  1398. def validate_col(self, itemsize=None):
  1399. """ validate this column: return the compared against itemsize """
  1400. # validate this column for string truncation (or reset to the max size)
  1401. if _ensure_decoded(self.kind) == u'string':
  1402. c = self.col
  1403. if c is not None:
  1404. if itemsize is None:
  1405. itemsize = self.itemsize
  1406. if c.itemsize < itemsize:
  1407. raise ValueError(
  1408. "Trying to store a string with len [{itemsize}] in "
  1409. "[{cname}] column but\nthis column has a limit of "
  1410. "[{c_itemsize}]!\nConsider using min_itemsize to "
  1411. "preset the sizes on these columns".format(
  1412. itemsize=itemsize, cname=self.cname,
  1413. c_itemsize=c.itemsize))
  1414. return c.itemsize
  1415. return None
  1416. def validate_attr(self, append):
  1417. # check for backwards incompatibility
  1418. if append:
  1419. existing_kind = getattr(self.attrs, self.kind_attr, None)
  1420. if existing_kind is not None and existing_kind != self.kind:
  1421. raise TypeError(
  1422. "incompatible kind in col [{existing} - "
  1423. "{self_kind}]".format(
  1424. existing=existing_kind, self_kind=self.kind))
  1425. def update_info(self, info):
  1426. """ set/update the info for this indexable with the key/value
  1427. if there is a conflict raise/warn as needed """
  1428. for key in self._info_fields:
  1429. value = getattr(self, key, None)
  1430. idx = _get_info(info, self.name)
  1431. existing_value = idx.get(key)
  1432. if key in idx and value is not None and existing_value != value:
  1433. # frequency/name just warn
  1434. if key in ['freq', 'index_name']:
  1435. ws = attribute_conflict_doc % (key, existing_value, value)
  1436. warnings.warn(ws, AttributeConflictWarning, stacklevel=6)
  1437. # reset
  1438. idx[key] = None
  1439. setattr(self, key, None)
  1440. else:
  1441. raise ValueError(
  1442. "invalid info for [{name}] for [{key}], "
  1443. "existing_value [{existing_value}] conflicts with "
  1444. "new value [{value}]".format(
  1445. name=self.name, key=key,
  1446. existing_value=existing_value, value=value))
  1447. else:
  1448. if value is not None or existing_value is not None:
  1449. idx[key] = value
  1450. return self
  1451. def set_info(self, info):
  1452. """ set my state from the passed info """
  1453. idx = info.get(self.name)
  1454. if idx is not None:
  1455. self.__dict__.update(idx)
  1456. def get_attr(self):
  1457. """ set the kind for this column """
  1458. self.kind = getattr(self.attrs, self.kind_attr, None)
  1459. def set_attr(self):
  1460. """ set the kind for this column """
  1461. setattr(self.attrs, self.kind_attr, self.kind)
  1462. def read_metadata(self, handler):
  1463. """ retrieve the metadata for this columns """
  1464. self.metadata = handler.read_metadata(self.cname)
  1465. def validate_metadata(self, handler):
  1466. """ validate that kind=category does not change the categories """
  1467. if self.meta == 'category':
  1468. new_metadata = self.metadata
  1469. cur_metadata = handler.read_metadata(self.cname)
  1470. if (new_metadata is not None and cur_metadata is not None and
  1471. not array_equivalent(new_metadata, cur_metadata)):
  1472. raise ValueError("cannot append a categorical with "
  1473. "different categories to the existing")
  1474. def write_metadata(self, handler):
  1475. """ set the meta data """
  1476. if self.metadata is not None:
  1477. handler.write_metadata(self.cname, self.metadata)
  1478. class GenericIndexCol(IndexCol):
  1479. """ an index which is not represented in the data of the table """
  1480. @property
  1481. def is_indexed(self):
  1482. return False
  1483. def convert(self, values, nan_rep, encoding, errors):
  1484. """ set the values from this selection: take = take ownership """
  1485. self.values = Int64Index(np.arange(self.table.nrows))
  1486. return self
  1487. def get_attr(self):
  1488. pass
  1489. def set_attr(self):
  1490. pass
  1491. class DataCol(IndexCol):
  1492. """ a data holding column, by definition this is not indexable
  1493. Parameters
  1494. ----------
  1495. data : the actual data
  1496. cname : the column name in the table to hold the data (typically
  1497. values)
  1498. meta : a string description of the metadata
  1499. metadata : the actual metadata
  1500. """
  1501. is_an_indexable = False
  1502. is_data_indexable = False
  1503. _info_fields = ['tz', 'ordered']
  1504. @classmethod
  1505. def create_for_block(
  1506. cls, i=None, name=None, cname=None, version=None, **kwargs):
  1507. """ return a new datacol with the block i """
  1508. if cname is None:
  1509. cname = name or 'values_block_{idx}'.format(idx=i)
  1510. if name is None:
  1511. name = cname
1512. # prior to 0.10.1, we named values blocks like: values_block_0 and the
1513. # name values_0
  1514. try:
  1515. if version[0] == 0 and version[1] <= 10 and version[2] == 0:
  1516. m = re.search(r"values_block_(\d+)", name)
  1517. if m:
  1518. name = "values_{group}".format(group=m.groups()[0])
  1519. except IndexError:
  1520. pass
  1521. return cls(name=name, cname=cname, **kwargs)
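# e.g. create_for_block(i=0) yields name == cname == 'values_block_0',
# while with version=(0, 10, 0) the legacy spelling 'values_0' is used.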
  1522. def __init__(self, values=None, kind=None, typ=None,
  1523. cname=None, data=None, meta=None, metadata=None,
  1524. block=None, **kwargs):
  1525. super(DataCol, self).__init__(values=values, kind=kind, typ=typ,
  1526. cname=cname, **kwargs)
  1527. self.dtype = None
  1528. self.dtype_attr = u'{name}_dtype'.format(name=self.name)
  1529. self.meta = meta
  1530. self.meta_attr = u'{name}_meta'.format(name=self.name)
  1531. self.set_data(data)
  1532. self.set_metadata(metadata)
  1533. def __unicode__(self):
  1534. temp = tuple(
  1535. map(pprint_thing,
  1536. (self.name,
  1537. self.cname,
  1538. self.dtype,
  1539. self.kind,
  1540. self.shape)))
  1541. return ','.join(("{key}->{value}".format(key=key, value=value)
  1542. for key, value in zip(
  1543. ['name', 'cname', 'dtype', 'kind', 'shape'], temp)))
  1544. def __eq__(self, other):
  1545. """ compare 2 col items """
  1546. return all(getattr(self, a, None) == getattr(other, a, None)
  1547. for a in ['name', 'cname', 'dtype', 'pos'])
  1548. def set_data(self, data, dtype=None):
  1549. self.data = data
  1550. if data is not None:
  1551. if dtype is not None:
  1552. self.dtype = dtype
  1553. self.set_kind()
  1554. elif self.dtype is None:
  1555. self.dtype = data.dtype.name
  1556. self.set_kind()
  1557. def take_data(self):
  1558. """ return the data & release the memory """
  1559. self.data, data = None, self.data
  1560. return data
  1561. def set_metadata(self, metadata):
  1562. """ record the metadata """
  1563. if metadata is not None:
  1564. metadata = np.array(metadata, copy=False).ravel()
  1565. self.metadata = metadata
  1566. def set_kind(self):
  1567. # set my kind if we can
  1568. if self.dtype is not None:
  1569. dtype = _ensure_decoded(self.dtype)
  1570. if dtype.startswith(u'string') or dtype.startswith(u'bytes'):
  1571. self.kind = 'string'
  1572. elif dtype.startswith(u'float'):
  1573. self.kind = 'float'
  1574. elif dtype.startswith(u'complex'):
  1575. self.kind = 'complex'
  1576. elif dtype.startswith(u'int') or dtype.startswith(u'uint'):
  1577. self.kind = 'integer'
  1578. elif dtype.startswith(u'date'):
  1579. self.kind = 'datetime'
  1580. elif dtype.startswith(u'timedelta'):
  1581. self.kind = 'timedelta'
  1582. elif dtype.startswith(u'bool'):
  1583. self.kind = 'bool'
  1584. else:
  1585. raise AssertionError(
  1586. "cannot interpret dtype of [{dtype}] in [{obj}]".format(
  1587. dtype=dtype, obj=self))
  1588. # set my typ if we need
  1589. if self.typ is None:
  1590. self.typ = getattr(self.description, self.cname, None)
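# A few of the mappings implied above: 'float64' -> 'float', 'uint32' ->
# 'integer', 'bool' -> 'bool', and anything starting with 'date' ->
# 'datetime'.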
  1591. def set_atom(self, block, block_items, existing_col, min_itemsize,
  1592. nan_rep, info, encoding=None, errors='strict'):
  1593. """ create and setup my atom from the block b """
  1594. self.values = list(block_items)
  1595. # short-cut certain block types
  1596. if block.is_categorical:
  1597. return self.set_atom_categorical(block, items=block_items,
  1598. info=info)
  1599. elif block.is_datetimetz:
  1600. return self.set_atom_datetime64tz(block, info=info)
  1601. elif block.is_datetime:
  1602. return self.set_atom_datetime64(block)
  1603. elif block.is_timedelta:
  1604. return self.set_atom_timedelta64(block)
  1605. elif block.is_complex:
  1606. return self.set_atom_complex(block)
  1607. dtype = block.dtype.name
  1608. inferred_type = lib.infer_dtype(block.values, skipna=False)
  1609. if inferred_type == 'date':
  1610. raise TypeError(
  1611. "[date] is not implemented as a table column")
  1612. elif inferred_type == 'datetime':
  1613. # after 8260
1614. # this would only be hit for a multi-timezone dtype
  1615. # which is an error
  1616. raise TypeError(
  1617. "too many timezones in this block, create separate "
  1618. "data columns"
  1619. )
  1620. elif inferred_type == 'unicode':
  1621. raise TypeError(
  1622. "[unicode] is not implemented as a table column")
1623. # this is basically a catchall; if, say, a datetime64 has nans then
1624. # it will end up here
  1625. elif inferred_type == 'string' or dtype == 'object':
  1626. self.set_atom_string(
  1627. block, block_items,
  1628. existing_col,
  1629. min_itemsize,
  1630. nan_rep,
  1631. encoding,
  1632. errors)
  1633. # set as a data block
  1634. else:
  1635. self.set_atom_data(block)
  1636. def get_atom_string(self, block, itemsize):
  1637. return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])
  1638. def set_atom_string(self, block, block_items, existing_col, min_itemsize,
  1639. nan_rep, encoding, errors):
  1640. # fill nan items with myself, don't disturb the blocks by
  1641. # trying to downcast
  1642. block = block.fillna(nan_rep, downcast=False)
  1643. if isinstance(block, list):
  1644. block = block[0]
  1645. data = block.values
  1646. # see if we have a valid string type
  1647. inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
  1648. if inferred_type != 'string':
  1649. # we cannot serialize this data, so report an exception on a column
  1650. # by column basis
  1651. for i, item in enumerate(block_items):
  1652. col = block.iget(i)
  1653. inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
  1654. if inferred_type != 'string':
  1655. raise TypeError(
  1656. "Cannot serialize the column [{item}] because\n"
  1657. "its data contents are [{type}] object dtype".format(
  1658. item=item, type=inferred_type)
  1659. )
  1660. # itemsize is the maximum length of a string (along any dimension)
  1661. data_converted = _convert_string_array(data, encoding, errors)
  1662. itemsize = data_converted.itemsize
  1663. # specified min_itemsize?
  1664. if isinstance(min_itemsize, dict):
  1665. min_itemsize = int(min_itemsize.get(
  1666. self.name) or min_itemsize.get('values') or 0)
  1667. itemsize = max(min_itemsize or 0, itemsize)
  1668. # check for column in the values conflicts
  1669. if existing_col is not None:
  1670. eci = existing_col.validate_col(itemsize)
  1671. if eci > itemsize:
  1672. itemsize = eci
  1673. self.itemsize = itemsize
  1674. self.kind = 'string'
  1675. self.typ = self.get_atom_string(block, itemsize)
  1676. self.set_data(data_converted.astype(
  1677. '|S{size}'.format(size=itemsize), copy=False))
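# User-level sketch of the min_itemsize handling above (`df` and `store`
# are hypothetical):
#
#     store.append('df', df, min_itemsize={'A': 50})  # reserve 50 bytes
#
# Without such a reservation, a later append whose strings exceed the
# width of the first chunk hits the validate_col error above.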
  1678. def get_atom_coltype(self, kind=None):
  1679. """ return the PyTables column class for this column """
  1680. if kind is None:
  1681. kind = self.kind
1682. if kind.startswith('uint'):
  1683. col_name = "UInt{name}Col".format(name=kind[4:])
  1684. else:
  1685. col_name = "{name}Col".format(name=kind.capitalize())
  1686. return getattr(_tables(), col_name)
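# e.g. kind 'float64' resolves to tables.Float64Col and kind 'uint32' to
# tables.UInt32Col (looked up on the _tables() module handle).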
  1687. def get_atom_data(self, block, kind=None):
  1688. return self.get_atom_coltype(kind=kind)(shape=block.shape[0])
  1689. def set_atom_complex(self, block):
  1690. self.kind = block.dtype.name
  1691. itemsize = int(self.kind.split('complex')[-1]) // 8
  1692. self.typ = _tables().ComplexCol(
  1693. itemsize=itemsize, shape=block.shape[0])
  1694. self.set_data(block.values.astype(self.typ.type, copy=False))
  1695. def set_atom_data(self, block):
  1696. self.kind = block.dtype.name
  1697. self.typ = self.get_atom_data(block)
  1698. self.set_data(block.values.astype(self.typ.type, copy=False))
  1699. def set_atom_categorical(self, block, items, info=None, values=None):
  1700. # currently only supports a 1-D categorical
  1701. # in a 1-D block
  1702. values = block.values
  1703. codes = values.codes
  1704. self.kind = 'integer'
  1705. self.dtype = codes.dtype.name
  1706. if values.ndim > 1:
  1707. raise NotImplementedError("only support 1-d categoricals")
  1708. if len(items) > 1:
  1709. raise NotImplementedError("only support single block categoricals")
  1710. # write the codes; must be in a block shape
  1711. self.ordered = values.ordered
  1712. self.typ = self.get_atom_data(block, kind=codes.dtype.name)
  1713. self.set_data(_block_shape(codes))
  1714. # write the categories
  1715. self.meta = 'category'
  1716. self.set_metadata(block.values.categories)
  1717. # update the info
  1718. self.update_info(info)
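# Sketch of the split performed above, for a hypothetical column:
#
#     cat = pd.Categorical(['a', 'b', 'a'])
#     cat.codes        # array([0, 1, 0], dtype=int8) -> stored as data
#     cat.categories   # Index(['a', 'b'])            -> stored as metadata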
  1719. def get_atom_datetime64(self, block):
  1720. return _tables().Int64Col(shape=block.shape[0])
  1721. def set_atom_datetime64(self, block, values=None):
  1722. self.kind = 'datetime64'
  1723. self.typ = self.get_atom_datetime64(block)
  1724. if values is None:
  1725. values = block.values.view('i8')
  1726. self.set_data(values, 'datetime64')
  1727. def set_atom_datetime64tz(self, block, info, values=None):
  1728. if values is None:
  1729. values = block.values
  1730. # convert this column to i8 in UTC, and save the tz
  1731. values = values.asi8.reshape(block.shape)
  1732. # store a converted timezone
  1733. self.tz = _get_tz(block.values.tz)
  1734. self.update_info(info)
  1735. self.kind = 'datetime64'
  1736. self.typ = self.get_atom_datetime64(block)
  1737. self.set_data(values, 'datetime64')
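# Sketch: tz-aware values are persisted as i8 nanoseconds in UTC with the
# zone kept in the info dict; convert() later restores the zone via
# _set_tz(..., coerce=True).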
  1738. def get_atom_timedelta64(self, block):
  1739. return _tables().Int64Col(shape=block.shape[0])
  1740. def set_atom_timedelta64(self, block, values=None):
  1741. self.kind = 'timedelta64'
  1742. self.typ = self.get_atom_timedelta64(block)
  1743. if values is None:
  1744. values = block.values.view('i8')
  1745. self.set_data(values, 'timedelta64')
  1746. @property
  1747. def shape(self):
  1748. return getattr(self.data, 'shape', None)
  1749. @property
  1750. def cvalues(self):
  1751. """ return my cython values """
  1752. return self.data
  1753. def validate_attr(self, append):
  1754. """validate that we have the same order as the existing & same dtype"""
  1755. if append:
  1756. existing_fields = getattr(self.attrs, self.kind_attr, None)
  1757. if (existing_fields is not None and
  1758. existing_fields != list(self.values)):
  1759. raise ValueError("appended items do not match existing items"
  1760. " in table!")
  1761. existing_dtype = getattr(self.attrs, self.dtype_attr, None)
  1762. if (existing_dtype is not None and
  1763. existing_dtype != self.dtype):
  1764. raise ValueError("appended items dtype do not match existing "
  1765. "items dtype in table!")
  1766. def convert(self, values, nan_rep, encoding, errors):
  1767. """set the data from this selection (and convert to the correct dtype
  1768. if we can)
  1769. """
  1770. # values is a recarray
  1771. if values.dtype.fields is not None:
  1772. values = values[self.cname]
  1773. self.set_data(values)
  1774. # use the meta if needed
  1775. meta = _ensure_decoded(self.meta)
  1776. # convert to the correct dtype
  1777. if self.dtype is not None:
  1778. dtype = _ensure_decoded(self.dtype)
  1779. # reverse converts
  1780. if dtype == u'datetime64':
  1781. # recreate with tz if indicated
  1782. self.data = _set_tz(self.data, self.tz, coerce=True)
  1783. elif dtype == u'timedelta64':
  1784. self.data = np.asarray(self.data, dtype='m8[ns]')
  1785. elif dtype == u'date':
  1786. try:
  1787. self.data = np.asarray(
  1788. [date.fromordinal(v) for v in self.data], dtype=object)
  1789. except ValueError:
  1790. self.data = np.asarray(
  1791. [date.fromtimestamp(v) for v in self.data],
  1792. dtype=object)
  1793. elif dtype == u'datetime':
  1794. self.data = np.asarray(
  1795. [datetime.fromtimestamp(v) for v in self.data],
  1796. dtype=object)
  1797. elif meta == u'category':
  1798. # we have a categorical
  1799. categories = self.metadata
  1800. codes = self.data.ravel()
  1801. # if we have stored a NaN in the categories
  1802. # then strip it; in theory we could have BOTH
  1803. # -1s in the codes and nulls :<
  1804. if categories is None:
  1805. # Handle case of NaN-only categorical columns in which case
  1806. # the categories are an empty array; when this is stored,
  1807. # pytables cannot write a zero-len array, so on readback
  1808. # the categories would be None and `read_hdf()` would fail.
  1809. categories = Index([], dtype=np.float64)
  1810. else:
  1811. mask = isna(categories)
  1812. if mask.any():
  1813. categories = categories[~mask]
  1814. codes[codes != -1] -= mask.astype(int).cumsum().values
  1815. self.data = Categorical.from_codes(codes,
  1816. categories=categories,
  1817. ordered=self.ordered)
  1818. else:
  1819. try:
  1820. self.data = self.data.astype(dtype, copy=False)
  1821. except TypeError:
  1822. self.data = self.data.astype('O', copy=False)
  1823. # convert nans / decode
  1824. if _ensure_decoded(self.kind) == u'string':
  1825. self.data = _unconvert_string_array(
  1826. self.data, nan_rep=nan_rep, encoding=encoding, errors=errors)
  1827. return self
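# Intent of the category cleanup above, illustrated: stored categories
# ['a', nan, 'b'] become ['a', 'b'], and a code of 2 (pointing at 'b')
# is shifted down to 1 so it still addresses the same category value.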
  1828. def get_attr(self):
  1829. """ get the data for this column """
  1830. self.values = getattr(self.attrs, self.kind_attr, None)
  1831. self.dtype = getattr(self.attrs, self.dtype_attr, None)
  1832. self.meta = getattr(self.attrs, self.meta_attr, None)
  1833. self.set_kind()
  1834. def set_attr(self):
  1835. """ set the data for this column """
  1836. setattr(self.attrs, self.kind_attr, self.values)
  1837. setattr(self.attrs, self.meta_attr, self.meta)
  1838. if self.dtype is not None:
  1839. setattr(self.attrs, self.dtype_attr, self.dtype)
  1840. class DataIndexableCol(DataCol):
  1841. """ represent a data column that can be indexed """
  1842. is_data_indexable = True
  1843. def validate_names(self):
  1844. if not Index(self.values).is_object():
  1845. raise ValueError("cannot have non-object label DataIndexableCol")
  1846. def get_atom_string(self, block, itemsize):
  1847. return _tables().StringCol(itemsize=itemsize)
  1848. def get_atom_data(self, block, kind=None):
  1849. return self.get_atom_coltype(kind=kind)()
  1850. def get_atom_datetime64(self, block):
  1851. return _tables().Int64Col()
  1852. def get_atom_timedelta64(self, block):
  1853. return _tables().Int64Col()
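# Note the contrast with DataCol above: indexable columns use scalar
# atoms (no shape argument) because each is written as its own table
# column rather than as part of a values block.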
  1854. class GenericDataIndexableCol(DataIndexableCol):
  1855. """ represent a generic pytables data column """
  1856. def get_attr(self):
  1857. pass
  1858. class Fixed(StringMixin):
  1859. """ represent an object in my store
  1860. facilitate read/write of various types of objects
  1861. this is an abstract base class
  1862. Parameters
  1863. ----------
  1864. parent : my parent HDFStore
  1865. group : the group node where the table resides
  1866. """
  1867. pandas_kind = None
  1868. obj_type = None
  1869. ndim = None
  1870. is_table = False
  1871. def __init__(self, parent, group, encoding=None, errors='strict',
  1872. **kwargs):
  1873. self.parent = parent
  1874. self.group = group
  1875. self.encoding = _ensure_encoding(encoding)
  1876. self.errors = errors
  1877. self.set_version()
  1878. @property
  1879. def is_old_version(self):
  1880. return (self.version[0] <= 0 and self.version[1] <= 10 and
  1881. self.version[2] < 1)
  1882. def set_version(self):
  1883. """ compute and set our version """
  1884. version = _ensure_decoded(
  1885. getattr(self.group._v_attrs, 'pandas_version', None))
  1886. try:
  1887. self.version = tuple(int(x) for x in version.split('.'))
  1888. if len(self.version) == 2:
  1889. self.version = self.version + (0,)
  1890. except AttributeError:
  1891. self.version = (0, 0, 0)
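# e.g. a stored 'pandas_version' of '0.15.2' becomes (0, 15, 2), '0.10'
# becomes (0, 10, 0), and a missing attribute falls back to (0, 0, 0).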
  1892. @property
  1893. def pandas_type(self):
  1894. return _ensure_decoded(getattr(self.group._v_attrs,
  1895. 'pandas_type', None))
  1896. @property
  1897. def format_type(self):
  1898. return 'fixed'
  1899. def __unicode__(self):
  1900. """ return a pretty representation of myself """
  1901. self.infer_axes()
  1902. s = self.shape
  1903. if s is not None:
  1904. if isinstance(s, (list, tuple)):
  1905. s = "[{shape}]".format(
  1906. shape=','.join(pprint_thing(x) for x in s))
  1907. return "{type:12.12} (shape->{shape})".format(
  1908. type=self.pandas_type, shape=s)
  1909. return self.pandas_type
  1910. def set_object_info(self):
  1911. """ set my pandas type & version """
  1912. self.attrs.pandas_type = str(self.pandas_kind)
  1913. self.attrs.pandas_version = str(_version)
  1914. self.set_version()
  1915. def copy(self):
  1916. new_self = copy.copy(self)
  1917. return new_self
  1918. @property
  1919. def storage_obj_type(self):
  1920. return self.obj_type
  1921. @property
  1922. def shape(self):
  1923. return self.nrows
  1924. @property
  1925. def pathname(self):
  1926. return self.group._v_pathname
  1927. @property
  1928. def _handle(self):
  1929. return self.parent._handle
  1930. @property
  1931. def _filters(self):
  1932. return self.parent._filters
  1933. @property
  1934. def _complevel(self):
  1935. return self.parent._complevel
  1936. @property
  1937. def _fletcher32(self):
  1938. return self.parent._fletcher32
  1939. @property
  1940. def _complib(self):
  1941. return self.parent._complib
  1942. @property
  1943. def attrs(self):
  1944. return self.group._v_attrs
  1945. def set_attrs(self):
  1946. """ set our object attributes """
  1947. pass
  1948. def get_attrs(self):
  1949. """ get our object attributes """
  1950. pass
  1951. @property
  1952. def storable(self):
  1953. """ return my storable """
  1954. return self.group
  1955. @property
  1956. def is_exists(self):
  1957. return False
  1958. @property
  1959. def nrows(self):
  1960. return getattr(self.storable, 'nrows', None)
  1961. def validate(self, other):
  1962. """ validate against an existing storable """
  1963. if other is None:
  1964. return
  1965. return True
  1966. def validate_version(self, where=None):
  1967. """ are we trying to operate on an old version? """
  1968. return True
  1969. def infer_axes(self):
  1970. """ infer the axes of my storer
  1971. return a boolean indicating if we have a valid storer or not """
  1972. s = self.storable
  1973. if s is None:
  1974. return False
  1975. self.get_attrs()
  1976. return True
  1977. def read(self, **kwargs):
  1978. raise NotImplementedError(
  1979. "cannot read on an abstract storer: subclasses should implement")
  1980. def write(self, **kwargs):
  1981. raise NotImplementedError(
  1982. "cannot write on an abstract storer: sublcasses should implement")
  1983. def delete(self, where=None, start=None, stop=None, **kwargs):
  1984. """
  1985. support fully deleting the node in its entirety (only) - where
  1986. specification must be None
  1987. """
  1988. if com._all_none(where, start, stop):
  1989. self._handle.remove_node(self.group, recursive=True)
  1990. return None
  1991. raise TypeError("cannot delete on an abstract storer")
  1992. class GenericFixed(Fixed):
  1993. """ a generified fixed version """
  1994. _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'}
  1995. _reverse_index_map = {v: k for k, v in compat.iteritems(_index_type_map)}
  1996. attributes = []
1997. # indexer helpers
  1998. def _class_to_alias(self, cls):
  1999. return self._index_type_map.get(cls, '')
  2000. def _alias_to_class(self, alias):
  2001. if isinstance(alias, type): # pragma: no cover
  2002. # compat: for a short period of time master stored types
  2003. return alias
  2004. return self._reverse_index_map.get(alias, Index)
  2005. def _get_index_factory(self, klass):
  2006. if klass == DatetimeIndex:
  2007. def f(values, freq=None, tz=None):
  2008. # data are already in UTC, localize and convert if tz present
  2009. result = DatetimeIndex._simple_new(values.values, name=None,
  2010. freq=freq)
  2011. if tz is not None:
  2012. result = result.tz_localize('UTC').tz_convert(tz)
  2013. return result
  2014. return f
  2015. elif klass == PeriodIndex:
  2016. def f(values, freq=None, tz=None):
  2017. return PeriodIndex._simple_new(values, name=None, freq=freq)
  2018. return f
  2019. return klass
  2020. def validate_read(self, kwargs):
  2021. """
2022. remove table keywords from kwargs and return;
2023. raise if any keywords are passed which are not None
  2024. """
  2025. kwargs = copy.copy(kwargs)
  2026. columns = kwargs.pop('columns', None)
  2027. if columns is not None:
  2028. raise TypeError("cannot pass a column specification when reading "
  2029. "a Fixed format store. this store must be "
  2030. "selected in its entirety")
  2031. where = kwargs.pop('where', None)
  2032. if where is not None:
  2033. raise TypeError("cannot pass a where specification when reading "
  2034. "from a Fixed format store. this store must be "
  2035. "selected in its entirety")
  2036. return kwargs
  2037. @property
  2038. def is_exists(self):
  2039. return True
  2040. def set_attrs(self):
  2041. """ set our object attributes """
  2042. self.attrs.encoding = self.encoding
  2043. self.attrs.errors = self.errors
  2044. def get_attrs(self):
  2045. """ retrieve our attributes """
  2046. self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None))
  2047. self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
  2048. for n in self.attributes:
  2049. setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
  2050. def write(self, obj, **kwargs):
  2051. self.set_attrs()
  2052. def read_array(self, key, start=None, stop=None):
  2053. """ read an array for the specified node (off of group """
  2054. import tables
  2055. node = getattr(self.group, key)
  2056. attrs = node._v_attrs
  2057. transposed = getattr(attrs, 'transposed', False)
  2058. if isinstance(node, tables.VLArray):
  2059. ret = node[0][start:stop]
  2060. else:
  2061. dtype = getattr(attrs, 'value_type', None)
  2062. shape = getattr(attrs, 'shape', None)
  2063. if shape is not None:
  2064. # length 0 axis
  2065. ret = np.empty(shape, dtype=dtype)
  2066. else:
  2067. ret = node[start:stop]
  2068. if dtype == u'datetime64':
  2069. # reconstruct a timezone if indicated
  2070. ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True)
  2071. elif dtype == u'timedelta64':
  2072. ret = np.asarray(ret, dtype='m8[ns]')
  2073. if transposed:
  2074. return ret.T
  2075. else:
  2076. return ret
  2077. def read_index(self, key, **kwargs):
  2078. variety = _ensure_decoded(
  2079. getattr(self.attrs, '{key}_variety'.format(key=key)))
  2080. if variety == u'multi':
  2081. return self.read_multi_index(key, **kwargs)
  2082. elif variety == u'block':
  2083. return self.read_block_index(key, **kwargs)
  2084. elif variety == u'sparseint':
  2085. return self.read_sparse_intindex(key, **kwargs)
  2086. elif variety == u'regular':
  2087. _, index = self.read_index_node(getattr(self.group, key), **kwargs)
  2088. return index
  2089. else: # pragma: no cover
  2090. raise TypeError(
  2091. 'unrecognized index variety: {variety}'.format(
  2092. variety=variety))
  2093. def write_index(self, key, index):
  2094. if isinstance(index, MultiIndex):
  2095. setattr(self.attrs, '{key}_variety'.format(key=key), 'multi')
  2096. self.write_multi_index(key, index)
  2097. elif isinstance(index, BlockIndex):
  2098. setattr(self.attrs, '{key}_variety'.format(key=key), 'block')
  2099. self.write_block_index(key, index)
  2100. elif isinstance(index, IntIndex):
  2101. setattr(self.attrs, '{key}_variety'.format(key=key), 'sparseint')
  2102. self.write_sparse_intindex(key, index)
  2103. else:
  2104. setattr(self.attrs, '{key}_variety'.format(key=key), 'regular')
  2105. converted = _convert_index(index, self.encoding, self.errors,
  2106. self.format_type).set_name('index')
  2107. self.write_array(key, converted.values)
  2108. node = getattr(self.group, key)
  2109. node._v_attrs.kind = converted.kind
  2110. node._v_attrs.name = index.name
  2111. if isinstance(index, (DatetimeIndex, PeriodIndex)):
  2112. node._v_attrs.index_class = self._class_to_alias(type(index))
  2113. if hasattr(index, 'freq'):
  2114. node._v_attrs.freq = index.freq
  2115. if hasattr(index, 'tz') and index.tz is not None:
  2116. node._v_attrs.tz = _get_tz(index.tz)
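# Sketch of what a tz-aware DatetimeIndex leaves on the node: kind and
# name attrs, index_class='datetime', plus freq and tz, which
# read_index_node later feeds back through _get_index_factory.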
  2117. def write_block_index(self, key, index):
  2118. self.write_array('{key}_blocs'.format(key=key), index.blocs)
  2119. self.write_array('{key}_blengths'.format(key=key), index.blengths)
  2120. setattr(self.attrs, '{key}_length'.format(key=key), index.length)
  2121. def read_block_index(self, key, **kwargs):
  2122. length = getattr(self.attrs, '{key}_length'.format(key=key))
  2123. blocs = self.read_array('{key}_blocs'.format(key=key), **kwargs)
  2124. blengths = self.read_array('{key}_blengths'.format(key=key), **kwargs)
  2125. return BlockIndex(length, blocs, blengths)
  2126. def write_sparse_intindex(self, key, index):
  2127. self.write_array('{key}_indices'.format(key=key), index.indices)
  2128. setattr(self.attrs, '{key}_length'.format(key=key), index.length)
  2129. def read_sparse_intindex(self, key, **kwargs):
  2130. length = getattr(self.attrs, '{key}_length'.format(key=key))
  2131. indices = self.read_array('{key}_indices'.format(key=key), **kwargs)
  2132. return IntIndex(length, indices)
  2133. def write_multi_index(self, key, index):
  2134. setattr(self.attrs, '{key}_nlevels'.format(key=key), index.nlevels)
  2135. for i, (lev, level_codes, name) in enumerate(zip(index.levels,
  2136. index.codes,
  2137. index.names)):
  2138. # write the level
  2139. level_key = '{key}_level{idx}'.format(key=key, idx=i)
  2140. conv_level = _convert_index(lev, self.encoding, self.errors,
  2141. self.format_type).set_name(level_key)
  2142. self.write_array(level_key, conv_level.values)
  2143. node = getattr(self.group, level_key)
  2144. node._v_attrs.kind = conv_level.kind
  2145. node._v_attrs.name = name
  2146. # write the name
  2147. setattr(node._v_attrs, '{key}_name{name}'.format(
  2148. key=key, name=name), name)
  2149. # write the labels
  2150. label_key = '{key}_label{idx}'.format(key=key, idx=i)
  2151. self.write_array(label_key, level_codes)
  2152. def read_multi_index(self, key, **kwargs):
  2153. nlevels = getattr(self.attrs, '{key}_nlevels'.format(key=key))
  2154. levels = []
  2155. codes = []
  2156. names = []
  2157. for i in range(nlevels):
  2158. level_key = '{key}_level{idx}'.format(key=key, idx=i)
  2159. name, lev = self.read_index_node(getattr(self.group, level_key),
  2160. **kwargs)
  2161. levels.append(lev)
  2162. names.append(name)
  2163. label_key = '{key}_label{idx}'.format(key=key, idx=i)
  2164. level_codes = self.read_array(label_key, **kwargs)
  2165. codes.append(level_codes)
  2166. return MultiIndex(levels=levels, codes=codes, names=names,
  2167. verify_integrity=True)
  2168. def read_index_node(self, node, start=None, stop=None):
  2169. data = node[start:stop]
  2170. # If the index was an empty array write_array_empty() will
2171. # have written a sentinel. Here we replace it with the original.
  2172. if ('shape' in node._v_attrs and
  2173. self._is_empty_array(getattr(node._v_attrs, 'shape'))):
  2174. data = np.empty(getattr(node._v_attrs, 'shape'),
  2175. dtype=getattr(node._v_attrs, 'value_type'))
  2176. kind = _ensure_decoded(node._v_attrs.kind)
  2177. name = None
  2178. if 'name' in node._v_attrs:
  2179. name = _ensure_str(node._v_attrs.name)
  2180. name = _ensure_decoded(name)
  2181. index_class = self._alias_to_class(_ensure_decoded(
  2182. getattr(node._v_attrs, 'index_class', '')))
  2183. factory = self._get_index_factory(index_class)
  2184. kwargs = {}
  2185. if u'freq' in node._v_attrs:
  2186. kwargs['freq'] = node._v_attrs['freq']
  2187. if u'tz' in node._v_attrs:
  2188. kwargs['tz'] = node._v_attrs['tz']
  2189. if kind in (u'date', u'datetime'):
  2190. index = factory(_unconvert_index(data, kind,
  2191. encoding=self.encoding,
  2192. errors=self.errors),
  2193. dtype=object, **kwargs)
  2194. else:
  2195. index = factory(_unconvert_index(data, kind,
  2196. encoding=self.encoding,
  2197. errors=self.errors), **kwargs)
  2198. index.name = name
  2199. return name, index
  2200. def write_array_empty(self, key, value):
  2201. """ write a 0-len array """
  2202. # ugly hack for length 0 axes
  2203. arr = np.empty((1,) * value.ndim)
  2204. self._handle.create_array(self.group, key, arr)
  2205. getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
  2206. getattr(self.group, key)._v_attrs.shape = value.shape
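# e.g. writing np.array([], dtype='float64') stores a (1,)-shaped dummy
# along with value_type='float64' and shape=(0,) attrs; read_array then
# rebuilds the empty array from the attrs instead of the dummy.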
  2207. def _is_empty_array(self, shape):
  2208. """Returns true if any axis is zero length."""
  2209. return any(x == 0 for x in shape)
  2210. def write_array(self, key, value, items=None):
  2211. if key in self.group:
  2212. self._handle.remove_node(self.group, key)
  2213. # Transform needed to interface with pytables row/col notation
  2214. empty_array = self._is_empty_array(value.shape)
  2215. transposed = False
  2216. if is_categorical_dtype(value):
  2217. raise NotImplementedError('Cannot store a category dtype in '
2218. 'an HDF5 dataset that uses format='
  2219. '"fixed". Use format="table".')
  2220. if not empty_array:
  2221. if hasattr(value, 'T'):
  2222. # ExtensionArrays (1d) may not have transpose.
  2223. value = value.T
  2224. transposed = True
  2225. if self._filters is not None:
  2226. atom = None
  2227. try:
  2228. # get the atom for this datatype
  2229. atom = _tables().Atom.from_dtype(value.dtype)
  2230. except ValueError:
  2231. pass
  2232. if atom is not None:
  2233. # create an empty chunked array and fill it from value
  2234. if not empty_array:
  2235. ca = self._handle.create_carray(self.group, key, atom,
  2236. value.shape,
  2237. filters=self._filters)
  2238. ca[:] = value
  2239. getattr(self.group, key)._v_attrs.transposed = transposed
  2240. else:
  2241. self.write_array_empty(key, value)
  2242. return
  2243. if value.dtype.type == np.object_:
  2244. # infer the type, warn if we have a non-string type here (for
  2245. # performance)
  2246. inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
  2247. if empty_array:
  2248. pass
  2249. elif inferred_type == 'string':
  2250. pass
  2251. else:
  2252. try:
  2253. items = list(items)
  2254. except TypeError:
  2255. pass
  2256. ws = performance_doc % (inferred_type, key, items)
  2257. warnings.warn(ws, PerformanceWarning, stacklevel=7)
  2258. vlarr = self._handle.create_vlarray(self.group, key,
  2259. _tables().ObjectAtom())
  2260. vlarr.append(value)
  2261. else:
  2262. if empty_array:
  2263. self.write_array_empty(key, value)
  2264. else:
  2265. if is_datetime64_dtype(value.dtype):
  2266. self._handle.create_array(
  2267. self.group, key, value.view('i8'))
  2268. getattr(
  2269. self.group, key)._v_attrs.value_type = 'datetime64'
  2270. elif is_datetime64tz_dtype(value.dtype):
  2271. # store as UTC
  2272. # with a zone
  2273. self._handle.create_array(self.group, key,
  2274. value.asi8)
  2275. node = getattr(self.group, key)
  2276. node._v_attrs.tz = _get_tz(value.tz)
  2277. node._v_attrs.value_type = 'datetime64'
  2278. elif is_timedelta64_dtype(value.dtype):
  2279. self._handle.create_array(
  2280. self.group, key, value.view('i8'))
  2281. getattr(
  2282. self.group, key)._v_attrs.value_type = 'timedelta64'
  2283. else:
  2284. self._handle.create_array(self.group, key, value)
  2285. getattr(self.group, key)._v_attrs.transposed = transposed
  2286. class LegacyFixed(GenericFixed):
  2287. def read_index_legacy(self, key, start=None, stop=None):
  2288. node = getattr(self.group, key)
  2289. data = node[start:stop]
  2290. kind = node._v_attrs.kind
  2291. return _unconvert_index_legacy(data, kind, encoding=self.encoding,
  2292. errors=self.errors)
  2293. class LegacySeriesFixed(LegacyFixed):
  2294. def read(self, **kwargs):
  2295. kwargs = self.validate_read(kwargs)
  2296. index = self.read_index_legacy('index')
  2297. values = self.read_array('values')
  2298. return Series(values, index=index)
  2299. class LegacyFrameFixed(LegacyFixed):
  2300. def read(self, **kwargs):
  2301. kwargs = self.validate_read(kwargs)
  2302. index = self.read_index_legacy('index')
  2303. columns = self.read_index_legacy('columns')
  2304. values = self.read_array('values')
  2305. return DataFrame(values, index=index, columns=columns)
  2306. class SeriesFixed(GenericFixed):
  2307. pandas_kind = u'series'
  2308. attributes = ['name']
  2309. @property
  2310. def shape(self):
  2311. try:
  2312. return len(getattr(self.group, 'values')),
  2313. except (TypeError, AttributeError):
  2314. return None
  2315. def read(self, **kwargs):
  2316. kwargs = self.validate_read(kwargs)
  2317. index = self.read_index('index', **kwargs)
  2318. values = self.read_array('values', **kwargs)
  2319. return Series(values, index=index, name=self.name)
  2320. def write(self, obj, **kwargs):
  2321. super(SeriesFixed, self).write(obj, **kwargs)
  2322. self.write_index('index', obj.index)
  2323. self.write_array('values', obj.values)
  2324. self.attrs.name = obj.name
  2325. class SparseFixed(GenericFixed):
  2326. def validate_read(self, kwargs):
  2327. """
  2328. we don't support start, stop kwds in Sparse
  2329. """
  2330. kwargs = super(SparseFixed, self).validate_read(kwargs)
  2331. if 'start' in kwargs or 'stop' in kwargs:
  2332. raise NotImplementedError("start and/or stop are not supported "
  2333. "in fixed Sparse reading")
  2334. return kwargs
  2335. class SparseSeriesFixed(SparseFixed):
  2336. pandas_kind = u'sparse_series'
  2337. attributes = ['name', 'fill_value', 'kind']
  2338. def read(self, **kwargs):
  2339. kwargs = self.validate_read(kwargs)
  2340. index = self.read_index('index')
  2341. sp_values = self.read_array('sp_values')
  2342. sp_index = self.read_index('sp_index')
  2343. return SparseSeries(sp_values, index=index, sparse_index=sp_index,
  2344. kind=self.kind or u'block',
  2345. fill_value=self.fill_value,
  2346. name=self.name)
  2347. def write(self, obj, **kwargs):
  2348. super(SparseSeriesFixed, self).write(obj, **kwargs)
  2349. self.write_index('index', obj.index)
  2350. self.write_index('sp_index', obj.sp_index)
  2351. self.write_array('sp_values', obj.sp_values)
  2352. self.attrs.name = obj.name
  2353. self.attrs.fill_value = obj.fill_value
  2354. self.attrs.kind = obj.kind
  2355. class SparseFrameFixed(SparseFixed):
  2356. pandas_kind = u'sparse_frame'
  2357. attributes = ['default_kind', 'default_fill_value']
  2358. def read(self, **kwargs):
  2359. kwargs = self.validate_read(kwargs)
  2360. columns = self.read_index('columns')
  2361. sdict = {}
  2362. for c in columns:
  2363. key = 'sparse_series_{columns}'.format(columns=c)
  2364. s = SparseSeriesFixed(self.parent, getattr(self.group, key))
  2365. s.infer_axes()
  2366. sdict[c] = s.read()
  2367. return SparseDataFrame(sdict, columns=columns,
  2368. default_kind=self.default_kind,
  2369. default_fill_value=self.default_fill_value)
  2370. def write(self, obj, **kwargs):
  2371. """ write it as a collection of individual sparse series """
  2372. super(SparseFrameFixed, self).write(obj, **kwargs)
  2373. for name, ss in compat.iteritems(obj):
  2374. key = 'sparse_series_{name}'.format(name=name)
  2375. if key not in self.group._v_children:
  2376. node = self._handle.create_group(self.group, key)
  2377. else:
  2378. node = getattr(self.group, key)
  2379. s = SparseSeriesFixed(self.parent, node)
  2380. s.write(ss)
  2381. self.attrs.default_fill_value = obj.default_fill_value
  2382. self.attrs.default_kind = obj.default_kind
  2383. self.write_index('columns', obj.columns)
  2384. class BlockManagerFixed(GenericFixed):
  2385. attributes = ['ndim', 'nblocks']
  2386. is_shape_reversed = False
  2387. @property
  2388. def shape(self):
  2389. try:
  2390. ndim = self.ndim
  2391. # items
  2392. items = 0
  2393. for i in range(self.nblocks):
  2394. node = getattr(self.group, 'block{idx}_items'.format(idx=i))
  2395. shape = getattr(node, 'shape', None)
  2396. if shape is not None:
  2397. items += shape[0]
  2398. # data shape
  2399. node = getattr(self.group, 'block0_values')
  2400. shape = getattr(node, 'shape', None)
  2401. if shape is not None:
  2402. shape = list(shape[0:(ndim - 1)])
  2403. else:
  2404. shape = []
  2405. shape.append(items)
  2406. # hacky - this works for frames, but is reversed for panels
  2407. if self.is_shape_reversed:
  2408. shape = shape[::-1]
  2409. return shape
  2410. except AttributeError:
  2411. return None
  2412. def read(self, start=None, stop=None, **kwargs):
  2413. # start, stop applied to rows, so 0th axis only
  2414. kwargs = self.validate_read(kwargs)
  2415. select_axis = self.obj_type()._get_block_manager_axis(0)
  2416. axes = []
  2417. for i in range(self.ndim):
  2418. _start, _stop = (start, stop) if i == select_axis else (None, None)
  2419. ax = self.read_index('axis{idx}'.format(
  2420. idx=i), start=_start, stop=_stop)
  2421. axes.append(ax)
  2422. items = axes[0]
  2423. blocks = []
  2424. for i in range(self.nblocks):
  2425. blk_items = self.read_index('block{idx}_items'.format(idx=i))
  2426. values = self.read_array('block{idx}_values'.format(idx=i),
  2427. start=_start, stop=_stop)
  2428. blk = make_block(values,
  2429. placement=items.get_indexer(blk_items))
  2430. blocks.append(blk)
  2431. return self.obj_type(BlockManager(blocks, axes))
  2432. def write(self, obj, **kwargs):
  2433. super(BlockManagerFixed, self).write(obj, **kwargs)
  2434. data = obj._data
  2435. if not data.is_consolidated():
  2436. data = data.consolidate()
  2437. self.attrs.ndim = data.ndim
  2438. for i, ax in enumerate(data.axes):
  2439. if i == 0:
  2440. if not ax.is_unique:
  2441. raise ValueError(
  2442. "Columns index has to be unique for fixed format")
  2443. self.write_index('axis{idx}'.format(idx=i), ax)
  2444. # Supporting mixed-type DataFrame objects...nontrivial
  2445. self.attrs.nblocks = len(data.blocks)
  2446. for i, blk in enumerate(data.blocks):
  2447. # I have no idea why, but writing values before items fixed #2299
  2448. blk_items = data.items.take(blk.mgr_locs)
  2449. self.write_array('block{idx}_values'.format(idx=i),
  2450. blk.values, items=blk_items)
  2451. self.write_index('block{idx}_items'.format(idx=i), blk_items)
  2452. class FrameFixed(BlockManagerFixed):
  2453. pandas_kind = u'frame'
  2454. obj_type = DataFrame
  2455. class Table(Fixed):
  2456. """ represent a table:
  2457. facilitate read/write of various types of tables
  2458. Attrs in Table Node
  2459. -------------------
  2460. These are attributes that are store in the main table node, they are
  2461. necessary to recreate these tables when read back in.
  2462. index_axes : a list of tuples of the (original indexing axis and
  2463. index column)
  2464. non_index_axes: a list of tuples of the (original index axis and
  2465. columns on a non-indexing axis)
  2466. values_axes : a list of the columns which comprise the data of this
  2467. table
  2468. data_columns : a list of the columns that we are allowing indexing
  2469. (these become single columns in values_axes), or True to force all
  2470. columns
  2471. nan_rep : the string to use for nan representations for string
  2472. objects
  2473. levels : the names of levels
  2474. metadata : the names of the metadata columns
  2475. """
  2476. pandas_kind = u'wide_table'
  2477. table_type = None
  2478. levels = 1
  2479. is_table = True
  2480. is_shape_reversed = False
  2481. def __init__(self, *args, **kwargs):
  2482. super(Table, self).__init__(*args, **kwargs)
  2483. self.index_axes = []
  2484. self.non_index_axes = []
  2485. self.values_axes = []
  2486. self.data_columns = []
  2487. self.metadata = []
  2488. self.info = dict()
  2489. self.nan_rep = None
  2490. self.selection = None
  2491. @property
  2492. def table_type_short(self):
  2493. return self.table_type.split('_')[0]
  2494. @property
  2495. def format_type(self):
  2496. return 'table'
  2497. def __unicode__(self):
  2498. """ return a pretty representatgion of myself """
  2499. self.infer_axes()
  2500. dc = ",dc->[{columns}]".format(columns=(','.join(
  2501. self.data_columns) if len(self.data_columns) else ''))
  2502. ver = ''
  2503. if self.is_old_version:
  2504. ver = "[{version}]".format(
  2505. version='.'.join(str(x) for x in self.version))
  2506. return (
  2507. "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows},"
  2508. "ncols->{ncols},indexers->[{index_axes}]{dc})".format(
  2509. pandas_type=self.pandas_type, ver=ver,
  2510. table_type=self.table_type_short, nrows=self.nrows,
  2511. ncols=self.ncols,
  2512. index_axes=(','.join(a.name for a in self.index_axes)), dc=dc
  2513. ))
  2514. def __getitem__(self, c):
  2515. """ return the axis for c """
  2516. for a in self.axes:
  2517. if c == a.name:
  2518. return a
  2519. return None
  2520. def validate(self, other):
  2521. """ validate against an existing table """
  2522. if other is None:
  2523. return
  2524. if other.table_type != self.table_type:
  2525. raise TypeError(
  2526. "incompatible table_type with existing "
  2527. "[{other} - {self}]".format(
  2528. other=other.table_type, self=self.table_type))
  2529. for c in ['index_axes', 'non_index_axes', 'values_axes']:
  2530. sv = getattr(self, c, None)
  2531. ov = getattr(other, c, None)
  2532. if sv != ov:
  2533. # show the error for the specific axes
  2534. for i, sax in enumerate(sv):
  2535. oax = ov[i]
  2536. if sax != oax:
  2537. raise ValueError(
  2538. "invalid combinate of [{c}] on appending data "
  2539. "[{sax}] vs current table [{oax}]".format(
  2540. c=c, sax=sax, oax=oax))
  2541. # should never get here
  2542. raise Exception(
  2543. "invalid combinate of [{c}] on appending data [{sv}] vs "
  2544. "current table [{ov}]".format(c=c, sv=sv, ov=ov))
  2545. @property
  2546. def is_multi_index(self):
  2547. """the levels attribute is 1 or a list in the case of a multi-index"""
  2548. return isinstance(self.levels, list)
  2549. def validate_metadata(self, existing):
  2550. """ create / validate metadata """
  2551. self.metadata = [
  2552. c.name for c in self.values_axes if c.metadata is not None]
  2553. def validate_multiindex(self, obj):
  2554. """validate that we can store the multi-index; reset and return the
  2555. new object
  2556. """
  2557. levels = [l if l is not None else "level_{0}".format(i)
  2558. for i, l in enumerate(obj.index.names)]
  2559. try:
  2560. return obj.reset_index(), levels
  2561. except ValueError:
  2562. raise ValueError("duplicate names/columns in the multi-index when "
  2563. "storing as a table")
  2564. @property
  2565. def nrows_expected(self):
  2566. """ based on our axes, compute the expected nrows """
  2567. return np.prod([i.cvalues.shape[0] for i in self.index_axes])
  2568. @property
  2569. def is_exists(self):
  2570. """ has this table been created """
  2571. return u'table' in self.group
  2572. @property
  2573. def storable(self):
  2574. return getattr(self.group, 'table', None)
  2575. @property
  2576. def table(self):
  2577. """ return the table group (this is my storable) """
  2578. return self.storable
  2579. @property
  2580. def dtype(self):
  2581. return self.table.dtype
  2582. @property
  2583. def description(self):
  2584. return self.table.description
  2585. @property
  2586. def axes(self):
  2587. return itertools.chain(self.index_axes, self.values_axes)
  2588. @property
  2589. def ncols(self):
  2590. """ the number of total columns in the values axes """
  2591. return sum(len(a.values) for a in self.values_axes)
  2592. @property
  2593. def is_transposed(self):
  2594. return False
  2595. @property
  2596. def data_orientation(self):
  2597. """return a tuple of my permutated axes, non_indexable at the front"""
  2598. return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes],
  2599. [int(a.axis) for a in self.index_axes]))
  2600. def queryables(self):
  2601. """ return a dict of the kinds allowable columns for this object """
  2602. # compute the values_axes queryables
  2603. return dict(
  2604. [(a.cname, a) for a in self.index_axes] +
  2605. [(self.storage_obj_type._AXIS_NAMES[axis], None)
  2606. for axis, values in self.non_index_axes] +
  2607. [(v.cname, v) for v in self.values_axes
  2608. if v.name in set(self.data_columns)]
  2609. )
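# e.g. for a frame appended with data_columns=['A'], queryables() maps
# 'index' and 'A' to their columns (and 'columns' to None), which is the
# set of names a where= Term may reference.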
  2610. def index_cols(self):
  2611. """ return a list of my index cols """
  2612. return [(i.axis, i.cname) for i in self.index_axes]
  2613. def values_cols(self):
  2614. """ return a list of my values cols """
  2615. return [i.cname for i in self.values_axes]
  2616. def _get_metadata_path(self, key):
  2617. """ return the metadata pathname for this key """
  2618. return "{group}/meta/{key}/meta".format(group=self.group._v_pathname,
  2619. key=key)
  2620. def write_metadata(self, key, values):
  2621. """
2622. write out a meta data array to the key as a table-format Series
  2623. Parameters
  2624. ----------
  2625. key : string
  2626. values : ndarray
  2627. """
  2628. values = Series(values)
  2629. self.parent.put(self._get_metadata_path(key), values, format='table',
  2630. encoding=self.encoding, errors=self.errors,
  2631. nan_rep=self.nan_rep)
  2632. def read_metadata(self, key):
  2633. """ return the meta data array for this key """
  2634. if getattr(getattr(self.group, 'meta', None), key, None) is not None:
  2635. return self.parent.select(self._get_metadata_path(key))
  2636. return None
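# e.g. the categories for a column 'A' of a table stored at '/df' live
# at '/df/meta/A/meta' and are read back by selecting that path.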
  2637. def set_info(self):
  2638. """ update our table index info """
  2639. self.attrs.info = self.info
  2640. def set_attrs(self):
  2641. """ set our table type & indexables """
  2642. self.attrs.table_type = str(self.table_type)
  2643. self.attrs.index_cols = self.index_cols()
  2644. self.attrs.values_cols = self.values_cols()
  2645. self.attrs.non_index_axes = self.non_index_axes
  2646. self.attrs.data_columns = self.data_columns
  2647. self.attrs.nan_rep = self.nan_rep
  2648. self.attrs.encoding = self.encoding
  2649. self.attrs.errors = self.errors
  2650. self.attrs.levels = self.levels
  2651. self.attrs.metadata = self.metadata
  2652. self.set_info()
  2653. def get_attrs(self):
  2654. """ retrieve our attributes """
  2655. self.non_index_axes = getattr(
  2656. self.attrs, 'non_index_axes', None) or []
  2657. self.data_columns = getattr(
  2658. self.attrs, 'data_columns', None) or []
  2659. self.info = getattr(
  2660. self.attrs, 'info', None) or dict()
  2661. self.nan_rep = getattr(self.attrs, 'nan_rep', None)
  2662. self.encoding = _ensure_encoding(
  2663. getattr(self.attrs, 'encoding', None))
  2664. self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
  2665. self.levels = getattr(
  2666. self.attrs, 'levels', None) or []
  2667. self.index_axes = [
  2668. a.infer(self) for a in self.indexables if a.is_an_indexable
  2669. ]
  2670. self.values_axes = [
  2671. a.infer(self) for a in self.indexables if not a.is_an_indexable
  2672. ]
  2673. self.metadata = getattr(
  2674. self.attrs, 'metadata', None) or []
  2675. def validate_version(self, where=None):
  2676. """ are we trying to operate on an old version? """
  2677. if where is not None:
  2678. if (self.version[0] <= 0 and self.version[1] <= 10 and
  2679. self.version[2] < 1):
  2680. ws = incompatibility_doc % '.'.join(
  2681. [str(x) for x in self.version])
  2682. warnings.warn(ws, IncompatibilityWarning)
  2683. def validate_min_itemsize(self, min_itemsize):
  2684. """validate the min_itemisze doesn't contain items that are not in the
  2685. axes this needs data_columns to be defined
  2686. """
  2687. if min_itemsize is None:
  2688. return
  2689. if not isinstance(min_itemsize, dict):
  2690. return
  2691. q = self.queryables()
  2692. for k, v in min_itemsize.items():
  2693. # ok, apply generally
  2694. if k == 'values':
  2695. continue
  2696. if k not in q:
  2697. raise ValueError(
  2698. "min_itemsize has the key [{key}] which is not an axis or "
  2699. "data_column".format(key=k))
  2700. @property
  2701. def indexables(self):
  2702. """ create/cache the indexables if they don't exist """
  2703. if self._indexables is None:
  2704. self._indexables = []
  2705. # index columns
  2706. self._indexables.extend([
  2707. IndexCol(name=name, axis=axis, pos=i)
  2708. for i, (axis, name) in enumerate(self.attrs.index_cols)
  2709. ])
  2710. # values columns
  2711. dc = set(self.data_columns)
  2712. base_pos = len(self._indexables)
  2713. def f(i, c):
  2714. klass = DataCol
  2715. if c in dc:
  2716. klass = DataIndexableCol
  2717. return klass.create_for_block(i=i, name=c, pos=base_pos + i,
  2718. version=self.version)
  2719. self._indexables.extend(
  2720. [f(i, c) for i, c in enumerate(self.attrs.values_cols)])
  2721. return self._indexables
  2722. def create_index(self, columns=None, optlevel=None, kind=None):
  2723. """
  2724. Create a pytables index on the specified columns
  2725. note: cannot index Time64Col() or ComplexCol currently;
  2726. PyTables must be >= 3.0
  2727. Parameters
  2728. ----------
  2729. columns : False (don't create an index), True (create all columns
  2730. index), None or list_like (the indexers to index)
  2731. optlevel: optimization level (defaults to 6)
  2732. kind : kind of index (defaults to 'medium')
  2733. Exceptions
  2734. ----------
  2735. raises if the node is not a table
  2736. """
  2737. if not self.infer_axes():
  2738. return
  2739. if columns is False:
  2740. return
  2741. # index all indexables and data_columns
  2742. if columns is None or columns is True:
  2743. columns = [a.cname for a in self.axes if a.is_data_indexable]
  2744. if not isinstance(columns, (tuple, list)):
  2745. columns = [columns]
  2746. kw = dict()
  2747. if optlevel is not None:
  2748. kw['optlevel'] = optlevel
  2749. if kind is not None:
  2750. kw['kind'] = kind
  2751. table = self.table
  2752. for c in columns:
  2753. v = getattr(table.cols, c, None)
  2754. if v is not None:
  2755. # remove the index if the kind/optlevel have changed
  2756. if v.is_indexed:
  2757. index = v.index
  2758. cur_optlevel = index.optlevel
  2759. cur_kind = index.kind
  2760. if kind is not None and cur_kind != kind:
  2761. v.remove_index()
  2762. else:
  2763. kw['kind'] = cur_kind
  2764. if optlevel is not None and cur_optlevel != optlevel:
  2765. v.remove_index()
  2766. else:
  2767. kw['optlevel'] = cur_optlevel
  2768. # create the index
  2769. if not v.is_indexed:
  2770. if v.type.startswith('complex'):
  2771. raise TypeError(
  2772. 'Columns containing complex values can be stored '
  2773. 'but cannot'
  2774. ' be indexed when using table format. Either use '
  2775. 'fixed format, set index=False, or do not include '
  2776. 'the columns containing complex values to '
  2777. 'data_columns when initializing the table.')
  2778. v.create_index(**kw)
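# User-level sketch (assuming 'df' was appended with data_columns=['B']):
#
#     store.create_table_index('df', columns=['B'], optlevel=9,
#                              kind='full')
#
# Only indexables and data_columns can be indexed this way.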
  2779. def read_axes(self, where, **kwargs):
  2780. """create and return the axes sniffed from the table: return boolean
  2781. for success
  2782. """
  2783. # validate the version
  2784. self.validate_version(where)
  2785. # infer the data kind
  2786. if not self.infer_axes():
  2787. return False
  2788. # create the selection
  2789. self.selection = Selection(self, where=where, **kwargs)
  2790. values = self.selection.select()
  2791. # convert the data
  2792. for a in self.axes:
  2793. a.set_info(self.info)
  2794. a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
  2795. errors=self.errors)
  2796. return True
  2797. def get_object(self, obj):
  2798. """ return the data for this obj """
  2799. return obj
  2800. def validate_data_columns(self, data_columns, min_itemsize):
  2801. """take the input data_columns and min_itemize and create a data
  2802. columns spec
  2803. """
  2804. if not len(self.non_index_axes):
  2805. return []
  2806. axis, axis_labels = self.non_index_axes[0]
  2807. info = self.info.get(axis, dict())
  2808. if info.get('type') == 'MultiIndex' and data_columns:
  2809. raise ValueError("cannot use a multi-index on axis [{0}] with "
  2810. "data_columns {1}".format(axis, data_columns))
  2811. # evaluate the passed data_columns, True == use all columns
2812. # take only valid axis labels
  2813. if data_columns is True:
  2814. data_columns = list(axis_labels)
  2815. elif data_columns is None:
  2816. data_columns = []
  2817. # if min_itemsize is a dict, add the keys (exclude 'values')
  2818. if isinstance(min_itemsize, dict):
  2819. existing_data_columns = set(data_columns)
  2820. data_columns.extend([
  2821. k for k in min_itemsize.keys()
  2822. if k != 'values' and k not in existing_data_columns
  2823. ])
  2824. # return valid columns in the order of our axis
  2825. return [c for c in data_columns if c in axis_labels]
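# e.g. with axis labels ['A', 'B', 'C'], data_columns=True expands to all
# three, while data_columns=['B'] plus min_itemsize={'C': 20} yields
# ['B', 'C'].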
  2826. def create_axes(self, axes, obj, validate=True, nan_rep=None,
  2827. data_columns=None, min_itemsize=None, **kwargs):
  2828. """ create and return the axes
2829. legacy tables create an indexable column, indexable index,
  2830. non-indexable fields
2831. Parameters
2832. ----------
  2833. axes: a list of the axes in order to create (names or numbers of
  2834. the axes)
  2835. obj : the object to create axes on
  2836. validate: validate the obj against an existing object already
  2837. written
  2838. min_itemsize: a dict of the min size for a column in bytes
2839. nan_rep : a value to use for the string-column nan representation
  2840. encoding : the encoding for string values
  2841. data_columns : a list of columns that we want to create separate to
  2842. allow indexing (or True will force all columns)
  2843. """
  2844. # set the default axes if needed
  2845. if axes is None:
  2846. try:
  2847. axes = _AXES_MAP[type(obj)]
  2848. except KeyError:
  2849. raise TypeError(
  2850. "cannot properly create the storer for: [group->{group},"
  2851. "value->{value}]".format(
  2852. group=self.group._v_name, value=type(obj)))
  2853. # map axes to numbers
  2854. axes = [obj._get_axis_number(a) for a in axes]
  2855. # do we have an existing table (if so, use its axes & data_columns)
  2856. if self.infer_axes():
  2857. existing_table = self.copy()
  2858. existing_table.infer_axes()
  2859. axes = [a.axis for a in existing_table.index_axes]
  2860. data_columns = existing_table.data_columns
  2861. nan_rep = existing_table.nan_rep
  2862. self.encoding = existing_table.encoding
  2863. self.errors = existing_table.errors
  2864. self.info = copy.copy(existing_table.info)
  2865. else:
  2866. existing_table = None
2867. # we currently only support ndim-1 axes
  2868. if len(axes) != self.ndim - 1:
  2869. raise ValueError(
  2870. "currently only support ndim-1 indexers in an AppendableTable")
  2871. # create according to the new data
  2872. self.non_index_axes = []
  2873. self.data_columns = []
  2874. # nan_representation
  2875. if nan_rep is None:
  2876. nan_rep = 'nan'
  2877. self.nan_rep = nan_rep
  2878. # create axes to index and non_index
  2879. index_axes_map = dict()
  2880. for i, a in enumerate(obj.axes):
  2881. if i in axes:
  2882. name = obj._AXIS_NAMES[i]
  2883. index_axes_map[i] = _convert_index(
  2884. a, self.encoding, self.errors, self.format_type
  2885. ).set_name(name).set_axis(i)
  2886. else:
  2887. # we might be able to change the axes on the appending data if
  2888. # necessary
  2889. append_axis = list(a)
  2890. if existing_table is not None:
  2891. indexer = len(self.non_index_axes)
  2892. exist_axis = existing_table.non_index_axes[indexer][1]
  2893. if not array_equivalent(np.array(append_axis),
  2894. np.array(exist_axis)):
  2895. # ahah! -> reindex
  2896. if array_equivalent(np.array(sorted(append_axis)),
  2897. np.array(sorted(exist_axis))):
  2898. append_axis = exist_axis
  2899. # the non_index_axes info
  2900. info = _get_info(self.info, i)
  2901. info['names'] = list(a.names)
  2902. info['type'] = a.__class__.__name__
  2903. self.non_index_axes.append((i, append_axis))
  2904. # set axis positions (based on the axes)
  2905. self.index_axes = [
  2906. index_axes_map[a].set_pos(j).update_info(self.info)
  2907. for j, a in enumerate(axes)
  2908. ]
  2909. j = len(self.index_axes)
  2910. # check for column conflicts
  2911. for a in self.axes:
  2912. a.maybe_set_size(min_itemsize=min_itemsize)
  2913. # reindex by our non_index_axes & compute data_columns
  2914. for a in self.non_index_axes:
  2915. obj = _reindex_axis(obj, a[0], a[1])
  2916. def get_blk_items(mgr, blocks):
  2917. return [mgr.items.take(blk.mgr_locs) for blk in blocks]
        # figure out data_columns and pull out the blocks
  2919. block_obj = self.get_object(obj)._consolidate()
  2920. blocks = block_obj._data.blocks
  2921. blk_items = get_blk_items(block_obj._data, blocks)
  2922. if len(self.non_index_axes):
  2923. axis, axis_labels = self.non_index_axes[0]
  2924. data_columns = self.validate_data_columns(
  2925. data_columns, min_itemsize)
  2926. if len(data_columns):
  2927. mgr = block_obj.reindex(
  2928. Index(axis_labels).difference(Index(data_columns)),
  2929. axis=axis
  2930. )._data
  2931. blocks = list(mgr.blocks)
  2932. blk_items = get_blk_items(mgr, blocks)
  2933. for c in data_columns:
  2934. mgr = block_obj.reindex([c], axis=axis)._data
  2935. blocks.extend(mgr.blocks)
  2936. blk_items.extend(get_blk_items(mgr, mgr.blocks))
  2937. # reorder the blocks in the same order as the existing_table if we can
  2938. if existing_table is not None:
  2939. by_items = {tuple(b_items.tolist()): (b, b_items)
  2940. for b, b_items in zip(blocks, blk_items)}
  2941. new_blocks = []
  2942. new_blk_items = []
  2943. for ea in existing_table.values_axes:
  2944. items = tuple(ea.values)
  2945. try:
  2946. b, b_items = by_items.pop(items)
  2947. new_blocks.append(b)
  2948. new_blk_items.append(b_items)
  2949. except (IndexError, KeyError):
  2950. raise ValueError(
  2951. "cannot match existing table structure for [{items}] "
  2952. "on appending data".format(
  2953. items=(','.join(pprint_thing(item) for
  2954. item in items))))
  2955. blocks = new_blocks
  2956. blk_items = new_blk_items
  2957. # add my values
  2958. self.values_axes = []
  2959. for i, (b, b_items) in enumerate(zip(blocks, blk_items)):
            # the shape of each data column is given by the indexable axes
  2961. klass = DataCol
  2962. name = None
  2963. # we have a data_column
  2964. if (data_columns and len(b_items) == 1 and
  2965. b_items[0] in data_columns):
  2966. klass = DataIndexableCol
  2967. name = b_items[0]
  2968. self.data_columns.append(name)
  2969. # make sure that we match up the existing columns
  2970. # if we have an existing table
  2971. if existing_table is not None and validate:
  2972. try:
  2973. existing_col = existing_table.values_axes[i]
  2974. except (IndexError, KeyError):
  2975. raise ValueError(
  2976. "Incompatible appended table [{blocks}]"
  2977. "with existing table [{table}]".format(
  2978. blocks=blocks,
  2979. table=existing_table.values_axes))
  2980. else:
  2981. existing_col = None
  2982. try:
  2983. col = klass.create_for_block(
  2984. i=i, name=name, version=self.version)
  2985. col.set_atom(block=b, block_items=b_items,
  2986. existing_col=existing_col,
  2987. min_itemsize=min_itemsize,
  2988. nan_rep=nan_rep,
  2989. encoding=self.encoding,
  2990. errors=self.errors,
  2991. info=self.info)
  2992. col.set_pos(j)
  2993. self.values_axes.append(col)
  2994. except (NotImplementedError, ValueError, TypeError) as e:
  2995. raise e
  2996. except Exception as detail:
  2997. raise Exception(
  2998. "cannot find the correct atom type -> "
  2999. "[dtype->{name},items->{items}] {detail!s}".format(
  3000. name=b.dtype.name, items=b_items, detail=detail))
  3001. j += 1
  3002. # validate our min_itemsize
  3003. self.validate_min_itemsize(min_itemsize)
  3004. # validate our metadata
  3005. self.validate_metadata(existing_table)
  3006. # validate the axes if we have an existing table
  3007. if validate:
  3008. self.validate(existing_table)
  3009. def process_axes(self, obj, columns=None):
  3010. """ process axes filters """
  3011. # make a copy to avoid side effects
  3012. if columns is not None:
  3013. columns = list(columns)
  3014. # make sure to include levels if we have them
  3015. if columns is not None and self.is_multi_index:
  3016. for n in self.levels:
  3017. if n not in columns:
  3018. columns.insert(0, n)
  3019. # reorder by any non_index_axes & limit to the select columns
  3020. for axis, labels in self.non_index_axes:
  3021. obj = _reindex_axis(obj, axis, labels, columns)
  3022. # apply the selection filters (but keep in the same order)
  3023. if self.selection.filter is not None:
  3024. for field, op, filt in self.selection.filter.format():
  3025. def process_filter(field, filt):
  3026. for axis_name in obj._AXIS_NAMES.values():
  3027. axis_number = obj._get_axis_number(axis_name)
  3028. axis_values = obj._get_axis(axis_name)
  3029. # see if the field is the name of an axis
  3030. if field == axis_name:
                            # if we have a multi-index, then we need to
                            # include the levels
  3033. if self.is_multi_index:
  3034. filt = filt.union(Index(self.levels))
  3035. takers = op(axis_values, filt)
  3036. return obj.loc._getitem_axis(takers,
  3037. axis=axis_number)
                        # this might be the name of a field IN an axis
  3039. elif field in axis_values:
  3040. # we need to filter on this dimension
  3041. values = ensure_index(getattr(obj, field).values)
  3042. filt = ensure_index(filt)
  3043. # hack until we support reversed dim flags
  3044. if isinstance(obj, DataFrame):
  3045. axis_number = 1 - axis_number
  3046. takers = op(values, filt)
  3047. return obj.loc._getitem_axis(takers,
  3048. axis=axis_number)
  3049. raise ValueError("cannot find the field [{field}] for "
  3050. "filtering!".format(field=field))
  3051. obj = process_filter(field, filt)
  3052. return obj
  3053. def create_description(self, complib=None, complevel=None,
  3054. fletcher32=False, expectedrows=None):
  3055. """ create the description of the table from the axes & values """
        # use the provided expectedrows if it's passed
  3057. if expectedrows is None:
  3058. expectedrows = max(self.nrows_expected, 10000)
  3059. d = dict(name='table', expectedrows=expectedrows)
  3060. # description from the axes & values
  3061. d['description'] = {a.cname: a.typ for a in self.axes}
  3062. if complib:
  3063. if complevel is None:
  3064. complevel = self._complevel or 9
  3065. filters = _tables().Filters(
  3066. complevel=complevel, complib=complib,
  3067. fletcher32=fletcher32 or self._fletcher32)
  3068. d['filters'] = filters
  3069. elif self._filters is not None:
  3070. d['filters'] = self._filters
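        # (sketch) for a plain frame_table with complib='zlib' the result
        # looks roughly like:
        #     {'name': 'table', 'expectedrows': 10000,
        #      'description': {'index': Int64Col(), ...},
        #      'filters': Filters(complevel=9, complib='zlib', ...)}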
  3071. return d
  3072. def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
  3073. """select coordinates (row numbers) from a table; return the
  3074. coordinates object
  3075. """
  3076. # validate the version
  3077. self.validate_version(where)
  3078. # infer the data kind
  3079. if not self.infer_axes():
  3080. return False
  3081. # create the selection
  3082. self.selection = Selection(
  3083. self, where=where, start=start, stop=stop, **kwargs)
  3084. coords = self.selection.select_coords()
  3085. if self.selection.filter is not None:
  3086. for field, op, filt in self.selection.filter.format():
  3087. data = self.read_column(
  3088. field, start=coords.min(), stop=coords.max() + 1)
  3089. coords = coords[
  3090. op(data.iloc[coords - coords.min()], filt).values]
  3091. return Index(coords)
  3092. def read_column(self, column, where=None, start=None, stop=None):
  3093. """return a single column from the table, generally only indexables
  3094. are interesting
  3095. """
  3096. # validate the version
  3097. self.validate_version()
  3098. # infer the data kind
  3099. if not self.infer_axes():
  3100. return False
  3101. if where is not None:
  3102. raise TypeError("read_column does not currently accept a where "
  3103. "clause")
  3104. # find the axes
  3105. for a in self.axes:
  3106. if column == a.name:
  3107. if not a.is_data_indexable:
  3108. raise ValueError(
  3109. "column [{column}] can not be extracted individually; "
  3110. "it is not data indexable".format(column=column))
  3111. # column must be an indexable or a data column
  3112. c = getattr(self.table.cols, column)
  3113. a.set_info(self.info)
  3114. return Series(_set_tz(a.convert(c[start:stop],
  3115. nan_rep=self.nan_rep,
  3116. encoding=self.encoding,
  3117. errors=self.errors
  3118. ).take_data(),
  3119. a.tz, True), name=column)
  3120. raise KeyError(
  3121. "column [{column}] not found in the table".format(column=column))
  3122. class WORMTable(Table):
  3123. """ a write-once read-many table: this format DOES NOT ALLOW appending to a
  3124. table. writing is a one-time operation the data are stored in a format
  3125. that allows for searching the data on disk
  3126. """
  3127. table_type = u'worm'
  3128. def read(self, **kwargs):
  3129. """ read the indices and the indexing array, calculate offset rows and
  3130. return """
  3131. raise NotImplementedError("WORMTable needs to implement read")
  3132. def write(self, **kwargs):
  3133. """ write in a format that we can search later on (but cannot append
  3134. to): write out the indices and the values using _write_array
  3135. (e.g. a CArray) create an indexing table so that we can search
  3136. """
        raise NotImplementedError("WORMTable needs to implement write")
  3138. class LegacyTable(Table):
  3139. """ an appendable table: allow append/query/delete operations to a
  3140. (possibly) already existing appendable table this table ALLOWS
  3141. append (but doesn't require them), and stores the data in a format
  3142. that can be easily searched
  3143. """
  3144. _indexables = [
  3145. IndexCol(name='index', axis=1, pos=0),
  3146. IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'),
  3147. DataCol(name='fields', cname='values', kind_attr='fields', pos=2)
  3148. ]
  3149. table_type = u'legacy'
  3150. ndim = 3
  3151. def write(self, **kwargs):
  3152. raise TypeError("write operations are not allowed on legacy tables!")
  3153. def read(self, where=None, columns=None, **kwargs):
  3154. """we have n indexable columns, with an arbitrary number of data
  3155. axes
  3156. """
  3157. if not self.read_axes(where=where, **kwargs):
  3158. return None
  3159. raise NotImplementedError("Panel is removed in pandas 0.25.0")
  3160. class AppendableTable(LegacyTable):
  3161. """ support the new appendable table formats """
  3162. _indexables = None
  3163. table_type = u'appendable'
  3164. def write(self, obj, axes=None, append=False, complib=None,
  3165. complevel=None, fletcher32=None, min_itemsize=None,
  3166. chunksize=None, expectedrows=None, dropna=False, **kwargs):
  3167. if not append and self.is_exists:
  3168. self._handle.remove_node(self.group, 'table')
  3169. # create the axes
  3170. self.create_axes(axes=axes, obj=obj, validate=append,
  3171. min_itemsize=min_itemsize,
  3172. **kwargs)
  3173. for a in self.axes:
  3174. a.validate(self, append)
  3175. if not self.is_exists:
  3176. # create the table
  3177. options = self.create_description(complib=complib,
  3178. complevel=complevel,
  3179. fletcher32=fletcher32,
  3180. expectedrows=expectedrows)
  3181. # set the table attributes
  3182. self.set_attrs()
  3183. # create the table
  3184. self._handle.create_table(self.group, **options)
  3185. else:
  3186. pass
  3187. # table = self.table
  3188. # update my info
  3189. self.set_info()
  3190. # validate the axes and set the kinds
  3191. for a in self.axes:
  3192. a.validate_and_set(self, append)
  3193. # add the rows
  3194. self.write_data(chunksize, dropna=dropna)
  3195. def write_data(self, chunksize, dropna=False):
  3196. """ we form the data into a 2-d including indexes,values,mask
  3197. write chunk-by-chunk """
  3198. names = self.dtype.names
  3199. nrows = self.nrows_expected
  3200. # if dropna==True, then drop ALL nan rows
  3201. masks = []
  3202. if dropna:
  3203. for a in self.values_axes:
  3204. # figure the mask: only do if we can successfully process this
  3205. # column, otherwise ignore the mask
  3206. mask = isna(a.data).all(axis=0)
  3207. if isinstance(mask, np.ndarray):
  3208. masks.append(mask.astype('u1', copy=False))
  3209. # consolidate masks
  3210. if len(masks):
  3211. mask = masks[0]
  3212. for m in masks[1:]:
  3213. mask = mask & m
  3214. mask = mask.ravel()
  3215. else:
  3216. mask = None
  3217. # broadcast the indexes if needed
  3218. indexes = [a.cvalues for a in self.index_axes]
  3219. nindexes = len(indexes)
  3220. bindexes = []
  3221. for i, idx in enumerate(indexes):
  3222. # broadcast to all other indexes except myself
            if 0 < i < nindexes:
  3224. repeater = np.prod(
  3225. [indexes[bi].shape[0] for bi in range(0, i)])
  3226. idx = np.tile(idx, repeater)
  3227. if i < nindexes - 1:
  3228. repeater = np.prod([indexes[bi].shape[0]
  3229. for bi in range(i + 1, nindexes)])
  3230. idx = np.repeat(idx, repeater)
  3231. bindexes.append(idx)
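        # (sketch) with two index axes of lengths 2 and 3, the first is
        # np.repeat-ed to [a, a, a, b, b, b] and the second np.tile-d to
        # [x, y, z, x, y, z], yielding one row per cartesian combination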
  3232. # transpose the values so first dimension is last
  3233. # reshape the values if needed
  3234. values = [a.take_data() for a in self.values_axes]
  3235. values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1))
  3236. for v in values]
  3237. bvalues = []
  3238. for i, v in enumerate(values):
  3239. new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
  3240. bvalues.append(values[i].reshape(new_shape))
  3241. # write the chunks
  3242. if chunksize is None:
  3243. chunksize = 100000
  3244. rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
  3245. chunks = int(nrows / chunksize) + 1
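        # (sketch) nrows=250000 with the default chunksize gives chunks=3
        # covering [0, 100000), [100000, 200000), [200000, 250000); an exact
        # multiple leaves a final empty chunk, which start_i >= end_i skips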
  3246. for i in range(chunks):
  3247. start_i = i * chunksize
  3248. end_i = min((i + 1) * chunksize, nrows)
  3249. if start_i >= end_i:
  3250. break
  3251. self.write_data_chunk(
  3252. rows,
  3253. indexes=[a[start_i:end_i] for a in bindexes],
  3254. mask=mask[start_i:end_i] if mask is not None else None,
  3255. values=[v[start_i:end_i] for v in bvalues])
  3256. def write_data_chunk(self, rows, indexes, mask, values):
  3257. """
  3258. Parameters
  3259. ----------
  3260. rows : an empty memory space where we are putting the chunk
  3261. indexes : an array of the indexes
  3262. mask : an array of the masks
  3263. values : an array of the values
  3264. """
        # nothing to write if any values block has zero length
  3266. for v in values:
  3267. if not np.prod(v.shape):
  3268. return
  3269. try:
  3270. nrows = indexes[0].shape[0]
  3271. if nrows != len(rows):
  3272. rows = np.empty(nrows, dtype=self.dtype)
  3273. names = self.dtype.names
  3274. nindexes = len(indexes)
  3275. # indexes
  3276. for i, idx in enumerate(indexes):
  3277. rows[names[i]] = idx
  3278. # values
  3279. for i, v in enumerate(values):
  3280. rows[names[i + nindexes]] = v
  3281. # mask
  3282. if mask is not None:
  3283. m = ~mask.ravel().astype(bool, copy=False)
  3284. if not m.all():
  3285. rows = rows[m]
  3286. except Exception as detail:
  3287. raise Exception(
  3288. "cannot create row-data -> {detail}".format(detail=detail))
  3289. try:
  3290. if len(rows):
  3291. self.table.append(rows)
  3292. self.table.flush()
  3293. except Exception as detail:
  3294. raise TypeError(
  3295. "tables cannot write this data -> {detail}".format(
  3296. detail=detail))
  3297. def delete(self, where=None, start=None, stop=None, **kwargs):
  3298. # delete all rows (and return the nrows)
  3299. if where is None or not len(where):
  3300. if start is None and stop is None:
  3301. nrows = self.nrows
  3302. self._handle.remove_node(self.group, recursive=True)
  3303. else:
  3304. # pytables<3.0 would remove a single row with stop=None
  3305. if stop is None:
  3306. stop = self.nrows
  3307. nrows = self.table.remove_rows(start=start, stop=stop)
  3308. self.table.flush()
  3309. return nrows
  3310. # infer the data kind
  3311. if not self.infer_axes():
  3312. return None
  3313. # create the selection
  3314. table = self.table
  3315. self.selection = Selection(
  3316. self, where, start=start, stop=stop, **kwargs)
  3317. values = self.selection.select_coords()
  3318. # delete the rows in reverse order
  3319. sorted_series = Series(values).sort_values()
  3320. ln = len(sorted_series)
  3321. if ln:
  3322. # construct groups of consecutive rows
  3323. diff = sorted_series.diff()
  3324. groups = list(diff[diff > 1].index)
  3325. # 1 group
  3326. if not len(groups):
  3327. groups = [0]
  3328. # final element
  3329. if groups[-1] != ln:
  3330. groups.append(ln)
  3331. # initial element
  3332. if groups[0] != 0:
  3333. groups.insert(0, 0)
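            # (sketch) for selected rows [2, 3, 4, 9, 10] the diff marks a
            # break at row 9, giving groups [0, 3, 5]; rows [9, 11) are
            # removed first and [2, 5) second, so earlier row numbers remain
            # valid during deletion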
  3334. # we must remove in reverse order!
  3335. pg = groups.pop()
  3336. for g in reversed(groups):
  3337. rows = sorted_series.take(lrange(g, pg))
                table.remove_rows(start=rows[rows.index[0]],
                                  stop=rows[rows.index[-1]] + 1)
  3340. pg = g
  3341. self.table.flush()
  3342. # return the number of rows removed
  3343. return ln
  3344. class AppendableFrameTable(AppendableTable):
  3345. """ support the new appendable table formats """
  3346. pandas_kind = u'frame_table'
  3347. table_type = u'appendable_frame'
  3348. ndim = 2
  3349. obj_type = DataFrame
  3350. @property
  3351. def is_transposed(self):
  3352. return self.index_axes[0].axis == 1
  3353. def get_object(self, obj):
  3354. """ these are written transposed """
  3355. if self.is_transposed:
  3356. obj = obj.T
  3357. return obj
  3358. def read(self, where=None, columns=None, **kwargs):
  3359. if not self.read_axes(where=where, **kwargs):
  3360. return None
  3361. info = (self.info.get(self.non_index_axes[0][0], dict())
  3362. if len(self.non_index_axes) else dict())
  3363. index = self.index_axes[0].values
  3364. frames = []
  3365. for a in self.values_axes:
  3366. # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
  3368. if info.get('type') == 'MultiIndex':
  3369. cols = MultiIndex.from_tuples(a.values)
  3370. else:
  3371. cols = Index(a.values)
  3372. names = info.get('names')
  3373. if names is not None:
  3374. cols.set_names(names, inplace=True)
  3375. if self.is_transposed:
  3376. values = a.cvalues
  3377. index_ = cols
  3378. cols_ = Index(index, name=getattr(index, 'name', None))
  3379. else:
  3380. values = a.cvalues.T
  3381. index_ = Index(index, name=getattr(index, 'name', None))
  3382. cols_ = cols
  3383. # if we have a DataIndexableCol, its shape will only be 1 dim
  3384. if values.ndim == 1 and isinstance(values, np.ndarray):
  3385. values = values.reshape((1, values.shape[0]))
  3386. block = make_block(values, placement=np.arange(len(cols_)))
  3387. mgr = BlockManager([block], [cols_, index_])
  3388. frames.append(DataFrame(mgr))
  3389. if len(frames) == 1:
  3390. df = frames[0]
  3391. else:
  3392. df = concat(frames, axis=1)
  3393. # apply the selection filters & axis orderings
  3394. df = self.process_axes(df, columns=columns)
  3395. return df
  3396. class AppendableSeriesTable(AppendableFrameTable):
  3397. """ support the new appendable table formats """
  3398. pandas_kind = u'series_table'
  3399. table_type = u'appendable_series'
  3400. ndim = 2
  3401. obj_type = Series
  3402. storage_obj_type = DataFrame
  3403. @property
  3404. def is_transposed(self):
  3405. return False
  3406. def get_object(self, obj):
  3407. return obj
  3408. def write(self, obj, data_columns=None, **kwargs):
  3409. """ we are going to write this as a frame table """
  3410. if not isinstance(obj, DataFrame):
  3411. name = obj.name or 'values'
  3412. obj = DataFrame({name: obj}, index=obj.index)
  3413. obj.columns = [name]
  3414. return super(AppendableSeriesTable, self).write(
  3415. obj=obj, data_columns=obj.columns.tolist(), **kwargs)
  3416. def read(self, columns=None, **kwargs):
  3417. is_multi_index = self.is_multi_index
  3418. if columns is not None and is_multi_index:
  3419. for n in self.levels:
  3420. if n not in columns:
  3421. columns.insert(0, n)
  3422. s = super(AppendableSeriesTable, self).read(columns=columns, **kwargs)
  3423. if is_multi_index:
  3424. s.set_index(self.levels, inplace=True)
  3425. s = s.iloc[:, 0]
  3426. # remove the default name
  3427. if s.name == 'values':
  3428. s.name = None
  3429. return s
  3430. class AppendableMultiSeriesTable(AppendableSeriesTable):
  3431. """ support the new appendable table formats """
  3432. pandas_kind = u'series_table'
  3433. table_type = u'appendable_multiseries'
  3434. def write(self, obj, **kwargs):
  3435. """ we are going to write this as a frame table """
  3436. name = obj.name or 'values'
  3437. obj, self.levels = self.validate_multiindex(obj)
  3438. cols = list(self.levels)
  3439. cols.append(name)
  3440. obj.columns = cols
  3441. return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs)
  3442. class GenericTable(AppendableFrameTable):
  3443. """ a table that read/writes the generic pytables table format """
  3444. pandas_kind = u'frame_table'
  3445. table_type = u'generic_table'
  3446. ndim = 2
  3447. obj_type = DataFrame
  3448. @property
  3449. def pandas_type(self):
  3450. return self.pandas_kind
  3451. @property
  3452. def storable(self):
  3453. return getattr(self.group, 'table', None) or self.group
  3454. def get_attrs(self):
  3455. """ retrieve our attributes """
  3456. self.non_index_axes = []
  3457. self.nan_rep = None
  3458. self.levels = []
  3459. self.index_axes = [a.infer(self)
  3460. for a in self.indexables if a.is_an_indexable]
  3461. self.values_axes = [a.infer(self)
  3462. for a in self.indexables if not a.is_an_indexable]
  3463. self.data_columns = [a.name for a in self.values_axes]
  3464. @property
  3465. def indexables(self):
  3466. """ create the indexables from the table description """
  3467. if self._indexables is None:
  3468. d = self.description
            # the index column is just a simple index
  3470. self._indexables = [GenericIndexCol(name='index', axis=0)]
  3471. for i, n in enumerate(d._v_names):
  3472. dc = GenericDataIndexableCol(
  3473. name=n, pos=i, values=[n], version=self.version)
  3474. self._indexables.append(dc)
  3475. return self._indexables
  3476. def write(self, **kwargs):
        raise NotImplementedError("cannot write on a generic table")
  3478. class AppendableMultiFrameTable(AppendableFrameTable):
  3479. """ a frame with a multi-index """
  3480. table_type = u'appendable_multiframe'
  3481. obj_type = DataFrame
  3482. ndim = 2
  3483. _re_levels = re.compile(r"^level_\d+$")
  3484. @property
  3485. def table_type_short(self):
  3486. return u'appendable_multi'
  3487. def write(self, obj, data_columns=None, **kwargs):
  3488. if data_columns is None:
  3489. data_columns = []
  3490. elif data_columns is True:
  3491. data_columns = obj.columns.tolist()
  3492. obj, self.levels = self.validate_multiindex(obj)
  3493. for n in self.levels:
  3494. if n not in data_columns:
  3495. data_columns.insert(0, n)
  3496. return super(AppendableMultiFrameTable, self).write(
  3497. obj=obj, data_columns=data_columns, **kwargs)
  3498. def read(self, **kwargs):
  3499. df = super(AppendableMultiFrameTable, self).read(**kwargs)
  3500. df = df.set_index(self.levels)
  3501. # remove names for 'level_%d'
  3502. df.index = df.index.set_names([
  3503. None if self._re_levels.search(l) else l for l in df.index.names
  3504. ])
  3505. return df
  3506. def _reindex_axis(obj, axis, labels, other=None):
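    # reindex `axis` of `obj` to the unique `labels`, optionally intersected
    # with `other`; a no-op when the axis already equals the requested labels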
  3507. ax = obj._get_axis(axis)
  3508. labels = ensure_index(labels)
  3509. # try not to reindex even if other is provided
  3510. # if it equals our current index
  3511. if other is not None:
  3512. other = ensure_index(other)
  3513. if (other is None or labels.equals(other)) and labels.equals(ax):
  3514. return obj
  3515. labels = ensure_index(labels.unique())
  3516. if other is not None:
  3517. labels = ensure_index(other.unique()).intersection(labels, sort=False)
  3518. if not labels.equals(ax):
  3519. slicer = [slice(None, None)] * obj.ndim
  3520. slicer[axis] = labels
  3521. obj = obj.loc[tuple(slicer)]
  3522. return obj
  3523. def _get_info(info, name):
  3524. """ get/create the info for this name """
  3525. try:
  3526. idx = info[name]
  3527. except KeyError:
  3528. idx = info[name] = dict()
  3529. return idx
  3530. # tz to/from coercion
  3531. def _get_tz(tz):
  3532. """ for a tz-aware type, return an encoded zone """
  3533. zone = timezones.get_timezone(tz)
  3534. if zone is None:
  3535. zone = tz.utcoffset().total_seconds()
  3536. return zone
  3537. def _set_tz(values, tz, preserve_UTC=False, coerce=False):
  3538. """
  3539. coerce the values to a DatetimeIndex if tz is set
  3540. preserve the input shape if possible
  3541. Parameters
  3542. ----------
  3543. values : ndarray
  3544. tz : string/pickled tz object
    preserve_UTC : boolean, preserve the UTC of the result
  3547. coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
  3548. """
  3549. if tz is not None:
  3550. name = getattr(values, 'name', None)
  3551. values = values.ravel()
  3552. tz = timezones.get_timezone(_ensure_decoded(tz))
  3553. values = DatetimeIndex(values, name=name)
  3554. if values.tz is None:
  3555. values = values.tz_localize('UTC').tz_convert(tz)
  3556. if preserve_UTC:
  3557. if tz == 'UTC':
  3558. values = list(values)
  3559. elif coerce:
  3560. values = np.asarray(values, dtype='M8[ns]')
  3561. return values
  3562. def _convert_index(index, encoding=None, errors='strict', format_type=None):
  3563. index_name = getattr(index, 'name', None)
  3564. if isinstance(index, DatetimeIndex):
  3565. converted = index.asi8
  3566. return IndexCol(converted, 'datetime64', _tables().Int64Col(),
  3567. freq=getattr(index, 'freq', None),
  3568. tz=getattr(index, 'tz', None),
  3569. index_name=index_name)
  3570. elif isinstance(index, TimedeltaIndex):
  3571. converted = index.asi8
  3572. return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
  3573. freq=getattr(index, 'freq', None),
  3574. index_name=index_name)
  3575. elif isinstance(index, (Int64Index, PeriodIndex)):
  3576. atom = _tables().Int64Col()
        # avoid storing an ndarray of Period objects
  3578. return IndexCol(index._ndarray_values, 'integer', atom,
  3579. freq=getattr(index, 'freq', None),
  3580. index_name=index_name)
  3581. if isinstance(index, MultiIndex):
  3582. raise TypeError('MultiIndex not supported here!')
  3583. inferred_type = lib.infer_dtype(index, skipna=False)
  3584. values = np.asarray(index)
  3585. if inferred_type == 'datetime64':
  3586. converted = values.view('i8')
  3587. return IndexCol(converted, 'datetime64', _tables().Int64Col(),
  3588. freq=getattr(index, 'freq', None),
  3589. tz=getattr(index, 'tz', None),
  3590. index_name=index_name)
  3591. elif inferred_type == 'timedelta64':
  3592. converted = values.view('i8')
  3593. return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
  3594. freq=getattr(index, 'freq', None),
  3595. index_name=index_name)
  3596. elif inferred_type == 'datetime':
  3597. converted = np.asarray([(time.mktime(v.timetuple()) +
  3598. v.microsecond / 1E6) for v in values],
  3599. dtype=np.float64)
  3600. return IndexCol(converted, 'datetime', _tables().Time64Col(),
  3601. index_name=index_name)
  3602. elif inferred_type == 'date':
  3603. converted = np.asarray([v.toordinal() for v in values],
  3604. dtype=np.int32)
  3605. return IndexCol(converted, 'date', _tables().Time32Col(),
  3606. index_name=index_name)
  3607. elif inferred_type == 'string':
  3608. # atom = _tables().ObjectAtom()
  3609. # return np.asarray(values, dtype='O'), 'object', atom
  3610. converted = _convert_string_array(values, encoding, errors)
  3611. itemsize = converted.dtype.itemsize
  3612. return IndexCol(
  3613. converted, 'string', _tables().StringCol(itemsize),
  3614. itemsize=itemsize, index_name=index_name
  3615. )
  3616. elif inferred_type == 'unicode':
  3617. if format_type == 'fixed':
  3618. atom = _tables().ObjectAtom()
  3619. return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
  3620. index_name=index_name)
  3621. raise TypeError(
  3622. "[unicode] is not supported as a in index type for [{0}] formats"
  3623. .format(format_type)
  3624. )
  3625. elif inferred_type == 'integer':
  3626. # take a guess for now, hope the values fit
  3627. atom = _tables().Int64Col()
  3628. return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom,
  3629. index_name=index_name)
  3630. elif inferred_type == 'floating':
  3631. atom = _tables().Float64Col()
  3632. return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom,
  3633. index_name=index_name)
  3634. else: # pragma: no cover
  3635. atom = _tables().ObjectAtom()
  3636. return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
  3637. index_name=index_name)
  3638. def _unconvert_index(data, kind, encoding=None, errors='strict'):
  3639. kind = _ensure_decoded(kind)
  3640. if kind == u'datetime64':
  3641. index = DatetimeIndex(data)
  3642. elif kind == u'timedelta64':
  3643. index = TimedeltaIndex(data)
  3644. elif kind == u'datetime':
  3645. index = np.asarray([datetime.fromtimestamp(v) for v in data],
  3646. dtype=object)
  3647. elif kind == u'date':
  3648. try:
  3649. index = np.asarray(
  3650. [date.fromordinal(v) for v in data], dtype=object)
        except ValueError:
  3652. index = np.asarray(
  3653. [date.fromtimestamp(v) for v in data], dtype=object)
  3654. elif kind in (u'integer', u'float'):
  3655. index = np.asarray(data)
    elif kind == u'string':
  3657. index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
  3658. errors=errors)
  3659. elif kind == u'object':
  3660. index = np.asarray(data[0])
  3661. else: # pragma: no cover
  3662. raise ValueError('unrecognized index type {kind}'.format(kind=kind))
  3663. return index
  3664. def _unconvert_index_legacy(data, kind, legacy=False, encoding=None,
  3665. errors='strict'):
  3666. kind = _ensure_decoded(kind)
  3667. if kind == u'datetime':
  3668. index = to_datetime(data)
    elif kind == u'integer':
  3670. index = np.asarray(data, dtype=object)
    elif kind == u'string':
  3672. index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
  3673. errors=errors)
  3674. else: # pragma: no cover
  3675. raise ValueError('unrecognized index type {kind}'.format(kind=kind))
  3676. return index
  3677. def _convert_string_array(data, encoding, errors, itemsize=None):
  3678. """
  3679. we take a string-like that is object dtype and coerce to a fixed size
  3680. string type
  3681. Parameters
  3682. ----------
  3683. data : a numpy array of object dtype
  3684. encoding : None or string-encoding
  3685. errors : handler for encoding errors
  3686. itemsize : integer, optional, defaults to the max length of the strings
  3687. Returns
  3688. -------
  3689. data in a fixed-length string dtype, encoded to bytes if needed
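
    Examples
    --------
    A minimal sketch (reprs assume Python 3):

    >>> _convert_string_array(np.array(['a', 'bb'], dtype=object),
    ...                       'utf-8', 'strict')
    array([b'a', b'bb'], dtype='|S2')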
  3690. """
  3691. # encode if needed
  3692. if encoding is not None and len(data):
  3693. data = Series(data.ravel()).str.encode(
  3694. encoding, errors).values.reshape(data.shape)
  3695. # create the sized dtype
  3696. if itemsize is None:
  3697. ensured = ensure_object(data.ravel())
  3698. itemsize = max(1, libwriters.max_len_string_array(ensured))
  3699. data = np.asarray(data, dtype="S{size}".format(size=itemsize))
  3700. return data
  3701. def _unconvert_string_array(data, nan_rep=None, encoding=None,
  3702. errors='strict'):
  3703. """
  3704. inverse of _convert_string_array
  3705. Parameters
  3706. ----------
  3707. data : fixed length string dtyped array
  3708. nan_rep : the storage repr of NaN, optional
  3709. encoding : the encoding of the data, optional
  3710. errors : handler for encoding errors, default 'strict'
  3711. Returns
  3712. -------
  3713. an object array of the decoded data
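
    Examples
    --------
    A minimal sketch of the inverse round trip (reprs assume Python 3):

    >>> _unconvert_string_array(np.array([b'a', b'bb'], dtype='S2'))
    array(['a', 'bb'], dtype=object)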
  3714. """
  3715. shape = data.shape
  3716. data = np.asarray(data.ravel(), dtype=object)
  3717. # guard against a None encoding in PY3 (because of a legacy
  3718. # where the passed encoding is actually None)
  3719. encoding = _ensure_encoding(encoding)
  3720. if encoding is not None and len(data):
  3721. itemsize = libwriters.max_len_string_array(ensure_object(data))
  3722. if compat.PY3:
  3723. dtype = "U{0}".format(itemsize)
  3724. else:
  3725. dtype = "S{0}".format(itemsize)
  3726. if isinstance(data[0], compat.binary_type):
  3727. data = Series(data).str.decode(encoding, errors=errors).values
  3728. else:
  3729. data = data.astype(dtype, copy=False).astype(object, copy=False)
  3730. if nan_rep is None:
  3731. nan_rep = 'nan'
  3732. data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)
  3733. return data.reshape(shape)
  3734. def _maybe_convert(values, val_kind, encoding, errors):
  3735. if _need_convert(val_kind):
  3736. conv = _get_converter(val_kind, encoding, errors)
  3737. # conv = np.frompyfunc(conv, 1, 1)
  3738. values = conv(values)
  3739. return values
  3740. def _get_converter(kind, encoding, errors):
  3741. kind = _ensure_decoded(kind)
  3742. if kind == 'datetime64':
  3743. return lambda x: np.asarray(x, dtype='M8[ns]')
  3744. elif kind == 'datetime':
  3745. return lambda x: to_datetime(x, cache=True).to_pydatetime()
  3746. elif kind == 'string':
  3747. return lambda x: _unconvert_string_array(x, encoding=encoding,
  3748. errors=errors)
  3749. else: # pragma: no cover
  3750. raise ValueError('invalid kind {kind}'.format(kind=kind))
  3751. def _need_convert(kind):
  3752. kind = _ensure_decoded(kind)
  3753. if kind in (u'datetime', u'datetime64', u'string'):
  3754. return True
  3755. return False
  3756. class Selection(object):
  3757. """
  3758. Carries out a selection operation on a tables.Table object.
  3759. Parameters
  3760. ----------
  3761. table : a Table object
  3762. where : list of Terms (or convertible to)
  3763. start, stop: indices to start and/or stop selection
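
    Notes
    -----
    A sketch of typical internal use, where ``tbl`` is a ``Table`` storer:

    >>> sel = Selection(tbl, where='index > 5')   # doctest: +SKIP
    >>> coords = sel.select_coords()              # doctest: +SKIP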
  3764. """
  3765. def __init__(self, table, where=None, start=None, stop=None):
  3766. self.table = table
  3767. self.where = where
  3768. self.start = start
  3769. self.stop = stop
  3770. self.condition = None
  3771. self.filter = None
  3772. self.terms = None
  3773. self.coordinates = None
  3774. if is_list_like(where):
            # see if we were passed coordinate-like values
  3776. try:
  3777. inferred = lib.infer_dtype(where, skipna=False)
  3778. if inferred == 'integer' or inferred == 'boolean':
  3779. where = np.asarray(where)
  3780. if where.dtype == np.bool_:
  3781. start, stop = self.start, self.stop
  3782. if start is None:
  3783. start = 0
  3784. if stop is None:
  3785. stop = self.table.nrows
  3786. self.coordinates = np.arange(start, stop)[where]
  3787. elif issubclass(where.dtype.type, np.integer):
  3788. if ((self.start is not None and
  3789. (where < self.start).any()) or
  3790. (self.stop is not None and
  3791. (where >= self.stop).any())):
  3792. raise ValueError(
  3793. "where must have index locations >= start and "
  3794. "< stop"
  3795. )
  3796. self.coordinates = where
  3797. except ValueError:
  3798. pass
  3799. if self.coordinates is None:
  3800. self.terms = self.generate(where)
  3801. # create the numexpr & the filter
  3802. if self.terms is not None:
  3803. self.condition, self.filter = self.terms.evaluate()
  3804. def generate(self, where):
  3805. """ where can be a : dict,list,tuple,string """
  3806. if where is None:
  3807. return None
  3808. q = self.table.queryables()
  3809. try:
  3810. return Expr(where, queryables=q, encoding=self.table.encoding)
  3811. except NameError:
  3812. # raise a nice message, suggesting that the user should use
  3813. # data_columns
  3814. raise ValueError(
  3815. "The passed where expression: {0}\n"
  3816. " contains an invalid variable reference\n"
  3817. " all of the variable references must be a "
  3818. "reference to\n"
  3819. " an axis (e.g. 'index' or 'columns'), or a "
  3820. "data_column\n"
  3821. " The currently defined references are: {1}\n"
  3822. .format(where, ','.join(q.keys()))
  3823. )
  3824. def select(self):
  3825. """
  3826. generate the selection
  3827. """
  3828. if self.condition is not None:
  3829. return self.table.table.read_where(self.condition.format(),
  3830. start=self.start,
  3831. stop=self.stop)
  3832. elif self.coordinates is not None:
  3833. return self.table.table.read_coordinates(self.coordinates)
  3834. return self.table.table.read(start=self.start, stop=self.stop)
  3835. def select_coords(self):
  3836. """
  3837. generate the selection
  3838. """
  3839. start, stop = self.start, self.stop
  3840. nrows = self.table.nrows
  3841. if start is None:
  3842. start = 0
  3843. elif start < 0:
  3844. start += nrows
        if stop is None:
  3846. stop = nrows
  3847. elif stop < 0:
  3848. stop += nrows
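        # (sketch) negative bounds follow Python slicing: with nrows=100,
        # start=-10 becomes 90, selecting the last ten rows [90, 100)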
  3849. if self.condition is not None:
  3850. return self.table.table.get_where_list(self.condition.format(),
  3851. start=start, stop=stop,
  3852. sort=True)
  3853. elif self.coordinates is not None:
  3854. return self.coordinates
  3855. return np.arange(start, stop)