/pandas/io/pytables.py
Possible License(s): BSD-3-Clause, Apache-2.0

# pylint: disable-msg=E1101,W0613,W0603
"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""

import copy
from datetime import date, datetime
from distutils.version import LooseVersion
import itertools
import os
import re
import time
import warnings

import numpy as np

from pandas._libs import lib, writers as libwriters
from pandas._libs.tslibs import timezones

from pandas.compat import PY3, filter, lrange, range, string_types
from pandas.errors import PerformanceWarning

from pandas.core.dtypes.common import (
    ensure_object, is_categorical_dtype, is_datetime64_dtype,
    is_datetime64tz_dtype, is_list_like, is_timedelta64_dtype)
from pandas.core.dtypes.missing import array_equivalent

from pandas import (
    DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, PeriodIndex,
    Series, SparseDataFrame, SparseSeries, TimedeltaIndex, compat, concat,
    isna, to_datetime)
from pandas.core import config
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.sparse import BlockIndex, IntIndex
from pandas.core.base import StringMixin
import pandas.core.common as com
from pandas.core.computation.pytables import Expr, maybe_expression
from pandas.core.config import get_option
from pandas.core.index import ensure_index
from pandas.core.internals import BlockManager, _block_shape, make_block

from pandas.io.common import _stringify_path
from pandas.io.formats.printing import adjoin, pprint_thing

# versioning attribute
_version = '0.15.2'

# encoding
# PY3 encoding if we don't specify
_default_encoding = 'UTF-8'


def _ensure_decoded(s):
    """ if we have bytes, decode them to unicode """
    if isinstance(s, np.bytes_):
        s = s.decode('UTF-8')
    return s


def _ensure_encoding(encoding):
    # set the encoding if we need
    if encoding is None:
        if PY3:
            encoding = _default_encoding
    return encoding


def _ensure_str(name):
    """Ensure that an index / column name is a str (python 3) or
    unicode (python 2); otherwise they may be np.string dtype.
    Non-string dtypes are passed through unchanged.

    https://github.com/pandas-dev/pandas/issues/13492
    """
    if isinstance(name, compat.string_types):
        name = compat.text_type(name)
    return name


Term = Expr


def _ensure_term(where, scope_level):
    """
    ensure that the where is a Term or a list of Term
    this makes sure that we are capturing the scope of variables
    that are passed
    create the terms here with a frame_level=2 (we are 2 levels down)
    """

    # only consider list/tuple here as an ndarray is automatically a coordinate
    # list
    level = scope_level + 1
    if isinstance(where, (list, tuple)):
        wlist = []
        for w in filter(lambda x: x is not None, where):
            if not maybe_expression(w):
                wlist.append(w)
            else:
                wlist.append(Term(w, scope_level=level))
        where = wlist
    elif maybe_expression(where):
        where = Term(where, scope_level=level)
    return where


class PossibleDataLossError(Exception):
    pass


class ClosedFileError(Exception):
    pass


class IncompatibilityWarning(Warning):
    pass


incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""


class AttributeConflictWarning(Warning):
    pass


attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""


class DuplicateWarning(Warning):
    pass


duplicate_doc = """
duplicate entries in table, taking most recently appended
"""

performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {
    u'f': 'fixed',
    u'fixed': 'fixed',
    u't': 'table',
    u'table': 'table',
}

format_deprecate_doc = """
the table keyword has been deprecated
use the format='fixed(f)|table(t)' keyword instead
  fixed(f) : specifies the Fixed format
             and is the default for put operations
  table(t) : specifies the Table format
             and is the default for append operations
"""

# map object types
_TYPE_MAP = {
    Series: u'series',
    SparseSeries: u'sparse_series',
    DataFrame: u'frame',
    SparseDataFrame: u'sparse_frame',
}

# storer class map
_STORER_MAP = {
    u'Series': 'LegacySeriesFixed',
    u'DataFrame': 'LegacyFrameFixed',
    u'DataMatrix': 'LegacyFrameFixed',
    u'series': 'SeriesFixed',
    u'sparse_series': 'SparseSeriesFixed',
    u'frame': 'FrameFixed',
    u'sparse_frame': 'SparseFrameFixed',
}

# table class map
_TABLE_MAP = {
    u'generic_table': 'GenericTable',
    u'appendable_series': 'AppendableSeriesTable',
    u'appendable_multiseries': 'AppendableMultiSeriesTable',
    u'appendable_frame': 'AppendableFrameTable',
    u'appendable_multiframe': 'AppendableMultiFrameTable',
    u'worm': 'WORMTable',
}

# axes map
_AXES_MAP = {
    DataFrame: [0],
}

# register our configuration options
dropna_doc = """
: boolean
    drop ALL nan rows when appending to a table
"""
format_doc = """
: format
    default format writing format, if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix('io.hdf'):
    config.register_option('dropna_table', False, dropna_doc,
                           validator=config.is_bool)
    config.register_option(
        'default_format', None, format_doc,
        validator=config.is_one_of_factory(['fixed', 'table', None])
    )

# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False


def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables
        _table_mod = tables

        # version requirements
        if LooseVersion(tables.__version__) < LooseVersion('3.0.0'):
            raise ImportError("PyTables version >= 3.0.0 is required")

        # set the file open policy
        # return the file open policy; this changes as of pytables 3.1
        # depending on the HDF5 version
        try:
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == 'strict')
        except AttributeError:
            pass

    return _table_mod


# interface to/from ###

def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
           append=None, **kwargs):
    """ store this object, close it if we opened it """
    if append:
        f = lambda store: store.append(key, value, **kwargs)
    else:
        f = lambda store: store.put(key, value, **kwargs)

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, string_types):
        with HDFStore(path_or_buf, mode=mode, complevel=complevel,
                      complib=complib) as store:
            f(store)
    else:
        f(path_or_buf)
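
# Illustrative round trip through to_hdf (a sketch; the file name and the
# frame below are hypothetical, not part of this module):
#
#   >>> df = DataFrame({'a': range(5)})
#   >>> df.to_hdf('store.h5', 'df', mode='w', format='table')
#   >>> df.to_hdf('store.h5', 'df', append=True)  # append implies table format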


def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria

    Parameters
    ----------
    path_or_buf : string, buffer or path object
        Path to the file to open, or an open :class:`pandas.HDFStore` object.
        Supports any object implementing the ``__fspath__`` protocol.
        This includes :class:`pathlib.Path` and py._path.local.LocalPath
        objects.

        .. versionadded:: 0.19.0 support for pathlib, py.path.
        .. versionadded:: 0.21.0 support for __fspath__ protocol.

    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, optional
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of columns names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    item : object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    DataFrame.to_hdf : Write a HDF file from a DataFrame.
    HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
    >>> df.to_hdf('./store.h5', 'data')
    >>> reread = pd.read_hdf('./store.h5')
    """

    if mode not in ['r', 'r+', 'a']:
        raise ValueError('mode {0} is not allowed while performing a read. '
                         'Allowed modes are r, r+ and a.'.format(mode))

    # grab the scope
    if 'where' in kwargs:
        kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise IOError('The HDFStore must be open for reading.')

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = _stringify_path(path_or_buf)
        if not isinstance(path_or_buf, string_types):
            raise NotImplementedError('Support for generic buffers has not '
                                      'been implemented.')
        try:
            exists = os.path.exists(path_or_buf)

        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise compat.FileNotFoundError(
                'File {path} does not exist'.format(path=path_or_buf))

        store = HDFStore(path_or_buf, mode=mode, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError('No dataset in HDF5 file.')
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError('key must be provided when HDF5 file '
                                     'contains multiple datasets.')
            key = candidate_only_group._v_pathname
        return store.select(key, auto_close=auto_close, **kwargs)
    except (ValueError, TypeError):
        # if there is an error, close the store
        try:
            store.close()
        except AttributeError:
            pass

        raise
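
# Illustrative where-based reads (a sketch; the file name, key and column
# names are hypothetical, and 'df' is assumed to be stored in table format):
#
#   >>> read_hdf('store.h5', 'df', where='index > 5', columns=['a'])
#   >>> read_hdf('store.h5', 'df', start=0, stop=100)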


def _is_metadata_of(group, parent_group):
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == 'meta':
            return True
        current = current._v_parent
    return False


class HDFStore(StringMixin):

    """
    Dict-like IO interface for storing pandas objects in PyTables
    either Fixed or Table format.

    Parameters
    ----------
    path : string
        File path to HDF5 file
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available issues
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()
    """

    def __init__(self, path, mode=None, complevel=None, complib=None,
                 fletcher32=False, **kwargs):

        if 'format' in kwargs:
            raise ValueError('format is not a defined argument for HDFStore')

        try:
            import tables  # noqa
        except ImportError as ex:  # pragma: no cover
            raise ImportError('HDFStore requires PyTables, "{ex!s}" problem '
                              'importing'.format(ex=ex))

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                "complib only supports {libs} compression.".format(
                    libs=tables.filters.all_complibs))

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = _stringify_path(path)
        if mode is None:
            mode = 'a'
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self):
        return self._path

    @property
    def root(self):
        """ return the root node """
        self._check_if_open()
        return self._handle.root

    @property
    def filename(self):
        return self._path

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.put(key, value)

    def __delitem__(self, key):
        return self.remove(key)

    def __getattr__(self, name):
        """ allow attribute access to get stores """
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            "'{object}' object has no attribute '{name}'".format(
                object=type(self).__name__, name=name))

    def __contains__(self, key):
        """ check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if name == key or name[1:] == key:
                return True
        return False

    def __len__(self):
        return len(self.groups())

    def __unicode__(self):
        return '{type}\nFile path: {path}\n'.format(
            type=type(self), path=pprint_thing(self._path))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def keys(self):
        """
        Return a (potentially unordered) list of the keys corresponding to the
        objects stored in the HDFStore. These are ABSOLUTE path-names (e.g.
        have the leading '/').
        """
        return [n._v_pathname for n in self.groups()]

    def __iter__(self):
        return iter(self.keys())

    def items(self):
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    iteritems = items
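
    # Dict-like access in practice (an illustrative sketch; 'store', 'frame'
    # and the key 'df' are hypothetical):
    #
    #   >>> 'df' in store        # __contains__ matches with/without leading '/'
    #   >>> list(store)          # __iter__ yields absolute path-names
    #   >>> store['df'] = frame  # __setitem__ delegates to put()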

    def open(self, mode='a', **kwargs):
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        """
        tables = _tables()

        if self._mode != mode:

            # if we are changing a write mode to read, ok
            if self._mode in ['a', 'w'] and mode in ['r', 'r+']:
                pass
            elif mode in ['w']:

                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        "Re-opening the file [{0}] with mode [{1}] "
                        "will delete the current file!"
                        .format(self._path, self._mode)
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(self._complevel, self._complib,
                                              fletcher32=self._fletcher32)

        try:
            self._handle = tables.open_file(self._path, self._mode, **kwargs)
        except (IOError) as e:  # pragma: no cover
            if 'can not be written' in str(e):
                print(
                    'Opening {path} in read-only mode'.format(path=self._path))
                self._handle = tables.open_file(self._path, 'r', **kwargs)
            else:
                raise

        except (ValueError) as e:

            # trap PyTables >= 3.1 FILE_OPEN_POLICY exception
            # to provide an updated message
            if 'FILE_OPEN_POLICY' in str(e):
                e = ValueError(
                    "PyTables [{version}] no longer supports opening multiple "
                    "files\n"
                    "even in read-only mode on this HDF5 version "
                    "[{hdf_version}]. You can accept this\n"
                    "and not open the same file multiple times at once,\n"
                    "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
                    "which allows\n"
                    "files to be opened multiple times at once\n"
                    .format(version=tables.__version__,
                            hdf_version=tables.get_hdf5_version()))
            raise e

        except (Exception) as e:

            # trying to read from a non-existent file causes an error which
            # is not part of IOError, make it one
            if self._mode == 'r' and 'Unable to open/create file' in str(e):
                raise IOError(str(e))
            raise

    def close(self):
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self):
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync=False):
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                try:
                    os.fsync(self._handle.fileno())
                except OSError:
                    pass

    def get(self, key):
        """
        Retrieve pandas object stored in file

        Parameters
        ----------
        key : object

        Returns
        -------
        obj : same type as object stored in file
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError('No object named {key} in the file'.format(key=key))
        return self._read_group(group)

    def select(self, key, where=None, start=None, stop=None, columns=None,
               iterator=False, chunksize=None, auto_close=False, **kwargs):
        """
        Retrieve pandas object stored in file, optionally based on where
        criteria

        Parameters
        ----------
        key : object
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        columns : a list of columns that if not None, will limit the return
            columns
        iterator : boolean, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator
        auto_close : boolean, should automatically close the store when
            finished, default is False

        Returns
        -------
        The selected object
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError('No object named {key} in the file'.format(key=key))

        # create the storer and axes
        where = _ensure_term(where, scope_level=1)
        s = self._create_storer(group)
        s.infer_axes()

        # function to call on iteration
        def func(_start, _stop, _where):
            return s.read(start=_start, stop=_stop,
                          where=_where,
                          columns=columns)

        # create the iterator
        it = TableIterator(self, s, func, where=where, nrows=s.nrows,
                           start=start, stop=stop, iterator=iterator,
                           chunksize=chunksize, auto_close=auto_close)

        return it.get_result()
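
    # Chunked selection sketch (assumes 'df' was written in table format;
    # names here are illustrative):
    #
    #   >>> for chunk in store.select('df', chunksize=50000):
    #   ...     pass  # each chunk is a DataFrame of at most 50000 rows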

    def select_as_coordinates(
            self, key, where=None, start=None, stop=None, **kwargs):
        """
        return the selection as an Index

        Parameters
        ----------
        key : object
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        """
        where = _ensure_term(where, scope_level=1)
        return self.get_storer(key).read_coordinates(where=where, start=start,
                                                     stop=stop, **kwargs)

    def select_column(self, key, column, **kwargs):
        """
        return a single column from the table. This is generally only useful to
        select an indexable

        Parameters
        ----------
        key : object
        column : the column of interest

        Exceptions
        ----------
        raises KeyError if the column is not found (or key is not a valid
            store)
        raises ValueError if the column can not be extracted individually (it
            is part of a data block)

        """
        return self.get_storer(key).read_column(column=column, **kwargs)
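
    # A single indexable can be read cheaply and used to build coordinates
    # for a later select (an illustrative sketch; names and the threshold
    # are hypothetical):
    #
    #   >>> c = store.select_column('df', 'index')
    #   >>> coords = c[c > 1000].index
    #   >>> store.select('df', where=coords)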

    def select_as_multiple(self, keys, where=None, selector=None, columns=None,
                           start=None, stop=None, iterator=False,
                           chunksize=None, auto_close=False, **kwargs):
        """ Retrieve pandas objects from multiple tables

        Parameters
        ----------
        keys : a list of the tables
        selector : the table to apply the where criteria (defaults to keys[0]
            if not supplied)
        columns : the columns I want back
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        iterator : boolean, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator

        Exceptions
        ----------
        raises KeyError if keys or selector is not found or keys is empty
        raises TypeError if keys is not a list or tuple
        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
        """

        # default to single select
        where = _ensure_term(where, scope_level=1)
        if isinstance(keys, (list, tuple)) and len(keys) == 1:
            keys = keys[0]
        if isinstance(keys, string_types):
            return self.select(key=keys, where=where, columns=columns,
                               start=start, stop=stop, iterator=iterator,
                               chunksize=chunksize, **kwargs)

        if not isinstance(keys, (list, tuple)):
            raise TypeError("keys must be a list/tuple")

        if not len(keys):
            raise ValueError("keys must have a non-zero length")

        if selector is None:
            selector = keys[0]

        # collect the tables
        tbls = [self.get_storer(k) for k in keys]
        s = self.get_storer(selector)

        # validate rows
        nrows = None
        for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
            if t is None:
                raise KeyError("Invalid table [{key}]".format(key=k))
            if not t.is_table:
                raise TypeError(
                    "object [{obj}] is not a table, and cannot be used in "
                    "select_as_multiple".format(obj=t.pathname)
                )

            if nrows is None:
                nrows = t.nrows
            elif t.nrows != nrows:
                raise ValueError(
                    "all tables must have exactly the same nrows!")

        # axis is the concatenation axis
        axis = list({t.non_index_axes[0][0] for t in tbls})[0]

        def func(_start, _stop, _where):

            # retrieve the objs, _where is always passed as a set of
            # coordinates here
            objs = [t.read(where=_where, columns=columns, start=_start,
                           stop=_stop, **kwargs) for t in tbls]

            # concat and return
            return concat(objs, axis=axis,
                          verify_integrity=False)._consolidate()

        # create the iterator
        it = TableIterator(self, s, func, where=where, nrows=nrows,
                           start=start, stop=stop, iterator=iterator,
                           chunksize=chunksize, auto_close=auto_close)

        return it.get_result(coordinates=True)
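
    # Multi-table selection sketch: 'df1' holds some columns, 'df2' the rest,
    # with rows aligned across both; the where criteria run against the
    # selector table (names are illustrative):
    #
    #   >>> store.select_as_multiple(['df1', 'df2'], where='index > 5',
    #   ...                          selector='df1')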

    def put(self, key, value, format=None, append=False, **kwargs):
        """
        Store object in HDFStore

        Parameters
        ----------
        key : object
        value : {Series, DataFrame}
        format : 'fixed(f)|table(t)', default is 'fixed'
            fixed(f) : Fixed format
                       Fast writing/reading. Not-appendable, nor searchable
            table(t) : Table format
                       Write as a PyTables Table structure which may perform
                       worse but allow more flexible operations like searching
                       / selecting subsets of the data
        append : boolean, default False
            This will force Table format, append the input data to the
            existing.
        data_columns : list of columns to create as data columns, or True to
            use all columns. See
            `here <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__  # noqa
        encoding : default None, provide an encoding for strings
        dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'
        """
        if format is None:
            format = get_option("io.hdf.default_format") or 'fixed'
        kwargs = self._validate_format(format, kwargs)
        self._write_to_group(key, value, append=append, **kwargs)

    def remove(self, key, where=None, start=None, stop=None):
        """
        Remove pandas object partially by specifying the where condition

        Parameters
        ----------
        key : string
            Node to remove or delete rows from
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection

        Returns
        -------
        number of rows removed (or None if not a Table)

        Exceptions
        ----------
        raises KeyError if key is not a valid store

        """
        where = _ensure_term(where, scope_level=1)
        try:
            s = self.get_storer(key)
        except KeyError:
            # the key is not a valid store, re-raising KeyError
            raise
        except Exception:

            if where is not None:
                raise ValueError(
                    "trying to remove a node with a non-None where clause!")

            # we are actually trying to remove a node (with children)
            s = self.get_node(key)
            if s is not None:
                s._f_remove(recursive=True)
                return None

        # remove the node
        if com._all_none(where, start, stop):
            s.group._f_remove(recursive=True)

        # delete from the table
        else:
            if not s.is_table:
                raise ValueError(
                    'can only remove with where on objects written as tables')
            return s.delete(where=where, start=start, stop=stop)
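
    # Deletion sketch (illustrative names): a where clause requires a
    # table-format node, while a bare remove drops any node outright:
    #
    #   >>> store.remove('df', where='index > 1000')  # returns rows removed
    #   >>> store.remove('df')                        # removes the whole node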

    def append(self, key, value, format=None, append=True, columns=None,
               dropna=None, **kwargs):
        """
        Append to Table in file. Node must already exist and be Table
        format.

        Parameters
        ----------
        key : object
        value : {Series, DataFrame}
        format : 'table' is the default
            table(t) : table format
                       Write as a PyTables Table structure which may perform
                       worse but allow more flexible operations like searching
                       / selecting subsets of the data
        append : boolean, default True, append the input data to the
            existing
        data_columns : list of columns, or True, default None
            List of columns to create as indexed data columns for on-disk
            queries, or True to use all columns. By default only the axes
            of the object are indexed. See `here
            <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.
        min_itemsize : dict of columns that specify minimum string sizes
        nan_rep : string to use as string nan representation
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for strings
        dropna : boolean, default False, do not write an ALL nan row to
            the store settable by the option 'io.hdf.dropna_table'

        Notes
        -----
        Does *not* check if data being appended overlaps with existing
        data in the table, so be careful
        """
        if columns is not None:
            raise TypeError("columns is not a supported keyword in append, "
                            "try data_columns")

        if dropna is None:
            dropna = get_option("io.hdf.dropna_table")
        if format is None:
            format = get_option("io.hdf.default_format") or 'table'
        kwargs = self._validate_format(format, kwargs)
        self._write_to_group(key, value, append=append, dropna=dropna,
                             **kwargs)
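
    # Append sketch: queryable columns and minimum string sizes are declared
    # at write time (the column names and size here are illustrative):
    #
    #   >>> store.append('df', frame, data_columns=['a'],
    #   ...              min_itemsize={'b': 30})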

    def append_to_multiple(self, d, value, selector, data_columns=None,
                           axes=None, dropna=False, **kwargs):
        """
        Append to multiple tables

        Parameters
        ----------
        d : a dict of table_name to table_columns, None is acceptable as the
            values of one node (this will get all the remaining columns)
        value : a pandas object
        selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
            passed, in which case these are used
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
            row in each table has all NaN. Default False.

        Notes
        -----
        axes parameter is currently not accepted

        """
        if axes is not None:
            raise TypeError("axes is currently not accepted as a parameter to"
                            " append_to_multiple; you can create the "
                            "tables independently instead")

        if not isinstance(d, dict):
            raise ValueError(
                "append_to_multiple must have a dictionary specified as the "
                "way to split the value"
            )

        if selector not in d:
            raise ValueError(
                "append_to_multiple requires a selector that is in passed dict"
            )

        # figure out the splitting axis (the non_index_axis)
        axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]

        # figure out how to split the value
        remain_key = None
        remain_values = []
        for k, v in d.items():
            if v is None:
                if remain_key is not None:
                    raise ValueError(
                        "append_to_multiple can only have one value in d that "
                        "is None"
                    )
                remain_key = k
            else:
                remain_values.extend(v)
        if remain_key is not None:
            ordered = value.axes[axis]
            ordd = ordered.difference(Index(remain_values))
            ordd = sorted(ordered.get_indexer(ordd))
            d[remain_key] = ordered.take(ordd)

        # data_columns
        if data_columns is None:
            data_columns = d[selector]

        # ensure rows are synchronized across the tables
        if dropna:
            idxs = (value[cols].dropna(how='all').index for cols in d.values())
            valid_index = next(idxs)
            for index in idxs:
                valid_index = valid_index.intersection(index)
            value = value.loc[valid_index]

        # append
        for k, v in d.items():
            dc = data_columns if k == selector else None

            # compute the val
            val = value.reindex(v, axis=axis)

            self.append(k, val, data_columns=dc, **kwargs)
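
    # Splitting one frame across two tables (an illustrative sketch): the
    # table mapped to None receives the remaining columns, and selection is
    # driven by the selector table:
    #
    #   >>> store.append_to_multiple(
    #   ...     {'df1_table': ['a', 'b'], 'df2_table': None},
    #   ...     frame, selector='df1_table')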

    def create_table_index(self, key, **kwargs):
        """ Create a pytables index on the table

        Parameters
        ----------
        key : object (the node to index)

        Exceptions
        ----------
        raises if the node is not a table

        """

        # version requirements
        _tables()
        s = self.get_storer(key)
        if s is None:
            return

        if not s.is_table:
            raise TypeError(
                "cannot create table index on a Fixed format store")
        s.create_index(**kwargs)

    def groups(self):
        """return a list of all the top-level nodes (that are not themselves a
        pandas storage object)
        """
        _tables()
        self._check_if_open()
        return [
            g for g in self._handle.walk_groups()
            if (not isinstance(g, _table_mod.link.Link) and
                (getattr(g._v_attrs, 'pandas_type', None) or
                 getattr(g, 'table', None) or
                 (isinstance(g, _table_mod.table.Table) and
                  g._v_name != u'table')))
        ]

    def walk(self, where="/"):
        """ Walk the pytables group hierarchy for pandas objects

        This generator will yield the group path, subgroups and pandas object
        names for each group.
        Any non-pandas PyTables objects that are not a group will be ignored.

        The `where` group itself is listed first (preorder), then each of its
        child groups (following an alphanumerical order) is also traversed,
        following the same procedure.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        where : str, optional
            Group where to start walking.
            If not supplied, the root group is used.

        Yields
        ------
        path : str
            Full path to a group (without trailing '/')
        groups : list of str
            names of the groups contained in `path`
        leaves : list of str
            names of the pandas objects contained in `path`
        """
        _tables()
        self._check_if_open()
        for g in self._handle.walk_groups(where):
            if getattr(g._v_attrs, 'pandas_type', None) is not None:
                continue

            groups = []
            leaves = []
            for child in g._v_children.values():
                pandas_type = getattr(child._v_attrs, 'pandas_type', None)
                if pandas_type is None:
                    if isinstance(child, _table_mod.group.Group):
                        groups.append(child._v_name)
                else:
                    leaves.append(child._v_name)

            yield (g._v_pathname.rstrip('/'), groups, leaves)
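
    # Walking the hierarchy, in the spirit of os.walk (illustrative):
    #
    #   >>> for path, groups, leaves in store.walk():
    #   ...     for leaf in leaves:
    #   ...         obj = store.get('/'.join([path, leaf]))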

    def get_node(self, key):
        """ return the node with the key or None if it does not exist """
        self._check_if_open()
        try:
            if not key.startswith('/'):
                key = '/' + key
            return self._handle.get_node(self.root, key)
        except _table_mod.exceptions.NoSuchNodeError:
            return None

    def get_storer(self, key):
        """ return the storer object for a key, raise if not in the file """
        group = self.get_node(key)
        if group is None:
            raise KeyError('No object named {key} in the file'.format(key=key))
        s = self._create_storer(group)
        s.infer_axes()
        return s

    def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
             complevel=None, fletcher32=False, overwrite=True):
        """ copy the existing store to a new file, upgrading in place

        Parameters
        ----------
        propindexes : restore indexes in copied file (defaults to True)
        keys : list of keys to include in the copy (defaults to all)
        overwrite : overwrite (remove and replace) existing nodes in the
            new store (default is True)
        mode, complib, complevel, fletcher32 same as in HDFStore.__init__

        Returns
        -------
        open file handle of the new store

        """
        new_store = HDFStore(
            file,
            mode=mode,
            complib=complib,
            complevel=complevel,
            fletcher32=fletcher32)
        if keys is None:
            keys = list(self.keys())
        if not isinstance(keys, (tuple, list)):
            keys = [keys]
        for k in keys:
            s = self.get_storer(k)
            if s is not None:

                if k in new_store:
                    if overwrite:
                        new_store.remove(k)

                data = self.select(k)
                if s.is_table:

                    index = False
                    if propindexes:
                        index = [a.name for a in s.axes if a.is_indexed]
                    new_store.append(
                        k, data, index=index,
                        data_columns=getattr(s, 'data_columns', None),
                        encoding=s.encoding
                    )
                else:
                    new_store.put(k, data, encoding=s.encoding)

        return new_store
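
    # Copying into a new, compressed file (illustrative; the returned handle
    # is open and should be closed by the caller):
    #
    #   >>> new_store = store.copy('copy.h5', complib='blosc', complevel=9)
    #   >>> new_store.close()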

    def info(self):
        """
        Print detailed information on the store.

        .. versionadded:: 0.21.0
        """
        output = '{type}\nFile path: {path}\n'.format(
            type=type(self), path=pprint_thing(self._path))

        if self.is_open:
            lkeys = sorted(list(self.keys()))
            if len(lkeys):
                keys = []
                values = []

                for k in lkeys:
                    try:
                        s = self.get_storer(k)
                        if s is not None:
                            keys.append(pprint_thing(s.pathname or k))
                            values.append(
                                pprint_thing(s or 'invalid_HDFStore node'))
                    except Exception as detail:
                        keys.append(k)
                        values.append(
                            "[invalid_HDFStore node: {detail}]".format(
                                detail=pprint_thing(detail)))

                output += adjoin(12, keys, values)
            else:
                output += 'Empty'
        else:
            output += "File is CLOSED"

        return output

    # private methods ######

    def _check_if_open(self):
        if not self.is_open:
            raise ClosedFileError("{0} file is not open!".format(self._path))

    def _validate_format(self, format, kwargs):
        """ validate / deprecate formats; return the new kwargs """
        kwargs = kwargs.copy()

        # validate
        try:
            kwargs['format'] = _FORMAT_MAP[format.lower()]
        except KeyError:
            raise TypeError("invalid HDFStore format specified [{0}]"
                            .format(format))

        return kwargs

    def _create_storer(self, group, format=None, value=None, append=False,
                       **kwargs):
        """ return a suitable class to operate """

        def error(t):
            raise TypeError(
                "cannot properly create the storer for: [{t}] [group->"
                "{group},value->{value},format->{format},append->{append},"
                "kwargs->{kwargs}]".format(t=t, group=group,
                                           value=type(value), format=format,
                                           append=append, kwargs=kwargs))

        pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))
        tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))

        # infer the pt from the passed value
        if pt is None:
            if value is None:
                _tables()
                if (getattr(group, 'table', None) or
                        isinstance(group, _table_mod.table.Table)):
                    pt = u'frame_table'
                    tt = u'generic_table'
                else:
                    raise TypeError(
                        "cannot create a storer if the object is not existing "
                        "nor a value is passed")
            else:
                try:
                    pt = _TYPE_MAP[type(value)]
                except KeyError:
                    error('_TYPE_MAP')

                # we are actually a table
                if format == 'table':
                    pt += u'_table'

        # a storer node
        if u'table' not in pt:
            try:
                return globals()[_STORER_MAP[pt]](self, group, **kwargs)
            except KeyError:
                error('_STORER_MAP')

        # existing node (and must be a table)
        if tt is None:

            # if we are a writer, determine the tt
            if value is not None:

                if pt == u'series_table':
                    index = getattr(value, 'index', None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = u'appendable_series'
                        elif index.nlevels > 1:
                            tt = u'appendable_multiseries'
                elif pt == u'frame_table':
                    index = getattr(value, 'index', None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = u'appendable_frame'
                        elif index.nlevels > 1:
                            tt = u'appendable_multiframe'
                elif pt == u'wide_table':
                    tt = u'appendable_panel'
                elif pt == u'ndim_table':
                    tt = u'appendable_ndim'

            else:

                # distinguish between a frame/table
                tt = u'legacy_panel'
                try:
                    fields = group.table._v_attrs.fields
                    if len(fields) == 1 and fields[0] == u'value':
                        tt = u'legacy_frame'
                except IndexError:
                    pass

        try:
            return globals()[_TABLE_MAP[tt]](self, group, **kwargs)
        except KeyError:
            error('_TABLE_MAP')

    def _write_to_group(self, key, value, format, index=True, append=False,
                        complib=None, encoding=None, **kwargs):
        group = self.get_node(key)

        # remove the node if we are not appending
        if group is not None and not append:
            self._handle.remove_node(group, recursive=True)
            group = None

        # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
        if getattr(value, 'empty', None) and (format == 'table' or append):
            return

        if group is None:
            paths = key.split('/')

            # recursively create the groups
            path = '/'
            for p in paths:
                if not len(p):
                    continue
                new_path = path
                if not path.endswith('/'):
                    new_path += '/'
                new_path += p
                group = self.get_node(new_path)
                if group is None:
                    group = self._handle.create_group(path, p)
                path = new_path

        s = self._create_storer(group, format, value, append=append,
                                encoding=encoding, **kwargs)
        if append:
            # raise if we are trying to append to a Fixed format,
            # or a table that exists (and we are putting)
            if (not s.is_table or
                    (s.is_table and format == 'fixed' and s.is_exists)):
                raise ValueError('Can only append to Tables')
            if not s.is_exists:
                s.set_object_info()
        else:
            s.set_object_info()

        if not s.is_table and complib:
            raise ValueError(
                'Compression not supported on Fixed format stores'
            )

        # write the object
        s.write(obj=value, append=append, complib=complib, **kwargs)

        if s.is_table and index:
            s.create_index(columns=index)

    def _read_group(self, group, **kwargs):
        s = self._create_storer(group)
        s.infer_axes()
        return s.read(**kwargs)


class TableIterator(object):

    """ define the iteration interface on a table

    Parameters
    ----------
    store : the reference store
    s : the referred storer
    func : the function to execute the query
    where : the where of the query
    nrows : the rows to iterate on
    start : the passed start value (default is None)
    stop : the passed stop value (default is None)
    iterator : boolean, whether to use the default iterator
    chunksize : the passed chunking value (default is 100000)
    auto_close : boolean, automatically close the store at the end of
        iteration, default is False
    kwargs : the passed kwargs
    """

    def __init__(self, store, s, func, where, nrows, start=None, stop=None,
                 iterator=False, chunksize=None, auto_close=False):
        self.store = store
        self.s = s
        self.func = func
        self.where = where

        # set start/stop if they are not set if we are a table
        if self.s.is_table:
            if nrows is None:
                nrows = 0
            if start is None:
                start = 0
            if stop is None:
                stop = nrows
            stop = min(nrows, stop)

        self.nrows = nrows
        self.start = start
        self.stop = stop

        self.coordinates = None
        if iterator or chunksize is not None:
            if chunksize is None:
                chunksize = 100000
            self.chunksize = int(chunksize)
        else:
            self.chunksize = None

        self.auto_close = auto_close

    def __iter__(self):

        # iterate
        current = self.start
        while current < self.stop:

            stop = min(current + self.chunksize, self.stop)
            value = self.func(None, None, self.coordinates[current:stop])
            current = stop
            if value is None or not len(value):
                continue

            yield value

        self.close()

    def close(self):
        if self.auto_close:
            self.store.close()

    def get_result(self, coordinates=False):

        # return the actual iterator
        if self.chunk…