
/pandas/io/pytables.py

http://github.com/pydata/pandas
Possible License(s): BSD-3-Clause, Apache-2.0


"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
from __future__ import annotations

from contextlib import suppress
import copy
from datetime import (
    date,
    tzinfo,
)
import itertools
import os
import re
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Sequence,
    cast,
)
import warnings

import numpy as np

from pandas._config import (
    config,
    get_option,
)

from pandas._libs import (
    lib,
    writers as libwriters,
)
from pandas._libs.tslibs import timezones
from pandas._typing import (
    ArrayLike,
    DtypeArg,
    Shape,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
    ensure_object,
    is_categorical_dtype,
    is_complex_dtype,
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_extension_array_dtype,
    is_list_like,
    is_string_dtype,
    is_timedelta64_dtype,
    needs_i8_conversion,
)
from pandas.core.dtypes.missing import array_equivalent

from pandas import (
    DataFrame,
    DatetimeIndex,
    Index,
    MultiIndex,
    PeriodIndex,
    Series,
    TimedeltaIndex,
    concat,
    isna,
)
from pandas.core.api import Int64Index
from pandas.core.arrays import (
    Categorical,
    DatetimeArray,
    PeriodArray,
)
import pandas.core.common as com
from pandas.core.computation.pytables import (
    PyTablesExpr,
    maybe_expression,
)
from pandas.core.construction import extract_array
from pandas.core.indexes.api import ensure_index
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
)

from pandas.io.common import stringify_path
from pandas.io.formats.printing import (
    adjoin,
    pprint_thing,
)

if TYPE_CHECKING:
    from tables import (
        Col,
        File,
        Node,
    )

    from pandas.core.internals import Block


# versioning attribute
_version = "0.15.2"

# encoding
_default_encoding = "UTF-8"


def _ensure_decoded(s):
    """if we have bytes, decode them to unicode"""
    if isinstance(s, np.bytes_):
        s = s.decode("UTF-8")
    return s


def _ensure_encoding(encoding):
    # set the encoding if we need
    if encoding is None:
        encoding = _default_encoding

    return encoding


def _ensure_str(name):
    """
    Ensure that an index / column name is a str (python 3); otherwise they
    may be np.string dtype. Non-string dtypes are passed through unchanged.

    https://github.com/pandas-dev/pandas/issues/13492
    """
    if isinstance(name, str):
        name = str(name)
    return name


Term = PyTablesExpr
def _ensure_term(where, scope_level: int):
    """
    Ensure that the where is a Term or a list of Term.

    This makes sure that we are capturing the scope of variables that are
    passed; create the terms here with a frame_level=2 (we are 2 levels down).
    """
    # only consider list/tuple here as an ndarray is automatically a coordinate
    # list
    level = scope_level + 1
    if isinstance(where, (list, tuple)):
        where = [
            Term(term, scope_level=level + 1) if maybe_expression(term) else term
            for term in where
            if term is not None
        ]
    elif maybe_expression(where):
        where = Term(where, scope_level=level)
    return where if where is None or len(where) else None
class PossibleDataLossError(Exception):
    pass


class ClosedFileError(Exception):
    pass


class IncompatibilityWarning(Warning):
    pass


incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""


class AttributeConflictWarning(Warning):
    pass


attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""


class DuplicateWarning(Warning):
    pass


duplicate_doc = """
duplicate entries in table, taking most recently appended
"""

performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}

# axes map
_AXES_MAP = {DataFrame: [0]}

# register our configuration options
dropna_doc = """
: boolean
    drop ALL nan rows when appending to a table
"""
format_doc = """
: format
    default format writing format, if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix("io.hdf"):
    config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
    config.register_option(
        "default_format",
        None,
        format_doc,
        validator=config.is_one_of_factory(["fixed", "table", None]),
    )
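
# Usage sketch (illustrative, not part of the original module): once registered,
# these options are settable through the standard pandas options API, e.g.
#   pd.set_option("io.hdf.default_format", "table")
#   pd.set_option("io.hdf.dropna_table", True)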
# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False


def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables

        _table_mod = tables

        # set the file open policy
        # return the file open policy; this changes as of pytables 3.1
        # depending on the HDF5 version
        with suppress(AttributeError):
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == "strict"
            )

    return _table_mod


# interface to/from ###


def to_hdf(
    path_or_buf,
    key: str,
    value: DataFrame | Series,
    mode: str = "a",
    complevel: int | None = None,
    complib: str | None = None,
    append: bool = False,
    format: str | None = None,
    index: bool = True,
    min_itemsize: int | dict[str, int] | None = None,
    nan_rep=None,
    dropna: bool | None = None,
    data_columns: bool | list[str] | None = None,
    errors: str = "strict",
    encoding: str = "UTF-8",
) -> None:
    """store this object, close it if we opened it"""
    if append:
        f = lambda store: store.append(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            dropna=dropna,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
        )
    else:
        # NB: dropna is not passed to `put`
        f = lambda store: store.put(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
            dropna=dropna,
        )

    path_or_buf = stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        with HDFStore(
            path_or_buf, mode=mode, complevel=complevel, complib=complib
        ) as store:
            f(store)
    else:
        f(path_or_buf)
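
# Usage sketch (illustrative): round-tripping a frame through these helpers,
# assuming write access to "store.h5":
#   df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
#   df.to_hdf("store.h5", key="df", mode="w", format="table")
#   back = pd.read_hdf("store.h5", "df")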
def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: int | None = None,
    stop: int | None = None,
    columns=None,
    iterator=False,
    chunksize: int | None = None,
    **kwargs,
):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only supports the local file system,
        remote URLs and file-like objects are not supported.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, default 'r'
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of columns names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    item : object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    DataFrame.to_hdf : Write a HDF file from a DataFrame.
    HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
    >>> df.to_hdf('./store.h5', 'data')
    >>> reread = pd.read_hdf('./store.h5')
    """
    if mode not in ["r", "r+", "a"]:
        raise ValueError(
            f"mode {mode} is not allowed while performing a read. "
            f"Allowed modes are r, r+ and a."
        )
    # grab the scope
    if where is not None:
        where = _ensure_term(where, scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise OSError("The HDFStore must be open for reading.")

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = stringify_path(path_or_buf)
        if not isinstance(path_or_buf, str):
            raise NotImplementedError(
                "Support for generic buffers has not been implemented."
            )
        try:
            exists = os.path.exists(path_or_buf)
        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise FileNotFoundError(f"File {path_or_buf} does not exist")

        store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError(
                    "Dataset(s) incompatible with Pandas data types, "
                    "not table, or no datasets found in HDF5 file."
                )
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError(
                        "key must be provided when HDF5 "
                        "file contains multiple datasets."
                    )
            key = candidate_only_group._v_pathname
        return store.select(
            key,
            where=where,
            start=start,
            stop=stop,
            columns=columns,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )
    except (ValueError, TypeError, KeyError):
        if not isinstance(path_or_buf, HDFStore):
            # if there is an error, close the store if we opened it.
            with suppress(AttributeError):
                store.close()

        raise


def _is_metadata_of(group: Node, parent_group: Node) -> bool:
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == "meta":
            return True
        current = current._v_parent
    return False
class HDFStore:
    """
    Dict-like IO interface for storing pandas objects in PyTables.

    Either Fixed or Table format.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path : str
        File path to HDF5 file.
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 or None disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available issues
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum.
    **kwargs
        These parameters will be passed to the PyTables open_file method.

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()

    **Create or load HDF5 file in-memory**

    When passing the `driver` option to the PyTables open_file method through
    **kwargs, the HDF5 file is loaded or created in-memory and will only be
    written when closed:

    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
    >>> store['foo'] = bar
    >>> store.close()   # only now, data is written to disk
    """

    _handle: File | None
    _mode: str
    _complevel: int
    _fletcher32: bool

    def __init__(
        self,
        path,
        mode: str = "a",
        complevel: int | None = None,
        complib=None,
        fletcher32: bool = False,
        **kwargs,
    ):
        if "format" in kwargs:
            raise ValueError("format is not a defined argument for HDFStore")

        tables = import_optional_dependency("tables")

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                f"complib only supports {tables.filters.all_complibs} compression."
            )

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = stringify_path(path)
        if mode is None:
            mode = "a"
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self):
        return self._path

    @property
    def root(self):
        """return the root node"""
        self._check_if_open()
        assert self._handle is not None  # for mypy
        return self._handle.root

    @property
    def filename(self):
        return self._path

    def __getitem__(self, key: str):
        return self.get(key)

    def __setitem__(self, key: str, value):
        self.put(key, value)

    def __delitem__(self, key: str):
        return self.remove(key)

    def __getattr__(self, name: str):
        """allow attribute access to get stores"""
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )
    def __contains__(self, key: str) -> bool:
        """
        check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if name == key or name[1:] == key:
                return True
        return False
    def __len__(self) -> int:
        return len(self.groups())

    def __repr__(self) -> str:
        pstr = pprint_thing(self._path)
        return f"{type(self)}\nFile path: {pstr}\n"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
    def keys(self, include: str = "pandas") -> list[str]:
        """
        Return a list of keys corresponding to objects stored in HDFStore.

        Parameters
        ----------
        include : str, default 'pandas'
            When include equals 'pandas' return pandas objects.
            When include equals 'native' return native HDF5 Table objects.

            .. versionadded:: 1.1.0

        Returns
        -------
        list
            List of ABSOLUTE path-names (e.g. have the leading '/').

        Raises
        ------
        raises ValueError if include has an illegal value
        """
        if include == "pandas":
            return [n._v_pathname for n in self.groups()]

        elif include == "native":
            assert self._handle is not None  # mypy
            return [
                n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
            ]
        raise ValueError(
            f"`include` should be either 'pandas' or 'native' but is '{include}'"
        )
    def __iter__(self):
        return iter(self.keys())

    def items(self):
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    iteritems = items

    def open(self, mode: str = "a", **kwargs):
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        **kwargs
            These parameters will be passed to the PyTables open_file method.
        """
        tables = _tables()

        if self._mode != mode:
            # if we are changing a write mode to read, ok
            if self._mode in ["a", "w"] and mode in ["r", "r+"]:
                pass
            elif mode in ["w"]:
                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        f"Re-opening the file [{self._path}] with mode [{self._mode}] "
                        "will delete the current file!"
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(
                self._complevel, self._complib, fletcher32=self._fletcher32
            )

        if _table_file_open_policy_is_strict and self.is_open:
            msg = (
                "Cannot open HDF5 file, which is already opened, "
                "even in read-only mode."
            )
            raise ValueError(msg)

        self._handle = tables.open_file(self._path, self._mode, **kwargs)

    def close(self):
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self) -> bool:
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync: bool = False):
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                with suppress(OSError):
                    os.fsync(self._handle.fileno())

    def get(self, key: str):
        """
        Retrieve pandas object stored in file.

        Parameters
        ----------
        key : str

        Returns
        -------
        object
            Same type as object stored in file.
        """
        with patch_pickle():
            # GH#31167 Without this patch, pickle doesn't know how to unpickle
            # old DateOffset objects now that they are cdef classes.
            group = self.get_node(key)
            if group is None:
                raise KeyError(f"No object named {key} in the file")
            return self._read_group(group)

    def select(
        self,
        key: str,
        where=None,
        start=None,
        stop=None,
        columns=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas object stored in file, optionally based on where criteria.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
            Object being retrieved from file.
        where : list or None
            List of Term (or convertible) objects, optional.
        start : int or None
            Row number to start selection.
        stop : int, default None
            Row number to stop selection.
        columns : list or None
            A list of columns that if not None, will limit the return columns.
        iterator : bool or False
            Returns an iterator.
        chunksize : int or None
            Number of rows to include in iteration, return an iterator.
        auto_close : bool or False
            Should automatically close the store when finished.

        Returns
        -------
        object
            Retrieved object from file.
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        # create the storer and axes
        where = _ensure_term(where, scope_level=1)
        s = self._create_storer(group)
        s.infer_axes()

        # function to call on iteration
        def func(_start, _stop, _where):
            return s.read(start=_start, stop=_stop, where=_where, columns=columns)

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=s.nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result()
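
    # Usage sketch (illustrative): selecting a filtered subset from a
    # table-format node, assuming "df" was written with format="table" and
    # data_columns=["B"]:
    #   with pd.HDFStore("store.h5") as store:
    #       subset = store.select("df", where="B > 0", columns=["A", "B"])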
    def select_as_coordinates(
        self,
        key: str,
        where=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return the selection as an Index

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        """
        where = _ensure_term(where, scope_level=1)
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_coordinates with a table")
        return tbl.read_coordinates(where=where, start=start, stop=stop)

    def select_column(
        self,
        key: str,
        column: str,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return a single column from the table. This is generally only useful to
        select an indexable

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        column : str
            The column of interest.
        start : int or None, default None
        stop : int or None, default None

        Raises
        ------
        raises KeyError if the column is not found (or key is not a valid
            store)
        raises ValueError if the column can not be extracted individually (it
            is part of a data block)
        """
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_column with a table")
        return tbl.read_column(column=column, start=start, stop=stop)

    def select_as_multiple(
        self,
        keys,
        where=None,
        selector=None,
        columns=None,
        start=None,
        stop=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas objects from multiple tables.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        keys : a list of the tables
        selector : the table to apply the where criteria (defaults to keys[0]
            if not supplied)
        columns : the columns I want back
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        iterator : bool, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator
        auto_close : bool, default False
            Should automatically close the store when finished.

        Raises
        ------
        raises KeyError if keys or selector is not found or keys is empty
        raises TypeError if keys is not a list or tuple
        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
        """
        # default to single select
        where = _ensure_term(where, scope_level=1)
        if isinstance(keys, (list, tuple)) and len(keys) == 1:
            keys = keys[0]
        if isinstance(keys, str):
            return self.select(
                key=keys,
                where=where,
                columns=columns,
                start=start,
                stop=stop,
                iterator=iterator,
                chunksize=chunksize,
                auto_close=auto_close,
            )

        if not isinstance(keys, (list, tuple)):
            raise TypeError("keys must be a list/tuple")

        if not len(keys):
            raise ValueError("keys must have a non-zero length")

        if selector is None:
            selector = keys[0]

        # collect the tables
        tbls = [self.get_storer(k) for k in keys]
        s = self.get_storer(selector)

        # validate rows
        nrows = None
        for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
            if t is None:
                raise KeyError(f"Invalid table [{k}]")
            if not t.is_table:
                raise TypeError(
                    f"object [{t.pathname}] is not a table, and cannot be used in all "
                    "select as multiple"
                )

            if nrows is None:
                nrows = t.nrows
            elif t.nrows != nrows:
                raise ValueError("all tables must have exactly the same nrows!")

        # The isinstance checks here are redundant with the check above,
        # but necessary for mypy; see GH#29757
        _tbls = [x for x in tbls if isinstance(x, Table)]
        # axis is the concatenation axis
        axis = list({t.non_index_axes[0][0] for t in _tbls})[0]

        def func(_start, _stop, _where):
            # retrieve the objs, _where is always passed as a set of
            # coordinates here
            objs = [
                t.read(where=_where, columns=columns, start=_start, stop=_stop)
                for t in tbls
            ]

            # concat and return
            return concat(objs, axis=axis, verify_integrity=False)._consolidate()

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result(coordinates=True)
    def put(
        self,
        key: str,
        value: DataFrame | Series,
        format=None,
        index=True,
        append=False,
        complib=None,
        complevel: int | None = None,
        min_itemsize: int | dict[str, int] | None = None,
        nan_rep=None,
        data_columns: list[str] | None = None,
        encoding=None,
        errors: str = "strict",
        track_times: bool = True,
        dropna: bool = False,
    ):
        """
        Store object in HDFStore.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'fixed(f)|table(t)', default is 'fixed'
            Format to use when storing object in HDFStore. Value can be one of:

            ``'fixed'``
                Fixed format. Fast writing/reading. Not-appendable, nor searchable.
            ``'table'``
                Table format. Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching / selecting
                subsets of the data.
        append : bool, default False
            This will force Table format, append the input data to the existing.
        data_columns : list, default None
            List of columns to create as data columns, or True to use all columns.
            See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        encoding : str, default None
            Provide an encoding for strings.
        track_times : bool, default True
            Parameter is propagated to the 'create_table' method of 'PyTables'.
            If set to False it enables having the same h5 files (same hashes)
            independent of creation time.

            .. versionadded:: 1.1.0
        """
        if format is None:
            format = get_option("io.hdf.default_format") or "fixed"
        format = self._validate_format(format)
        self._write_to_group(
            key,
            value,
            format=format,
            index=index,
            append=append,
            complib=complib,
            complevel=complevel,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            encoding=encoding,
            errors=errors,
            track_times=track_times,
            dropna=dropna,
        )
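
    # Usage sketch (illustrative): storing a frame in queryable table format,
    # assuming an open store:
    #   store.put("df", df, format="table", data_columns=["B"])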
    def remove(self, key: str, where=None, start=None, stop=None):
        """
        Remove pandas object partially by specifying the where condition

        Parameters
        ----------
        key : str
            Node to remove or delete rows from
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection

        Returns
        -------
        number of rows removed (or None if not a Table)

        Raises
        ------
        raises KeyError if key is not a valid store
        """
        where = _ensure_term(where, scope_level=1)
        try:
            s = self.get_storer(key)
        except KeyError:
            # the key is not a valid store, re-raising KeyError
            raise
        except AssertionError:
            # surface any assertion errors for e.g. debugging
            raise
        except Exception as err:
            # In tests we get here with ClosedFileError, TypeError, and
            # _table_mod.NoSuchNodeError. TODO: Catch only these?

            if where is not None:
                raise ValueError(
                    "trying to remove a node with a non-None where clause!"
                ) from err

            # we are actually trying to remove a node (with children)
            node = self.get_node(key)
            if node is not None:
                node._f_remove(recursive=True)
                return None

        # remove the node
        if com.all_none(where, start, stop):
            s.group._f_remove(recursive=True)

        # delete from the table
        else:
            if not s.is_table:
                raise ValueError(
                    "can only remove with where on objects written as tables"
                )
            return s.delete(where=where, start=start, stop=stop)

    def append(
        self,
        key: str,
        value: DataFrame | Series,
        format=None,
        axes=None,
        index=True,
        append=True,
        complib=None,
        complevel: int | None = None,
        columns=None,
        min_itemsize: int | dict[str, int] | None = None,
        nan_rep=None,
        chunksize=None,
        expectedrows=None,
        dropna: bool | None = None,
        data_columns: list[str] | None = None,
        encoding=None,
        errors: str = "strict",
    ):
        """
        Append to Table in file. Node must already exist and be Table
        format.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'table' is the default
            Format to use when storing object in HDFStore. Value can be one of:

            ``'table'``
                Table format. Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching / selecting
                subsets of the data.
        append : bool, default True
            Append the input data to the existing.
        data_columns : list of columns, or True, default None
            List of columns to create as indexed data columns for on-disk
            queries, or True to use all columns. By default only the axes
            of the object are indexed. See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        min_itemsize : dict of columns that specify minimum str sizes
        nan_rep : str to use as str nan representation
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for str
        dropna : bool, default False
            Do not write an ALL nan row to the store settable
            by the option 'io.hdf.dropna_table'.

        Notes
        -----
        Does *not* check if data being appended overlaps with existing
        data in the table, so be careful
        """
        if columns is not None:
            raise TypeError(
                "columns is not a supported keyword in append, try data_columns"
            )

        if dropna is None:
            dropna = get_option("io.hdf.dropna_table")
        if format is None:
            format = get_option("io.hdf.default_format") or "table"
        format = self._validate_format(format)
        self._write_to_group(
            key,
            value,
            format=format,
            axes=axes,
            index=index,
            append=append,
            complib=complib,
            complevel=complevel,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            chunksize=chunksize,
            expectedrows=expectedrows,
            dropna=dropna,
            data_columns=data_columns,
            encoding=encoding,
            errors=errors,
        )
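
    # Usage sketch (illustrative): building a table incrementally from chunks,
    # assuming each chunk shares the same columns and dtypes:
    #   for chunk in chunks:
    #       store.append("df", chunk, data_columns=True)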
    def append_to_multiple(
        self,
        d: dict,
        value,
        selector,
        data_columns=None,
        axes=None,
        dropna=False,
        **kwargs,
    ):
        """
        Append to multiple tables

        Parameters
        ----------
        d : a dict of table_name to table_columns, None is acceptable as the
            values of one node (this will get all the remaining columns)
        value : a pandas object
        selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
            passed, in which case these are used
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
            row in each table has all NaN. Default False.

        Notes
        -----
        axes parameter is currently not accepted
        """
        if axes is not None:
            raise TypeError(
                "axes is currently not accepted as a parameter to append_to_multiple; "
                "you can create the tables independently instead"
            )

        if not isinstance(d, dict):
            raise ValueError(
                "append_to_multiple must have a dictionary specified as the "
                "way to split the value"
            )

        if selector not in d:
            raise ValueError(
                "append_to_multiple requires a selector that is in passed dict"
            )

        # figure out the splitting axis (the non_index_axis)
        axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]

        # figure out how to split the value
        remain_key = None
        remain_values: list = []
        for k, v in d.items():
            if v is None:
                if remain_key is not None:
                    raise ValueError(
                        "append_to_multiple can only have one value in d that is None"
                    )
                remain_key = k
            else:
                remain_values.extend(v)
        if remain_key is not None:
            ordered = value.axes[axis]
            ordd = ordered.difference(Index(remain_values))
            ordd = sorted(ordered.get_indexer(ordd))
            d[remain_key] = ordered.take(ordd)

        # data_columns
        if data_columns is None:
            data_columns = d[selector]

        # ensure rows are synchronized across the tables
        if dropna:
            idxs = (value[cols].dropna(how="all").index for cols in d.values())
            valid_index = next(idxs)
            for index in idxs:
                valid_index = valid_index.intersection(index)
            value = value.loc[valid_index]

        min_itemsize = kwargs.pop("min_itemsize", None)

        # append
        for k, v in d.items():
            dc = data_columns if k == selector else None

            # compute the val
            val = value.reindex(v, axis=axis)

            filtered = (
                {key: value for (key, value) in min_itemsize.items() if key in v}
                if min_itemsize is not None
                else None
            )
            self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
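
    # Usage sketch (illustrative): splitting one frame across two tables, with
    # "t1" acting as the selector and "t2" taking the remaining columns:
    #   store.append_to_multiple({"t1": ["A", "B"], "t2": None}, df, selector="t1")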
    def create_table_index(
        self,
        key: str,
        columns=None,
        optlevel: int | None = None,
        kind: str | None = None,
    ):
        """
        Create a pytables index on the table.

        Parameters
        ----------
        key : str
        columns : None, bool, or listlike[str]
            Indicate which columns to create an index on.

            * False : Do not create any indexes.
            * True : Create indexes on all columns.
            * None : Create indexes on all columns.
            * listlike : Create indexes on the given columns.

        optlevel : int or None, default None
            Optimization level, if None, pytables defaults to 6.
        kind : str or None, default None
            Kind of index, if None, pytables defaults to "medium".

        Raises
        ------
        TypeError: raises if the node is not a table
        """
        # version requirements
        _tables()
        s = self.get_storer(key)
        if s is None:
            return

        if not isinstance(s, Table):
            raise TypeError("cannot create table index on a Fixed format store")
        s.create_index(columns=columns, optlevel=optlevel, kind=kind)
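
    # Usage sketch (illustrative): building a full-precision index on one data
    # column of an existing table node:
    #   store.create_table_index("df", columns=["B"], optlevel=9, kind="full")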
    def groups(self):
        """
        Return a list of all the top-level nodes.

        Each node returned is not a pandas storage object.

        Returns
        -------
        list
            List of objects.
        """
        _tables()
        self._check_if_open()
        assert self._handle is not None  # for mypy
        assert _table_mod is not None  # for mypy

        return [
            g
            for g in self._handle.walk_groups()
            if (
                not isinstance(g, _table_mod.link.Link)
                and (
                    getattr(g._v_attrs, "pandas_type", None)
                    or getattr(g, "table", None)
                    or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
                )
            )
        ]

    def walk(self, where="/"):
        """
        Walk the pytables group hierarchy for pandas objects.

        This generator will yield the group path, subgroups and pandas object
        names for each group.

        Any non-pandas PyTables objects that are not a group will be ignored.

        The `where` group itself is listed first (preorder), then each of its
        child groups (following an alphanumerical order) is also traversed,
        following the same procedure.

        Parameters
        ----------
        where : str, default "/"
            Group where to start walking.

        Yields
        ------
        path : str
            Full path to a group (without trailing '/').
        groups : list
            Names (strings) of the groups contained in `path`.
        leaves : list
            Names (strings) of the pandas objects contained in `path`.
        """
        _tables()
        self._check_if_open()
        assert self._handle is not None  # for mypy
        assert _table_mod is not None  # for mypy

        for g in self._handle.walk_groups(where):
            if getattr(g._v_attrs, "pandas_type", None) is not None:
                continue

            groups = []
            leaves = []
            for child in g._v_children.values():
                pandas_type = getattr(child._v_attrs, "pandas_type", None)
                if pandas_type is None:
                    if isinstance(child, _table_mod.group.Group):
                        groups.append(child._v_name)
                else:
                    leaves.append(child._v_name)

            yield (g._v_pathname.rstrip("/"), groups, leaves)
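
    # Usage sketch (illustrative): listing every pandas object grouped by the
    # HDF5 group that contains it:
    #   for path, subgroups, leaves in store.walk():
    #       for leaf in leaves:
    #           print(f"{path}/{leaf}")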
    def get_node(self, key: str) -> Node | None:
        """return the node with the key or None if it does not exist"""
        self._check_if_open()
        if not key.startswith("/"):
            key = "/" + key

        assert self._handle is not None
        assert _table_mod is not None  # for mypy
        try:
            node = self._handle.get_node(self.root, key)
        except _table_mod.exceptions.NoSuchNodeError:
            return None

        assert isinstance(node, _table_mod.Node), type(node)
        return node

    def get_storer(self, key: str) -> GenericFixed | Table:
        """return the storer object for a key, raise if not in the file"""
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        s = self._create_storer(group)
        s.infer_axes()
        return s

    def copy(
        self,
        file,
        mode="w",
        propindexes: bool = True,
        keys=None,
        complib=None,
        complevel: int | None = None,
        fletcher32: bool = False,
        overwrite=True,
    ):
        """
        Copy the existing store to a new file, updating in place.

        Parameters
        ----------
        propindexes : bool, default True
            Restore indexes in copied file.
        keys : list, optional
            List of keys to include in the copy (defaults to all).
        overwrite : bool, default True
            Whether to overwrite (remove and replace) existing nodes in the new store.
        mode, complib, complevel, fletcher32 same as in HDFStore.__init__

        Returns
        -------
        open file handle of the new store
        """
        new_store = HDFStore(
            file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
        )
        if keys is None:
            keys = list(self.keys())
        if not isinstance(keys, (tuple, list)):
            keys = [keys]
        for k in keys:
            s = self.get_storer(k)
            if s is not None:
                if k in new_store:
                    if overwrite:
                        new_store.remove(k)

                data = self.select(k)
                if isinstance(s, Table):
                    index: bool | list[str] = False
                    if propindexes:
                        index = [a.name for a in s.axes if a.is_indexed]
                    new_store.append(
                        k,
                        data,
                        index=index,
                        data_columns=getattr(s, "data_columns", None),
                        encoding=s.encoding,
                    )
                else:
                    new_store.put(k, data, encoding=s.encoding)

        return new_store
    def info(self) -> str:
        """
        Print detailed information on the store.

        Returns
        -------
        str
        """
        path = pprint_thing(self._path)
        output = f"{type(self)}\nFile path: {path}\n"

        if self.is_open:
            lkeys = sorted(self.keys())
            if len(lkeys):
                keys = []
                values = []

                for k in lkeys:
                    try:
                        s = self.get_storer(k)
                        if s is not None:
                            keys.append(pprint_thing(s.pathname or k))
                            values.append(pprint_thing(s or "invalid_HDFStore node"))
                    except AssertionError:
                        # surface any assertion errors for e.g. debugging
                        raise
                    except Exception as detail:
                        keys.append(k)
                        dstr = pprint_thing(detail)
                        values.append(f"[invalid_HDFStore node: {dstr}]")

                output += adjoin(12, keys, values)
            else:
                output += "Empty"
        else:
            output += "File is CLOSED"

        return output

    # ------------------------------------------------------------------------
    # private methods

    def _check_if_open(self):
        if not self.is_open:
            raise ClosedFileError(f"{self._path} file is not open!")

    def _validate_format(self, format: str) -> str:
        """validate / deprecate formats"""
        # validate
        try:
            format = _FORMAT_MAP[format.lower()]
        except KeyError as err:
            raise TypeError(f"invalid HDFStore format specified [{format}]") from err
        return format

(Listing truncated here; pandas/io/pytables.py continues beyond this point in the full file.)