
/pandas/io/pytables.py

http://github.com/pydata/pandas
"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
from __future__ import annotations

from contextlib import suppress
import copy
from datetime import (
    date,
    tzinfo,
)
import itertools
import os
import re
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Sequence,
    cast,
)
import warnings

import numpy as np

from pandas._config import (
    config,
    get_option,
)

from pandas._libs import (
    lib,
    writers as libwriters,
)
from pandas._libs.tslibs import timezones
from pandas._typing import (
    ArrayLike,
    DtypeArg,
    Shape,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
    ensure_object,
    is_categorical_dtype,
    is_complex_dtype,
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_extension_array_dtype,
    is_list_like,
    is_string_dtype,
    is_timedelta64_dtype,
    needs_i8_conversion,
)
from pandas.core.dtypes.missing import array_equivalent

from pandas import (
    DataFrame,
    DatetimeIndex,
    Index,
    MultiIndex,
    PeriodIndex,
    Series,
    TimedeltaIndex,
    concat,
    isna,
)
from pandas.core.api import Int64Index
from pandas.core.arrays import (
    Categorical,
    DatetimeArray,
    PeriodArray,
)
import pandas.core.common as com
from pandas.core.computation.pytables import (
    PyTablesExpr,
    maybe_expression,
)
from pandas.core.construction import extract_array
from pandas.core.indexes.api import ensure_index
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
)

from pandas.io.common import stringify_path
from pandas.io.formats.printing import (
    adjoin,
    pprint_thing,
)

if TYPE_CHECKING:
    from tables import (
        Col,
        File,
        Node,
    )

    from pandas.core.internals import Block


# versioning attribute
_version = "0.15.2"

# encoding
_default_encoding = "UTF-8"


def _ensure_decoded(s):
    """if we have bytes, decode them to unicode"""
    if isinstance(s, np.bytes_):
        s = s.decode("UTF-8")
    return s


def _ensure_encoding(encoding):
    # set the encoding if we need
    if encoding is None:
        encoding = _default_encoding

    return encoding


def _ensure_str(name):
    """
    Ensure that an index / column name is a str (python 3); otherwise they
    may be np.string dtype. Non-string dtypes are passed through unchanged.

    https://github.com/pandas-dev/pandas/issues/13492
    """
    if isinstance(name, str):
        name = str(name)
    return name


Term = PyTablesExpr


def _ensure_term(where, scope_level: int):
    """
    Ensure that the where is a Term or a list of Terms.

    This makes sure that we are capturing the scope of variables that are
    passed; create the terms here with a frame_level=2 (we are 2 levels down).
    """
    # only consider list/tuple here as an ndarray is automatically a coordinate
    # list
    level = scope_level + 1
    if isinstance(where, (list, tuple)):
        where = [
            Term(term, scope_level=level + 1) if maybe_expression(term) else term
            for term in where
            if term is not None
        ]
    elif maybe_expression(where):
        where = Term(where, scope_level=level)

    return where if where is None or len(where) else None


class PossibleDataLossError(Exception):
    pass


class ClosedFileError(Exception):
    pass


class IncompatibilityWarning(Warning):
    pass


incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""


class AttributeConflictWarning(Warning):
    pass


attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""


class DuplicateWarning(Warning):
    pass


duplicate_doc = """
duplicate entries in table, taking most recently appended
"""

performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}

# axes map
_AXES_MAP = {DataFrame: [0]}

# register our configuration options
dropna_doc = """
: boolean
    drop ALL nan rows when appending to a table
"""
format_doc = """
: format
    default format writing format, if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix("io.hdf"):
    config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
    config.register_option(
        "default_format",
        None,
        format_doc,
        validator=config.is_one_of_factory(["fixed", "table", None]),
    )

# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False


def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables

        _table_mod = tables

        # set the file open policy
        # return the file open policy; this changes as of pytables 3.1
        # depending on the HDF5 version
        with suppress(AttributeError):
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == "strict"
            )

    return _table_mod


# interface to/from ###


def to_hdf(
    path_or_buf,
    key: str,
    value: DataFrame | Series,
    mode: str = "a",
    complevel: int | None = None,
    complib: str | None = None,
    append: bool = False,
    format: str | None = None,
    index: bool = True,
    min_itemsize: int | dict[str, int] | None = None,
    nan_rep=None,
    dropna: bool | None = None,
    data_columns: bool | list[str] | None = None,
    errors: str = "strict",
    encoding: str = "UTF-8",
) -> None:
    """store this object, close it if we opened it"""
    if append:
        f = lambda store: store.append(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            dropna=dropna,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
        )
    else:
        # NB: dropna is not passed to `put`
        f = lambda store: store.put(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
            dropna=dropna,
        )

    path_or_buf = stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        with HDFStore(
            path_or_buf, mode=mode, complevel=complevel, complib=complib
        ) as store:
            f(store)
    else:
        f(path_or_buf)
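

# Illustrative usage sketch (editor's addition, not part of the upstream
# module): shows how the module-level ``to_hdf`` above dispatches to
# ``HDFStore.put`` (overwrite) vs ``HDFStore.append`` (extend a table). The
# file name "example.h5" and key "df" are hypothetical example values.
#
#     import pandas as pd
#
#     df = pd.DataFrame({"a": range(3)})
#     df.to_hdf("example.h5", "df", mode="w", format="table")     # -> store.put
#     df.to_hdf("example.h5", "df", append=True, format="table")  # -> store.append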


def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: int | None = None,
    stop: int | None = None,
    columns=None,
    iterator=False,
    chunksize: int | None = None,
    **kwargs,
):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only supports the local file system,
        remote URLs and file-like objects are not supported.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, default 'r'
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of column names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    item : object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    DataFrame.to_hdf : Write a HDF file from a DataFrame.
    HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
    >>> df.to_hdf('./store.h5', 'data')
    >>> reread = pd.read_hdf('./store.h5')
    """
    if mode not in ["r", "r+", "a"]:
        raise ValueError(
            f"mode {mode} is not allowed while performing a read. "
            f"Allowed modes are r, r+ and a."
        )
    # grab the scope
    if where is not None:
        where = _ensure_term(where, scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise OSError("The HDFStore must be open for reading.")

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = stringify_path(path_or_buf)
        if not isinstance(path_or_buf, str):
            raise NotImplementedError(
                "Support for generic buffers has not been implemented."
            )
        try:
            exists = os.path.exists(path_or_buf)
        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise FileNotFoundError(f"File {path_or_buf} does not exist")

        store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError(
                    "Dataset(s) incompatible with Pandas data types, "
                    "not table, or no datasets found in HDF5 file."
                )
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError(
                        "key must be provided when HDF5 "
                        "file contains multiple datasets."
                    )
            key = candidate_only_group._v_pathname
        return store.select(
            key,
            where=where,
            start=start,
            stop=stop,
            columns=columns,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )
    except (ValueError, TypeError, KeyError):
        if not isinstance(path_or_buf, HDFStore):
            # if there is an error, close the store if we opened it.
            with suppress(AttributeError):
                store.close()

        raise


def _is_metadata_of(group: Node, parent_group: Node) -> bool:
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == "meta":
            return True
        current = current._v_parent
    return False
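

# Illustrative sketch (editor's addition): when the file holds exactly one
# pandas object plus its "meta" subgroups, ``read_hdf`` can infer the key; a
# ``where`` criterion only works for data written in 'table' format. File and
# key names below are hypothetical.
#
#     import pandas as pd
#
#     pd.DataFrame({"x": range(10)}).to_hdf("single.h5", "only_key",
#                                           mode="w", format="table")
#     full = pd.read_hdf("single.h5")               # key inferred
#     part = pd.read_hdf("single.h5", "only_key", where="index < 5")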


class HDFStore:
    """
    Dict-like IO interface for storing pandas objects in PyTables.

    Either Fixed or Table format.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path : str
        File path to HDF5 file.
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 or None disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available issues
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum.
    **kwargs
        These parameters will be passed to the PyTables open_file method.

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()

    **Create or load HDF5 file in-memory**

    When passing the `driver` option to the PyTables open_file method through
    **kwargs, the HDF5 file is loaded or created in-memory and will only be
    written when closed:

    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
    >>> store['foo'] = bar
    >>> store.close()   # only now, data is written to disk
    """

    _handle: File | None
    _mode: str
    _complevel: int
    _fletcher32: bool

    def __init__(
        self,
        path,
        mode: str = "a",
        complevel: int | None = None,
        complib=None,
        fletcher32: bool = False,
        **kwargs,
    ):
        if "format" in kwargs:
            raise ValueError("format is not a defined argument for HDFStore")

        tables = import_optional_dependency("tables")

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                f"complib only supports {tables.filters.all_complibs} compression."
            )

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = stringify_path(path)
        if mode is None:
            mode = "a"
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self):
        return self._path

    @property
    def root(self):
        """return the root node"""
        self._check_if_open()
        assert self._handle is not None  # for mypy
        return self._handle.root

    @property
    def filename(self):
        return self._path

    def __getitem__(self, key: str):
        return self.get(key)

    def __setitem__(self, key: str, value):
        self.put(key, value)

    def __delitem__(self, key: str):
        return self.remove(key)

    def __getattr__(self, name: str):
        """allow attribute access to get stores"""
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )

    def __contains__(self, key: str) -> bool:
        """
        check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if name == key or name[1:] == key:
                return True
        return False

    def __len__(self) -> int:
        return len(self.groups())

    def __repr__(self) -> str:
        pstr = pprint_thing(self._path)
        return f"{type(self)}\nFile path: {pstr}\n"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def keys(self, include: str = "pandas") -> list[str]:
        """
        Return a list of keys corresponding to objects stored in HDFStore.

        Parameters
        ----------
        include : str, default 'pandas'
            When kind equals 'pandas' return pandas objects.
            When kind equals 'native' return native HDF5 Table objects.

            .. versionadded:: 1.1.0

        Returns
        -------
        list
            List of ABSOLUTE path-names (e.g. have the leading '/').

        Raises
        ------
        raises ValueError if kind has an illegal value
        """
        if include == "pandas":
            return [n._v_pathname for n in self.groups()]

        elif include == "native":
            assert self._handle is not None  # mypy
            return [
                n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
            ]
        raise ValueError(
            f"`include` should be either 'pandas' or 'native' but is '{include}'"
        )

    def __iter__(self):
        return iter(self.keys())

    def items(self):
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    iteritems = items

    def open(self, mode: str = "a", **kwargs):
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        **kwargs
            These parameters will be passed to the PyTables open_file method.
        """
        tables = _tables()

        if self._mode != mode:
            # if we are changing a write mode to read, ok
            if self._mode in ["a", "w"] and mode in ["r", "r+"]:
                pass
            elif mode in ["w"]:
                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        f"Re-opening the file [{self._path}] with mode [{self._mode}] "
                        "will delete the current file!"
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(
                self._complevel, self._complib, fletcher32=self._fletcher32
            )

        if _table_file_open_policy_is_strict and self.is_open:
            msg = (
                "Cannot open HDF5 file, which is already opened, "
                "even in read-only mode."
            )
            raise ValueError(msg)

        self._handle = tables.open_file(self._path, self._mode, **kwargs)

    def close(self):
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self) -> bool:
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync: bool = False):
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                with suppress(OSError):
                    os.fsync(self._handle.fileno())

    def get(self, key: str):
        """
        Retrieve pandas object stored in file.

        Parameters
        ----------
        key : str

        Returns
        -------
        object
            Same type as object stored in file.
        """
        with patch_pickle():
            # GH#31167 Without this patch, pickle doesn't know how to unpickle
            # old DateOffset objects now that they are cdef classes.
            group = self.get_node(key)
            if group is None:
                raise KeyError(f"No object named {key} in the file")
            return self._read_group(group)

    def select(
        self,
        key: str,
        where=None,
        start=None,
        stop=None,
        columns=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas object stored in file, optionally based on where criteria.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
            Object being retrieved from file.
        where : list or None
            List of Term (or convertible) objects, optional.
        start : int or None
            Row number to start selection.
        stop : int, default None
            Row number to stop selection.
        columns : list or None
            A list of columns that if not None, will limit the return columns.
        iterator : bool or False
            Returns an iterator.
        chunksize : int or None
            Number of rows to include in iteration, return an iterator.
        auto_close : bool or False
            Should automatically close the store when finished.

        Returns
        -------
        object
            Retrieved object from file.
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        # create the storer and axes
        where = _ensure_term(where, scope_level=1)
        s = self._create_storer(group)
        s.infer_axes()

        # function to call on iteration
        def func(_start, _stop, _where):
            return s.read(start=_start, stop=_stop, where=_where, columns=columns)

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=s.nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result()
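
    # Illustrative sketch (editor's addition): ``select`` only supports
    # ``where``/``columns`` filtering for data written in 'table' format, and a
    # value column must be declared as a data column to be queryable. Names
    # below are hypothetical.
    #
    #     with pd.HDFStore("example.h5", mode="w") as store:
    #         store.put("df", df, format="table", data_columns=["a"])
    #         subset = store.select("df", where="a > 1", columns=["a"])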

    def select_as_coordinates(
        self,
        key: str,
        where=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return the selection as an Index

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        """
        where = _ensure_term(where, scope_level=1)
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_coordinates with a table")
        return tbl.read_coordinates(where=where, start=start, stop=stop)

    def select_column(
        self,
        key: str,
        column: str,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return a single column from the table. This is generally only useful to
        select an indexable

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        column : str
            The column of interest.
        start : int or None, default None
        stop : int or None, default None

        Raises
        ------
        raises KeyError if the column is not found (or key is not a valid
            store)
        raises ValueError if the column can not be extracted individually (it
            is part of a data block)
        """
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_column with a table")
        return tbl.read_column(column=column, start=start, stop=stop)

    def select_as_multiple(
        self,
        keys,
        where=None,
        selector=None,
        columns=None,
        start=None,
        stop=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas objects from multiple tables.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        keys : a list of the tables
        selector : the table to apply the where criteria (defaults to keys[0]
            if not supplied)
        columns : the columns I want back
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        iterator : bool, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator
        auto_close : bool, default False
            Should automatically close the store when finished.

        Raises
        ------
        raises KeyError if keys or selector is not found or keys is empty
        raises TypeError if keys is not a list or tuple
        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
        """
        # default to single select
        where = _ensure_term(where, scope_level=1)
        if isinstance(keys, (list, tuple)) and len(keys) == 1:
            keys = keys[0]
        if isinstance(keys, str):
            return self.select(
                key=keys,
                where=where,
                columns=columns,
                start=start,
                stop=stop,
                iterator=iterator,
                chunksize=chunksize,
                auto_close=auto_close,
            )

        if not isinstance(keys, (list, tuple)):
            raise TypeError("keys must be a list/tuple")

        if not len(keys):
            raise ValueError("keys must have a non-zero length")

        if selector is None:
            selector = keys[0]

        # collect the tables
        tbls = [self.get_storer(k) for k in keys]
        s = self.get_storer(selector)

        # validate rows
        nrows = None
        for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
            if t is None:
                raise KeyError(f"Invalid table [{k}]")
            if not t.is_table:
                raise TypeError(
                    f"object [{t.pathname}] is not a table, and cannot be used in all "
                    "select as multiple"
                )

            if nrows is None:
                nrows = t.nrows
            elif t.nrows != nrows:
                raise ValueError("all tables must have exactly the same nrows!")

        # The isinstance checks here are redundant with the check above,
        # but necessary for mypy; see GH#29757
        _tbls = [x for x in tbls if isinstance(x, Table)]

        # axis is the concatenation axis
        axis = list({t.non_index_axes[0][0] for t in _tbls})[0]

        def func(_start, _stop, _where):
            # retrieve the objs, _where is always passed as a set of
            # coordinates here
            objs = [
                t.read(where=_where, columns=columns, start=_start, stop=_stop)
                for t in tbls
            ]

            # concat and return
            return concat(objs, axis=axis, verify_integrity=False)._consolidate()

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result(coordinates=True)

    def put(
        self,
        key: str,
        value: DataFrame | Series,
        format=None,
        index=True,
        append=False,
        complib=None,
        complevel: int | None = None,
        min_itemsize: int | dict[str, int] | None = None,
        nan_rep=None,
        data_columns: list[str] | None = None,
        encoding=None,
        errors: str = "strict",
        track_times: bool = True,
        dropna: bool = False,
    ):
        """
        Store object in HDFStore.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'fixed(f)|table(t)', default is 'fixed'
            Format to use when storing object in HDFStore. Value can be one of:

            ``'fixed'``
                Fixed format. Fast writing/reading. Not-appendable, nor searchable.
            ``'table'``
                Table format. Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching / selecting
                subsets of the data.
        append : bool, default False
            This will force Table format, append the input data to the existing.
        data_columns : list, default None
            List of columns to create as data columns, or True to use all columns.
            See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        encoding : str, default None
            Provide an encoding for strings.
        track_times : bool, default True
            Parameter is propagated to 'create_table' method of 'PyTables'.
            If set to False it allows generating identical h5 files (same hashes)
            independent of creation time.

            .. versionadded:: 1.1.0
        """
        if format is None:
            format = get_option("io.hdf.default_format") or "fixed"
        format = self._validate_format(format)
        self._write_to_group(
            key,
            value,
            format=format,
            index=index,
            append=append,
            complib=complib,
            complevel=complevel,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            encoding=encoding,
            errors=errors,
            track_times=track_times,
            dropna=dropna,
        )
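
    # Illustrative sketch (editor's addition): dict-style assignment
    # (``store["key"] = df``) routes through ``put`` with the default 'fixed'
    # format; passing ``format="table"`` instead makes the node queryable
    # later. Names below are hypothetical.
    #
    #     with pd.HDFStore("example.h5", mode="w") as store:
    #         store["fixed_df"] = df                      # fixed format
    #         store.put("table_df", df, format="table")   # queryable table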

    def remove(self, key: str, where=None, start=None, stop=None):
        """
        Remove pandas object partially by specifying the where condition

        Parameters
        ----------
        key : str
            Node to remove or delete rows from
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection

        Returns
        -------
        number of rows removed (or None if not a Table)

        Raises
        ------
        raises KeyError if key is not a valid store
        """
        where = _ensure_term(where, scope_level=1)
        try:
            s = self.get_storer(key)
        except KeyError:
            # the key is not a valid store, re-raising KeyError
            raise
        except AssertionError:
            # surface any assertion errors for e.g. debugging
            raise
        except Exception as err:
            # In tests we get here with ClosedFileError, TypeError, and
            # _table_mod.NoSuchNodeError. TODO: Catch only these?

            if where is not None:
                raise ValueError(
                    "trying to remove a node with a non-None where clause!"
                ) from err

            # we are actually trying to remove a node (with children)
            node = self.get_node(key)
            if node is not None:
                node._f_remove(recursive=True)
                return None

        # remove the node
        if com.all_none(where, start, stop):
            s.group._f_remove(recursive=True)

        # delete from the table
        else:
            if not s.is_table:
                raise ValueError(
                    "can only remove with where on objects written as tables"
                )
            return s.delete(where=where, start=start, stop=stop)

    def append(
        self,
        key: str,
        value: DataFrame | Series,
        format=None,
        axes=None,
        index=True,
        append=True,
        complib=None,
        complevel: int | None = None,
        columns=None,
        min_itemsize: int | dict[str, int] | None = None,
        nan_rep=None,
        chunksize=None,
        expectedrows=None,
        dropna: bool | None = None,
        data_columns: list[str] | None = None,
        encoding=None,
        errors: str = "strict",
    ):
        """
        Append to Table in file. Node must already exist and be Table
        format.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'table' is the default
            Format to use when storing object in HDFStore. Value can be one of:

            ``'table'``
                Table format. Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching / selecting
                subsets of the data.
        append : bool, default True
            Append the input data to the existing.
        data_columns : list of columns, or True, default None
            List of columns to create as indexed data columns for on-disk
            queries, or True to use all columns. By default only the axes
            of the object are indexed. See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        min_itemsize : dict of columns that specify minimum str sizes
        nan_rep : str to use as str nan representation
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for str
        dropna : bool, default False
            Do not write an ALL nan row to the store; settable
            by the option 'io.hdf.dropna_table'.

        Notes
        -----
        Does *not* check if data being appended overlaps with existing
        data in the table, so be careful
        """
        if columns is not None:
            raise TypeError(
                "columns is not a supported keyword in append, try data_columns"
            )

        if dropna is None:
            dropna = get_option("io.hdf.dropna_table")
        if format is None:
            format = get_option("io.hdf.default_format") or "table"
        format = self._validate_format(format)
        self._write_to_group(
            key,
            value,
            format=format,
            axes=axes,
            index=index,
            append=append,
            complib=complib,
            complevel=complevel,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            chunksize=chunksize,
            expectedrows=expectedrows,
            dropna=dropna,
            data_columns=data_columns,
            encoding=encoding,
            errors=errors,
        )

    def append_to_multiple(
        self,
        d: dict,
        value,
        selector,
        data_columns=None,
        axes=None,
        dropna=False,
        **kwargs,
    ):
        """
        Append to multiple tables

        Parameters
        ----------
        d : a dict of table_name to table_columns, None is acceptable as the
            values of one node (this will get all the remaining columns)
        value : a pandas object
        selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
            passed, in which case these are used
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
            row in each table has all NaN. Default False.

        Notes
        -----
        axes parameter is currently not accepted
        """
        if axes is not None:
            raise TypeError(
                "axes is currently not accepted as a parameter to append_to_multiple; "
                "you can create the tables independently instead"
            )

        if not isinstance(d, dict):
            raise ValueError(
                "append_to_multiple must have a dictionary specified as the "
                "way to split the value"
            )

        if selector not in d:
            raise ValueError(
                "append_to_multiple requires a selector that is in passed dict"
            )

        # figure out the splitting axis (the non_index_axis)
        axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]

        # figure out how to split the value
        remain_key = None
        remain_values: list = []
        for k, v in d.items():
            if v is None:
                if remain_key is not None:
                    raise ValueError(
                        "append_to_multiple can only have one value in d that is None"
                    )
                remain_key = k
            else:
                remain_values.extend(v)
        if remain_key is not None:
            ordered = value.axes[axis]
            ordd = ordered.difference(Index(remain_values))
            ordd = sorted(ordered.get_indexer(ordd))
            d[remain_key] = ordered.take(ordd)

        # data_columns
        if data_columns is None:
            data_columns = d[selector]

        # ensure rows are synchronized across the tables
        if dropna:
            idxs = (value[cols].dropna(how="all").index for cols in d.values())
            valid_index = next(idxs)
            for index in idxs:
                valid_index = valid_index.intersection(index)
            value = value.loc[valid_index]

        min_itemsize = kwargs.pop("min_itemsize", None)

        # append
        for k, v in d.items():
            dc = data_columns if k == selector else None

            # compute the val
            val = value.reindex(v, axis=axis)

            filtered = (
                {key: value for (key, value) in min_itemsize.items() if key in v}
                if min_itemsize is not None
                else None
            )
            self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
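
    # Illustrative sketch (editor's addition): splitting one frame across two
    # tables and selecting it back with a criterion applied on the selector
    # table. The table names and the column split are hypothetical.
    #
    #     df = pd.DataFrame({"a": range(5), "b": range(5), "c": list("vwxyz")})
    #     with pd.HDFStore("multi.h5", mode="w") as store:
    #         store.append_to_multiple(
    #             {"t1": ["a"], "t2": None}, df, selector="t1", data_columns=["a"]
    #         )
    #         joined = store.select_as_multiple(["t1", "t2"], where="a > 2",
    #                                           selector="t1")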

    def create_table_index(
        self,
        key: str,
        columns=None,
        optlevel: int | None = None,
        kind: str | None = None,
    ):
        """
        Create a pytables index on the table.

        Parameters
        ----------
        key : str
        columns : None, bool, or listlike[str]
            Indicate which columns to create an index on.

            * False : Do not create any indexes.
            * True : Create indexes on all columns.
            * None : Create indexes on all columns.
            * listlike : Create indexes on the given columns.

        optlevel : int or None, default None
            Optimization level, if None, pytables defaults to 6.
        kind : str or None, default None
            Kind of index, if None, pytables defaults to "medium".

        Raises
        ------
        TypeError: raises if the node is not a table
        """
        # version requirements
        _tables()
        s = self.get_storer(key)
        if s is None:
            return

        if not isinstance(s, Table):
            raise TypeError("cannot create table index on a Fixed format store")
        s.create_index(columns=columns, optlevel=optlevel, kind=kind)

    def groups(self):
        """
        Return a list of all the top-level nodes.

        Each node returned is not a pandas storage object.

        Returns
        -------
        list
            List of objects.
        """
        _tables()
        self._check_if_open()
        assert self._handle is not None  # for mypy
        assert _table_mod is not None  # for mypy
        return [
            g
            for g in self._handle.walk_groups()
            if (
                not isinstance(g, _table_mod.link.Link)
                and (
                    getattr(g._v_attrs, "pandas_type", None)
                    or getattr(g, "table", None)
                    or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
                )
            )
        ]

    def walk(self, where="/"):
        """
        Walk the pytables group hierarchy for pandas objects.

        This generator will yield the group path, subgroups and pandas object
        names for each group.

        Any non-pandas PyTables objects that are not a group will be ignored.

        The `where` group itself is listed first (preorder), then each of its
        child groups (following an alphanumerical order) is also traversed,
        following the same procedure.

        Parameters
        ----------
        where : str, default "/"
            Group where to start walking.

        Yields
        ------
        path : str
            Full path to a group (without trailing '/').
        groups : list
            Names (strings) of the groups contained in `path`.
        leaves : list
            Names (strings) of the pandas objects contained in `path`.
        """
        _tables()
        self._check_if_open()
        assert self._handle is not None  # for mypy
        assert _table_mod is not None  # for mypy

        for g in self._handle.walk_groups(where):
            if getattr(g._v_attrs, "pandas_type", None) is not None:
                continue

            groups = []
            leaves = []
            for child in g._v_children.values():
                pandas_type = getattr(child._v_attrs, "pandas_type", None)
                if pandas_type is None:
                    if isinstance(child, _table_mod.group.Group):
                        groups.append(child._v_name)
                else:
                    leaves.append(child._v_name)

            yield (g._v_pathname.rstrip("/"), groups, leaves)
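
    # Illustrative sketch (editor's addition): listing the hierarchy of a
    # store. ``keys()`` gives absolute paths of pandas objects, while
    # ``walk()`` yields (path, subgroups, pandas leaves) per group; the
    # ordering of results is not guaranteed. Keys below are hypothetical.
    #
    #     with pd.HDFStore("nested.h5", mode="w") as store:
    #         store.put("raw/df", df)
    #         store.put("clean/df", df)
    #         print(store.keys())        # e.g. ['/clean/df', '/raw/df']
    #         for path, groups, leaves in store.walk():
    #             print(path, groups, leaves)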

    def get_node(self, key: str) -> Node | None:
        """return the node with the key or None if it does not exist"""
        self._check_if_open()
        if not key.startswith("/"):
            key = "/" + key

        assert self._handle is not None
        assert _table_mod is not None  # for mypy
        try:
            node = self._handle.get_node(self.root, key)
        except _table_mod.exceptions.NoSuchNodeError:
            return None

        assert isinstance(node, _table_mod.Node), type(node)
        return node

    def get_storer(self, key: str) -> GenericFixed | Table:
        """return the storer object for a key, raise if not in the file"""
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        s = self._create_storer(group)
        s.infer_axes()
        return s

    def copy(
        self,
        file,
        mode="w",
        propindexes: bool = True,
        keys=None,
        complib=None,
        complevel: int | None = None,
        fletcher32: bool = False,
        overwrite=True,
    ):
        """
        Copy the existing store to a new file, updating in place.

        Parameters
        ----------
        propindexes : bool, default True
            Restore indexes in copied file.
        keys : list, optional
            List of keys to include in the copy (defaults to all).
        overwrite : bool, default True
            Whether to overwrite (remove and replace) existing nodes in the new store.
        mode, complib, complevel, fletcher32 same as in HDFStore.__init__

        Returns
        -------
        open file handle of the new store
        """
        new_store = HDFStore(
            file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
        )
        if keys is None:
            keys = list(self.keys())
        if not isinstance(keys, (tuple, list)):
            keys = [keys]
        for k in keys:
            s = self.get_storer(k)
            if s is not None:
                if k in new_store:
                    if overwrite:
                        new_store.remove(k)

                data = self.select(k)
                if isinstance(s, Table):
                    index: bool | list[str] = False
                    if propindexes:
                        index = [a.name for a in s.axes if a.is_indexed]
                    new_store.append(
                        k,
                        data,
                        index=index,
                        data_columns=getattr(s, "data_columns", None),
                        encoding=s.encoding,
                    )
                else:
                    new_store.put(k, data, encoding=s.encoding)

        return new_store
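
    # Illustrative sketch (editor's addition): ``copy`` returns the *open*
    # destination store, so close it when done; rewriting a file this way is
    # also the upgrade path suggested by IncompatibilityWarning. File names
    # and compression settings are hypothetical.
    #
    #     with pd.HDFStore("old.h5") as src:
    #         dst = src.copy("new.h5", complib="blosc", complevel=9)
    #         dst.close()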
  1336. def info(self) -> str:
  1337. """
  1338. Print detailed information on the store.
  1339. Returns
  1340. -------
  1341. str
  1342. """
  1343. path = pprint_thing(self._path)
  1344. output = f"{type(self)}\nFile path: {path}\n"
  1345. if self.is_open:
  1346. lkeys = sorted(self.keys())
  1347. if len(lkeys):
  1348. keys = []
  1349. values = []
  1350. for k in lkeys:
  1351. try:
  1352. s = self.get_storer(k)
  1353. if s is not None:
  1354. keys.append(pprint_thing(s.pathname or k))
  1355. values.append(pprint_thing(s or "invalid_HDFStore node"))
  1356. except AssertionError:
  1357. # surface any assertion errors for e.g. debugging
  1358. raise
  1359. except Exception as detail:
  1360. keys.append(k)
  1361. dstr = pprint_thing(detail)
  1362. values.append(f"[invalid_HDFStore node: {dstr}]")
  1363. output += adjoin(12, keys, values)
  1364. else:
  1365. output += "Empty"
  1366. else:
  1367. output += "File is CLOSED"
  1368. return output
  1369. # ------------------------------------------------------------------------
  1370. # private methods
  1371. def _check_if_open(self):
  1372. if not self.is_open:
  1373. raise ClosedFileError(f"{self._path} file is not open!")
  1374. def _validate_format(self, format: str) -> str:
  1375. """validate / deprecate formats"""
  1376. # validate
  1377. try:
  1378. format = _FORMAT_MAP[format.lower()]
  1379. except KeyError as err:
  1380. raise TypeError(f"invalid HDFStore format specified [{format}]") from err
  1381. return format
  1382. def _create_storer(
  1383. self,
  1384. group,
  1385. format=None,
  1386. value: DataFrame | Series | None = None,
  1387. encoding: str = "UTF-8",
  1388. errors: str = "strict",
  1389. ) -> GenericFixed | Table:
  1390. """return a suitable class to operate"""
  1391. cls: type[GenericFixed] | type[Table]
  1392. if value is not None and not isinstance(value, (Series, DataFrame)):
  1393. raise TypeError("value must be None, Series, or DataFrame")
  1394. def error(t):
  1395. # return instead of raising so mypy can tell where we are raising
  1396. return TypeError(
  1397. f"cannot properly create the storer for: [{t}] [group->"
  1398. f"{group},value->{type(value)},format->{format}"
  1399. )
  1400. pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
  1401. tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
  1402. # infer the pt from the passed value
  1403. if pt is None:
  1404. if value is None:
  1405. _tables()
  1406. assert _table_mod is not None # for mypy
  1407. if getattr(group, "table", None) or isinstance(
  1408. group, _table_mod.table.Table
  1409. ):
  1410. pt = "frame_table"
  1411. tt = "generic_table"
  1412. else:
  1413. raise TypeError(
  1414. "cannot create a storer if the object is not existing "
  1415. "nor a value are passed"
  1416. )
  1417. else:
  1418. if isinstance(value, Series):
  1419. pt = "series"
  1420. else:
  1421. pt = "frame"
  1422. # we are actually a table
  1423. if format == "table":
  1424. pt += "_table"
  1425. # a storer node
  1426. if "table" not in pt:
  1427. _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
  1428. try:
  1429. cls = _STORER_MAP[pt]
  1430. except KeyError as err:
  1431. raise error("_STORER_MAP") from err
  1432. return cls(self, group, encoding=encoding, errors=errors)
  1433. # existing node (and must be a table)
  1434. if tt is None:
  1435. # if we are a writer, determine the tt
  1436. if value is not None:
  1437. if pt == "series_table":
  1438. index = getattr(value, "index", None)
  1439. if index is not None:
  1440. if index.nlevels == 1:
  1441. tt = "appendable_series"
  1442. elif index.nlevels > 1:
  1443. tt = "appendable_multiseries"
  1444. elif pt == "frame_table":
  1445. index = getattr(value, "index", None)
  1446. if index is not None:
  1447. if index.nlevels == 1:
  1448. tt = "appendable_frame"
  1449. elif index.nlevels > 1:
  1450. tt = "appendable_multiframe"
  1451. _TABLE_MAP = {
  1452. "generic_table": GenericTable,
  1453. "appendable_series": AppendableSeriesTable,
  1454. "appendable_multiseries": AppendableMultiSeriesTable,
  1455. "appendable_frame": AppendableFrameTable,
  1456. "appendable_multiframe": AppendableMultiFrameTable,
  1457. "worm": WORMTable,
  1458. }
  1459. try:
  1460. cls = _TABLE_MAP[tt]
  1461. except KeyError as err:
  1462. raise error("_TABLE_MAP") from err
  1463. return cls(self, group, encoding=encoding, errors=errors)
  1464. def _write_to_group(
  1465. self,
  1466. key: str,
  1467. value: DataFrame | Series,
  1468. format,
  1469. axes=None,
  1470. index=True,
  1471. append=False,
  1472. complib=None,
  1473. complevel: int | None = None,
  1474. fletcher32=None,
  1475. min_itemsize: int | dict[str, int] | None = None,
  1476. chunksize=None,
  1477. expectedrows=None,
  1478. dropna=False,
  1479. nan_rep=None,
  1480. data_columns=None,
  1481. encoding=None,
  1482. errors: str = "strict",
  1483. track_times: bool = True,
  1484. ) -> None:
  1485. # we don't want to store a table node at all if our object is 0-len
  1486. # as there are not dtypes
  1487. if getattr(value, "empty", None) and (format == "table" or append):
  1488. return
  1489. group = self._identify_group(key, append)
  1490. s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
  1491. if append:
  1492. # raise if we are trying to append to a Fixed format,
  1493. # or a table that exists (and we are putting)
  1494. if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
  1495. raise ValueError("Can only append to Tables")
  1496. if not s.is_exists:
  1497. s.set_object_info()
  1498. else:
  1499. s.set_object_info()
  1500. if not s.is_table and complib:
  1501. raise ValueError("Compression not supported on Fixed format stores")
  1502. # write the object
  1503. s.write(
  1504. obj=value,
  1505. axes=axes,
  1506. append=append,
  1507. complib=complib,
  1508. complevel=complevel,
  1509. fletcher32=fletcher32,
  1510. min_itemsize=min_itemsize,
  1511. chunksize=chunksize,
  1512. expectedrows=expectedrows,
  1513. dropna=dropna,
  1514. nan_rep=nan_rep,
  1515. data_columns=data_columns,
  1516. track_times=track_times,
  1517. )
  1518. if isinstance(s, Table) and index:
  1519. s.create_index(columns=index)
  1520. def _read_group(self, group: Node):
  1521. s = self._create_storer(group)
  1522. s.infer_axes()
  1523. return s.read()
  1524. def _identify_group(self, key: str, append: bool) -> Node:
  1525. """Identify HDF5 group based on key, delete/create group if needed."""
  1526. group = self.get_node(key)
  1527. # we make this assertion for mypy; the get_node call will already
  1528. # have raised if this is incorrect
  1529. assert self._handle is not None
  1530. # remove the node if we are not appending
  1531. if group is not None and not append:
  1532. self._handle.remove_node(group, recursive=True)
  1533. group = None
  1534. if group is None:
  1535. group = self._create_nodes_and_group(key)
  1536. return group
  1537. def _create_nodes_and_group(self, key: str) -> Node:
  1538. """Create nodes from key and return group name."""
  1539. # assertion for mypy
  1540. assert self._handle is not None
  1541. paths = key.split("/")
  1542. # recursively create the groups
  1543. path = "/"
  1544. for p in paths:
  1545. if not len(p):
  1546. continue
  1547. new_path = path
  1548. if not path.endswith("/"):
  1549. new_path += "/"
  1550. new_path += p
  1551. group = self.get_node(new_path)
  1552. if group is None:
  1553. group = self._handle.create_group(path, p)
  1554. path = new_path
  1555. return group
  1556. class TableIterator:
  1557. """
  1558. Define the iteration interface on a table
  1559. Parameters
  1560. ----------
  1561. store : HDFStore
  1562. s : the referred storer
  1563. func : the function to execute the query
  1564. where : the where of the query
  1565. nrows : the rows to iterate on
  1566. start : the passed start value (default is None)
  1567. stop : the passed stop value (default is None)
  1568. iterator : bool, default False
  1569. Whether to use the default iterator.
  1570. chunksize : the passed chunking value (default is 100000)
  1571. auto_close : bool, default False
  1572. Whether to automatically close the store at the end of iteration.
  1573. """
  1574. chunksize: int | None
  1575. store: HDFStore
  1576. s: GenericFixed | Table
  1577. def __init__(
  1578. self,
  1579. store: HDFStore,
  1580. s: GenericFixed | Table,
  1581. func,
  1582. where,
  1583. nrows,
  1584. start=None,
  1585. stop=None,
  1586. iterator: bool = False,
  1587. chunksize: int | None = None,
  1588. auto_close: bool = False,
  1589. ):
  1590. self.store = store
  1591. self.s = s
  1592. self.func = func
  1593. self.where = where
  1594. # set start/stop if they are not set if we are a table
  1595. if self.s.is_table:
  1596. if nrows is None:
  1597. nrows = 0
  1598. if start is None:
  1599. start = 0
  1600. if stop is None:
  1601. stop = nrows
  1602. stop = min(nrows, stop)
  1603. self.nrows = nrows
  1604. self.start = start
  1605. self.stop = stop
  1606. self.coordinates = None
  1607. if iterator or chunksize is not None:
  1608. if chunksize is None:
  1609. chunksize = 100000
  1610. self.chunksize = int(chunksize)
  1611. else:
  1612. self.chunksize = None
  1613. self.auto_close = auto_close
  1614. def __iter__(self):
  1615. # iterate
  1616. current = self.start
  1617. if self.coordinates is None:
  1618. raise ValueError("Cannot iterate until get_result is called.")
  1619. while current < self.stop:
  1620. stop = min(current + self.chunksize, self.stop)
  1621. value = self.func(None, None, self.coordinates[current:stop])
  1622. current = stop
  1623. if value is None or not len(value):
  1624. continue
  1625. yield value
  1626. self.close()
  1627. def close(self):
  1628. if self.auto_close:
  1629. self.store.close()
  1630. def get_result(self, coordinates: bool = False):
  1631. # return the actual iterator
  1632. if self.chunksize is not None:
  1633. if not isinstance(self.s, Table):
  1634. raise TypeError("can only use an iterator or chunksize on a table")
  1635. self.coordinates = self.s.read_coordinates(where=self.where)
  1636. return self
1637. # if specified, read via coordinates (necessary for multiple selections)
  1638. if coordinates:
  1639. if not isinstance(self.s, Table):
  1640. raise TypeError("can only read_coordinates on a table")
  1641. where = self.s.read_coordinates(
  1642. where=self.where, start=self.start, stop=self.stop
  1643. )
  1644. else:
  1645. where = self.where
  1646. # directly return the result
  1647. results = self.func(self.start, self.stop, where)
  1648. self.close()
  1649. return results
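# A hedged sketch of the coordinate-based path mentioned above (names are
# hypothetical): coordinates selected once can be reused as a ``where``,
# which is how ``HDFStore.select_as_multiple`` shares one selector across
# several tables.
#   >>> coords = store.select_as_coordinates("df", where="A > 0")
#   >>> subset = store.select("df", where=coords)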
  1650. class IndexCol:
  1651. """
  1652. an index column description class
  1653. Parameters
  1654. ----------
  1655. axis : axis which I reference
  1656. values : the ndarray like converted values
  1657. kind : a string description of this type
  1658. typ : the pytables type
  1659. pos : the position in the pytables
  1660. """
  1661. is_an_indexable = True
  1662. is_data_indexable = True
  1663. _info_fields = ["freq", "tz", "index_name"]
  1664. name: str
  1665. cname: str
  1666. def __init__(
  1667. self,
  1668. name: str,
  1669. values=None,
  1670. kind=None,
  1671. typ=None,
  1672. cname: str | None = None,
  1673. axis=None,
  1674. pos=None,
  1675. freq=None,
  1676. tz=None,
  1677. index_name=None,
  1678. ordered=None,
  1679. table=None,
  1680. meta=None,
  1681. metadata=None,
  1682. ):
  1683. if not isinstance(name, str):
  1684. raise ValueError("`name` must be a str.")
  1685. self.values = values
  1686. self.kind = kind
  1687. self.typ = typ
  1688. self.name = name
  1689. self.cname = cname or name
  1690. self.axis = axis
  1691. self.pos = pos
  1692. self.freq = freq
  1693. self.tz = tz
  1694. self.index_name = index_name
  1695. self.ordered = ordered
  1696. self.table = table
  1697. self.meta = meta
  1698. self.metadata = metadata
  1699. if pos is not None:
  1700. self.set_pos(pos)
  1701. # These are ensured as long as the passed arguments match the
  1702. # constructor annotations.
  1703. assert isinstance(self.name, str)
  1704. assert isinstance(self.cname, str)
  1705. @property
  1706. def itemsize(self) -> int:
  1707. # Assumes self.typ has already been initialized
  1708. return self.typ.itemsize
  1709. @property
  1710. def kind_attr(self) -> str:
  1711. return f"{self.name}_kind"
  1712. def set_pos(self, pos: int):
  1713. """set the position of this column in the Table"""
  1714. self.pos = pos
  1715. if pos is not None and self.typ is not None:
  1716. self.typ._v_pos = pos
  1717. def __repr__(self) -> str:
  1718. temp = tuple(
  1719. map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
  1720. )
  1721. return ",".join(
  1722. (
  1723. f"{key}->{value}"
  1724. for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
  1725. )
  1726. )
  1727. def __eq__(self, other: Any) -> bool:
  1728. """compare 2 col items"""
  1729. return all(
  1730. getattr(self, a, None) == getattr(other, a, None)
  1731. for a in ["name", "cname", "axis", "pos"]
  1732. )
  1733. def __ne__(self, other) -> bool:
  1734. return not self.__eq__(other)
  1735. @property
  1736. def is_indexed(self) -> bool:
  1737. """return whether I am an indexed column"""
  1738. if not hasattr(self.table, "cols"):
  1739. # e.g. if infer hasn't been called yet, self.table will be None.
  1740. return False
  1741. return getattr(self.table.cols, self.cname).is_indexed
  1742. def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
  1743. """
  1744. Convert the data from this selection to the appropriate pandas type.
  1745. """
  1746. assert isinstance(values, np.ndarray), type(values)
  1747. # values is a recarray
  1748. if values.dtype.fields is not None:
  1749. values = values[self.cname]
  1750. val_kind = _ensure_decoded(self.kind)
  1751. values = _maybe_convert(values, val_kind, encoding, errors)
  1752. kwargs = {}
  1753. kwargs["name"] = _ensure_decoded(self.index_name)
  1754. if self.freq is not None:
  1755. kwargs["freq"] = _ensure_decoded(self.freq)
  1756. factory: type[Index] | type[DatetimeIndex] = Index
  1757. if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
  1758. factory = DatetimeIndex
  1759. elif values.dtype == "i8" and "freq" in kwargs:
  1760. # PeriodIndex data is stored as i8
  1761. # error: Incompatible types in assignment (expression has type
  1762. # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
  1763. # "Union[Type[Index], Type[DatetimeIndex]]")
  1764. factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment]
  1765. ordinal=x, **kwds
  1766. )
  1767. # making an Index instance could throw a number of different errors
  1768. try:
  1769. new_pd_index = factory(values, **kwargs)
  1770. except ValueError:
1771. # if the output freq is different than what we recorded,
  1772. # it should be None (see also 'doc example part 2')
  1773. if "freq" in kwargs:
  1774. kwargs["freq"] = None
  1775. new_pd_index = factory(values, **kwargs)
  1776. # error: Incompatible types in assignment (expression has type
  1777. # "Union[ndarray, DatetimeIndex]", variable has type "Index")
  1778. new_pd_index = _set_tz(new_pd_index, self.tz) # type: ignore[assignment]
  1779. return new_pd_index, new_pd_index
  1780. def take_data(self):
  1781. """return the values"""
  1782. return self.values
  1783. @property
  1784. def attrs(self):
  1785. return self.table._v_attrs
  1786. @property
  1787. def description(self):
  1788. return self.table.description
  1789. @property
  1790. def col(self):
  1791. """return my current col description"""
  1792. return getattr(self.description, self.cname, None)
  1793. @property
  1794. def cvalues(self):
  1795. """return my cython values"""
  1796. return self.values
  1797. def __iter__(self):
  1798. return iter(self.values)
  1799. def maybe_set_size(self, min_itemsize=None):
  1800. """
  1801. maybe set a string col itemsize:
1802. min_itemsize can be an integer or a dict with this column's name
  1803. with an integer size
  1804. """
  1805. if _ensure_decoded(self.kind) == "string":
  1806. if isinstance(min_itemsize, dict):
  1807. min_itemsize = min_itemsize.get(self.name)
  1808. if min_itemsize is not None and self.typ.itemsize < min_itemsize:
  1809. self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
  1810. def validate_names(self):
  1811. pass
  1812. def validate_and_set(self, handler: AppendableTable, append: bool):
  1813. self.table = handler.table
  1814. self.validate_col()
  1815. self.validate_attr(append)
  1816. self.validate_metadata(handler)
  1817. self.write_metadata(handler)
  1818. self.set_attr()
  1819. def validate_col(self, itemsize=None):
  1820. """validate this column: return the compared against itemsize"""
  1821. # validate this column for string truncation (or reset to the max size)
  1822. if _ensure_decoded(self.kind) == "string":
  1823. c = self.col
  1824. if c is not None:
  1825. if itemsize is None:
  1826. itemsize = self.itemsize
  1827. if c.itemsize < itemsize:
  1828. raise ValueError(
  1829. f"Trying to store a string with len [{itemsize}] in "
  1830. f"[{self.cname}] column but\nthis column has a limit of "
  1831. f"[{c.itemsize}]!\nConsider using min_itemsize to "
  1832. "preset the sizes on these columns"
  1833. )
  1834. return c.itemsize
  1835. return None
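# The itemsize check above is what ``min_itemsize`` exists to satisfy; a
# minimal sketch of presetting a string column's width on append (the
# column name "strcol" is hypothetical; the key "values" applies to all
# value columns):
#   >>> store.append("df", df, min_itemsize={"strcol": 50})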
  1836. def validate_attr(self, append: bool):
  1837. # check for backwards incompatibility
  1838. if append:
  1839. existing_kind = getattr(self.attrs, self.kind_attr, None)
  1840. if existing_kind is not None and existing_kind != self.kind:
  1841. raise TypeError(
  1842. f"incompatible kind in col [{existing_kind} - {self.kind}]"
  1843. )
  1844. def update_info(self, info):
  1845. """
  1846. set/update the info for this indexable with the key/value
1847. if there is a conflict, raise/warn as needed
  1848. """
  1849. for key in self._info_fields:
  1850. value = getattr(self, key, None)
  1851. idx = info.setdefault(self.name, {})
  1852. existing_value = idx.get(key)
  1853. if key in idx and value is not None and existing_value != value:
  1854. # frequency/name just warn
  1855. if key in ["freq", "index_name"]:
  1856. ws = attribute_conflict_doc % (key, existing_value, value)
  1857. warnings.warn(ws, AttributeConflictWarning, stacklevel=6)
  1858. # reset
  1859. idx[key] = None
  1860. setattr(self, key, None)
  1861. else:
  1862. raise ValueError(
  1863. f"invalid info for [{self.name}] for [{key}], "
  1864. f"existing_value [{existing_value}] conflicts with "
  1865. f"new value [{value}]"
  1866. )
  1867. else:
  1868. if value is not None or existing_value is not None:
  1869. idx[key] = value
  1870. def set_info(self, info):
  1871. """set my state from the passed info"""
  1872. idx = info.get(self.name)
  1873. if idx is not None:
  1874. self.__dict__.update(idx)
  1875. def set_attr(self):
  1876. """set the kind for this column"""
  1877. setattr(self.attrs, self.kind_attr, self.kind)
  1878. def validate_metadata(self, handler: AppendableTable):
  1879. """validate that kind=category does not change the categories"""
  1880. if self.meta == "category":
  1881. new_metadata = self.metadata
  1882. cur_metadata = handler.read_metadata(self.cname)
  1883. if (
  1884. new_metadata is not None
  1885. and cur_metadata is not None
  1886. and not array_equivalent(new_metadata, cur_metadata)
  1887. ):
  1888. raise ValueError(
  1889. "cannot append a categorical with "
  1890. "different categories to the existing"
  1891. )
  1892. def write_metadata(self, handler: AppendableTable):
  1893. """set the meta data"""
  1894. if self.metadata is not None:
  1895. handler.write_metadata(self.cname, self.metadata)
  1896. class GenericIndexCol(IndexCol):
  1897. """an index which is not represented in the data of the table"""
  1898. @property
  1899. def is_indexed(self) -> bool:
  1900. return False
  1901. def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
  1902. """
  1903. Convert the data from this selection to the appropriate pandas type.
  1904. Parameters
  1905. ----------
  1906. values : np.ndarray
  1907. nan_rep : str
  1908. encoding : str
  1909. errors : str
  1910. """
  1911. assert isinstance(values, np.ndarray), type(values)
  1912. # error: Incompatible types in assignment (expression has type
  1913. # "Int64Index", variable has type "ndarray")
  1914. values = Int64Index(np.arange(len(values))) # type: ignore[assignment]
  1915. return values, values
  1916. def set_attr(self):
  1917. pass
  1918. class DataCol(IndexCol):
  1919. """
  1920. a data holding column, by definition this is not indexable
  1921. Parameters
  1922. ----------
  1923. data : the actual data
  1924. cname : the column name in the table to hold the data (typically
  1925. values)
  1926. meta : a string description of the metadata
  1927. metadata : the actual metadata
  1928. """
  1929. is_an_indexable = False
  1930. is_data_indexable = False
  1931. _info_fields = ["tz", "ordered"]
  1932. def __init__(
  1933. self,
  1934. name: str,
  1935. values=None,
  1936. kind=None,
  1937. typ=None,
  1938. cname=None,
  1939. pos=None,
  1940. tz=None,
  1941. ordered=None,
  1942. table=None,
  1943. meta=None,
  1944. metadata=None,
  1945. dtype: DtypeArg | None = None,
  1946. data=None,
  1947. ):
  1948. super().__init__(
  1949. name=name,
  1950. values=values,
  1951. kind=kind,
  1952. typ=typ,
  1953. pos=pos,
  1954. cname=cname,
  1955. tz=tz,
  1956. ordered=ordered,
  1957. table=table,
  1958. meta=meta,
  1959. metadata=metadata,
  1960. )
  1961. self.dtype = dtype
  1962. self.data = data
  1963. @property
  1964. def dtype_attr(self) -> str:
  1965. return f"{self.name}_dtype"
  1966. @property
  1967. def meta_attr(self) -> str:
  1968. return f"{self.name}_meta"
  1969. def __repr__(self) -> str:
  1970. temp = tuple(
  1971. map(
  1972. pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
  1973. )
  1974. )
  1975. return ",".join(
  1976. (
  1977. f"{key}->{value}"
  1978. for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
  1979. )
  1980. )
  1981. def __eq__(self, other: Any) -> bool:
  1982. """compare 2 col items"""
  1983. return all(
  1984. getattr(self, a, None) == getattr(other, a, None)
  1985. for a in ["name", "cname", "dtype", "pos"]
  1986. )
  1987. def set_data(self, data: ArrayLike):
  1988. assert data is not None
  1989. assert self.dtype is None
  1990. data, dtype_name = _get_data_and_dtype_name(data)
  1991. self.data = data
  1992. self.dtype = dtype_name
  1993. self.kind = _dtype_to_kind(dtype_name)
  1994. def take_data(self):
  1995. """return the data"""
  1996. return self.data
  1997. @classmethod
  1998. def _get_atom(cls, values: ArrayLike) -> Col:
  1999. """
  2000. Get an appropriately typed and shaped pytables.Col object for values.
  2001. """
  2002. dtype = values.dtype
  2003. # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
  2004. # attribute "itemsize"
  2005. itemsize = dtype.itemsize # type: ignore[union-attr]
  2006. shape = values.shape
  2007. if values.ndim == 1:
  2008. # EA, use block shape pretending it is 2D
  2009. # TODO(EA2D): not necessary with 2D EAs
  2010. shape = (1, values.size)
  2011. if isinstance(values, Categorical):
  2012. codes = values.codes
  2013. atom = cls.get_atom_data(shape, kind=codes.dtype.name)
  2014. elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
  2015. atom = cls.get_atom_datetime64(shape)
  2016. elif is_timedelta64_dtype(dtype):
  2017. atom = cls.get_atom_timedelta64(shape)
  2018. elif is_complex_dtype(dtype):
  2019. atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
  2020. elif is_string_dtype(dtype):
  2021. atom = cls.get_atom_string(shape, itemsize)
  2022. else:
  2023. atom = cls.get_atom_data(shape, kind=dtype.name)
  2024. return atom
  2025. @classmethod
  2026. def get_atom_string(cls, shape, itemsize):
  2027. return _tables().StringCol(itemsize=itemsize, shape=shape[0])
  2028. @classmethod
  2029. def get_atom_coltype(cls, kind: str) -> type[Col]:
  2030. """return the PyTables column class for this column"""
  2031. if kind.startswith("uint"):
  2032. k4 = kind[4:]
  2033. col_name = f"UInt{k4}Col"
  2034. elif kind.startswith("period"):
  2035. # we store as integer
  2036. col_name = "Int64Col"
  2037. else:
  2038. kcap = kind.capitalize()
  2039. col_name = f"{kcap}Col"
  2040. return getattr(_tables(), col_name)
  2041. @classmethod
  2042. def get_atom_data(cls, shape, kind: str) -> Col:
  2043. return cls.get_atom_coltype(kind=kind)(shape=shape[0])
  2044. @classmethod
  2045. def get_atom_datetime64(cls, shape):
  2046. return _tables().Int64Col(shape=shape[0])
  2047. @classmethod
  2048. def get_atom_timedelta64(cls, shape):
  2049. return _tables().Int64Col(shape=shape[0])
  2050. @property
  2051. def shape(self):
  2052. return getattr(self.data, "shape", None)
  2053. @property
  2054. def cvalues(self):
  2055. """return my cython values"""
  2056. return self.data
  2057. def validate_attr(self, append):
  2058. """validate that we have the same order as the existing & same dtype"""
  2059. if append:
  2060. existing_fields = getattr(self.attrs, self.kind_attr, None)
  2061. if existing_fields is not None and existing_fields != list(self.values):
  2062. raise ValueError("appended items do not match existing items in table!")
  2063. existing_dtype = getattr(self.attrs, self.dtype_attr, None)
  2064. if existing_dtype is not None and existing_dtype != self.dtype:
  2065. raise ValueError(
  2066. "appended items dtype do not match existing items dtype in table!"
  2067. )
  2068. def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
  2069. """
  2070. Convert the data from this selection to the appropriate pandas type.
  2071. Parameters
  2072. ----------
  2073. values : np.ndarray
  2074. nan_rep :
  2075. encoding : str
  2076. errors : str
  2077. Returns
  2078. -------
  2079. index : listlike to become an Index
  2080. data : ndarraylike to become a column
  2081. """
  2082. assert isinstance(values, np.ndarray), type(values)
  2083. # values is a recarray
  2084. if values.dtype.fields is not None:
  2085. values = values[self.cname]
  2086. assert self.typ is not None
  2087. if self.dtype is None:
  2088. # Note: in tests we never have timedelta64 or datetime64,
  2089. # so the _get_data_and_dtype_name may be unnecessary
  2090. converted, dtype_name = _get_data_and_dtype_name(values)
  2091. kind = _dtype_to_kind(dtype_name)
  2092. else:
  2093. converted = values
  2094. dtype_name = self.dtype
  2095. kind = self.kind
  2096. assert isinstance(converted, np.ndarray) # for mypy
  2097. # use the meta if needed
  2098. meta = _ensure_decoded(self.meta)
  2099. metadata = self.metadata
  2100. ordered = self.ordered
  2101. tz = self.tz
  2102. assert dtype_name is not None
  2103. # convert to the correct dtype
  2104. dtype = _ensure_decoded(dtype_name)
  2105. # reverse converts
  2106. if dtype == "datetime64":
  2107. # recreate with tz if indicated
  2108. converted = _set_tz(converted, tz, coerce=True)
  2109. elif dtype == "timedelta64":
  2110. converted = np.asarray(converted, dtype="m8[ns]")
  2111. elif dtype == "date":
  2112. try:
  2113. converted = np.asarray(
  2114. [date.fromordinal(v) for v in converted], dtype=object
  2115. )
  2116. except ValueError:
  2117. converted = np.asarray(
  2118. [date.fromtimestamp(v) for v in converted], dtype=object
  2119. )
  2120. elif meta == "category":
  2121. # we have a categorical
  2122. categories = metadata
  2123. codes = converted.ravel()
  2124. # if we have stored a NaN in the categories
  2125. # then strip it; in theory we could have BOTH
  2126. # -1s in the codes and nulls :<
  2127. if categories is None:
  2128. # Handle case of NaN-only categorical columns in which case
  2129. # the categories are an empty array; when this is stored,
  2130. # pytables cannot write a zero-len array, so on readback
  2131. # the categories would be None and `read_hdf()` would fail.
  2132. categories = Index([], dtype=np.float64)
  2133. else:
  2134. mask = isna(categories)
  2135. if mask.any():
  2136. categories = categories[~mask]
  2137. codes[codes != -1] -= mask.astype(int).cumsum()._values
  2138. converted = Categorical.from_codes(
  2139. codes, categories=categories, ordered=ordered
  2140. )
  2141. else:
  2142. try:
  2143. converted = converted.astype(dtype, copy=False)
  2144. except TypeError:
  2145. converted = converted.astype("O", copy=False)
  2146. # convert nans / decode
  2147. if _ensure_decoded(kind) == "string":
  2148. converted = _unconvert_string_array(
  2149. converted, nan_rep=nan_rep, encoding=encoding, errors=errors
  2150. )
  2151. return self.values, converted
  2152. def set_attr(self):
  2153. """set the data for this column"""
  2154. setattr(self.attrs, self.kind_attr, self.values)
  2155. setattr(self.attrs, self.meta_attr, self.meta)
  2156. assert self.dtype is not None
  2157. setattr(self.attrs, self.dtype_attr, self.dtype)
  2158. class DataIndexableCol(DataCol):
  2159. """represent a data column that can be indexed"""
  2160. is_data_indexable = True
  2161. def validate_names(self):
  2162. if not Index(self.values).is_object():
  2163. # TODO: should the message here be more specifically non-str?
  2164. raise ValueError("cannot have non-object label DataIndexableCol")
  2165. @classmethod
  2166. def get_atom_string(cls, shape, itemsize):
  2167. return _tables().StringCol(itemsize=itemsize)
  2168. @classmethod
  2169. def get_atom_data(cls, shape, kind: str) -> Col:
  2170. return cls.get_atom_coltype(kind=kind)()
  2171. @classmethod
  2172. def get_atom_datetime64(cls, shape):
  2173. return _tables().Int64Col()
  2174. @classmethod
  2175. def get_atom_timedelta64(cls, shape):
  2176. return _tables().Int64Col()
  2177. class GenericDataIndexableCol(DataIndexableCol):
  2178. """represent a generic pytables data column"""
  2179. pass
  2180. class Fixed:
  2181. """
  2182. represent an object in my store
  2183. facilitate read/write of various types of objects
  2184. this is an abstract base class
  2185. Parameters
  2186. ----------
  2187. parent : HDFStore
  2188. group : Node
  2189. The group node where the table resides.
  2190. """
  2191. pandas_kind: str
  2192. format_type: str = "fixed" # GH#30962 needed by dask
  2193. obj_type: type[DataFrame | Series]
  2194. ndim: int
  2195. encoding: str
  2196. parent: HDFStore
  2197. group: Node
  2198. errors: str
  2199. is_table = False
  2200. def __init__(
  2201. self,
  2202. parent: HDFStore,
  2203. group: Node,
  2204. encoding: str = "UTF-8",
  2205. errors: str = "strict",
  2206. ):
  2207. assert isinstance(parent, HDFStore), type(parent)
  2208. assert _table_mod is not None # needed for mypy
  2209. assert isinstance(group, _table_mod.Node), type(group)
  2210. self.parent = parent
  2211. self.group = group
  2212. self.encoding = _ensure_encoding(encoding)
  2213. self.errors = errors
  2214. @property
  2215. def is_old_version(self) -> bool:
  2216. return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
  2217. @property
  2218. def version(self) -> tuple[int, int, int]:
  2219. """compute and set our version"""
  2220. version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
  2221. try:
  2222. version = tuple(int(x) for x in version.split("."))
  2223. if len(version) == 2:
  2224. version = version + (0,)
  2225. except AttributeError:
  2226. version = (0, 0, 0)
  2227. return version
  2228. @property
  2229. def pandas_type(self):
  2230. return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
  2231. def __repr__(self) -> str:
  2232. """return a pretty representation of myself"""
  2233. self.infer_axes()
  2234. s = self.shape
  2235. if s is not None:
  2236. if isinstance(s, (list, tuple)):
  2237. jshape = ",".join([pprint_thing(x) for x in s])
  2238. s = f"[{jshape}]"
  2239. return f"{self.pandas_type:12.12} (shape->{s})"
  2240. return self.pandas_type
  2241. def set_object_info(self):
  2242. """set my pandas type & version"""
  2243. self.attrs.pandas_type = str(self.pandas_kind)
  2244. self.attrs.pandas_version = str(_version)
  2245. def copy(self):
  2246. new_self = copy.copy(self)
  2247. return new_self
  2248. @property
  2249. def shape(self):
  2250. return self.nrows
  2251. @property
  2252. def pathname(self):
  2253. return self.group._v_pathname
  2254. @property
  2255. def _handle(self):
  2256. return self.parent._handle
  2257. @property
  2258. def _filters(self):
  2259. return self.parent._filters
  2260. @property
  2261. def _complevel(self) -> int:
  2262. return self.parent._complevel
  2263. @property
  2264. def _fletcher32(self) -> bool:
  2265. return self.parent._fletcher32
  2266. @property
  2267. def attrs(self):
  2268. return self.group._v_attrs
  2269. def set_attrs(self):
  2270. """set our object attributes"""
  2271. pass
  2272. def get_attrs(self):
  2273. """get our object attributes"""
  2274. pass
  2275. @property
  2276. def storable(self):
  2277. """return my storable"""
  2278. return self.group
  2279. @property
  2280. def is_exists(self) -> bool:
  2281. return False
  2282. @property
  2283. def nrows(self):
  2284. return getattr(self.storable, "nrows", None)
  2285. def validate(self, other):
  2286. """validate against an existing storable"""
  2287. if other is None:
  2288. return
  2289. return True
  2290. def validate_version(self, where=None):
  2291. """are we trying to operate on an old version?"""
  2292. return True
  2293. def infer_axes(self):
  2294. """
  2295. infer the axes of my storer
  2296. return a boolean indicating if we have a valid storer or not
  2297. """
  2298. s = self.storable
  2299. if s is None:
  2300. return False
  2301. self.get_attrs()
  2302. return True
  2303. def read(
  2304. self,
  2305. where=None,
  2306. columns=None,
  2307. start: int | None = None,
  2308. stop: int | None = None,
  2309. ):
  2310. raise NotImplementedError(
  2311. "cannot read on an abstract storer: subclasses should implement"
  2312. )
  2313. def write(self, **kwargs):
  2314. raise NotImplementedError(
  2315. "cannot write on an abstract storer: subclasses should implement"
  2316. )
  2317. def delete(self, where=None, start: int | None = None, stop: int | None = None):
  2318. """
  2319. support fully deleting the node in its entirety (only) - where
  2320. specification must be None
  2321. """
  2322. if com.all_none(where, start, stop):
  2323. self._handle.remove_node(self.group, recursive=True)
  2324. return None
  2325. raise TypeError("cannot delete on an abstract storer")
  2326. class GenericFixed(Fixed):
  2327. """a generified fixed version"""
  2328. _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
  2329. _reverse_index_map = {v: k for k, v in _index_type_map.items()}
  2330. attributes: list[str] = []
  2331. # indexer helpers
  2332. def _class_to_alias(self, cls) -> str:
  2333. return self._index_type_map.get(cls, "")
  2334. def _alias_to_class(self, alias):
  2335. if isinstance(alias, type): # pragma: no cover
  2336. # compat: for a short period of time master stored types
  2337. return alias
  2338. return self._reverse_index_map.get(alias, Index)
  2339. def _get_index_factory(self, attrs):
  2340. index_class = self._alias_to_class(
  2341. _ensure_decoded(getattr(attrs, "index_class", ""))
  2342. )
  2343. factory: Callable
  2344. if index_class == DatetimeIndex:
  2345. def f(values, freq=None, tz=None):
  2346. # data are already in UTC, localize and convert if tz present
  2347. dta = DatetimeArray._simple_new(values.values, freq=freq)
  2348. result = DatetimeIndex._simple_new(dta, name=None)
  2349. if tz is not None:
  2350. result = result.tz_localize("UTC").tz_convert(tz)
  2351. return result
  2352. factory = f
  2353. elif index_class == PeriodIndex:
  2354. def f(values, freq=None, tz=None):
  2355. parr = PeriodArray._simple_new(values, freq=freq)
  2356. return PeriodIndex._simple_new(parr, name=None)
  2357. factory = f
  2358. else:
  2359. factory = index_class
  2360. kwargs = {}
  2361. if "freq" in attrs:
  2362. kwargs["freq"] = attrs["freq"]
  2363. if index_class is Index:
  2364. # DTI/PI would be gotten by _alias_to_class
  2365. factory = TimedeltaIndex
  2366. if "tz" in attrs:
  2367. if isinstance(attrs["tz"], bytes):
  2368. # created by python2
  2369. kwargs["tz"] = attrs["tz"].decode("utf-8")
  2370. else:
  2371. # created by python3
  2372. kwargs["tz"] = attrs["tz"]
  2373. assert index_class is DatetimeIndex # just checking
  2374. return factory, kwargs
  2375. def validate_read(self, columns, where):
  2376. """
2377. raise if any keywords are passed which are non-None
  2378. """
  2379. if columns is not None:
  2380. raise TypeError(
  2381. "cannot pass a column specification when reading "
  2382. "a Fixed format store. this store must be selected in its entirety"
  2383. )
  2384. if where is not None:
  2385. raise TypeError(
  2386. "cannot pass a where specification when reading "
  2387. "from a Fixed format store. this store must be selected in its entirety"
  2388. )
  2389. @property
  2390. def is_exists(self) -> bool:
  2391. return True
  2392. def set_attrs(self):
  2393. """set our object attributes"""
  2394. self.attrs.encoding = self.encoding
  2395. self.attrs.errors = self.errors
  2396. def get_attrs(self):
  2397. """retrieve our attributes"""
  2398. self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
  2399. self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
  2400. for n in self.attributes:
  2401. setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
  2402. def write(self, obj, **kwargs):
  2403. self.set_attrs()
  2404. def read_array(self, key: str, start: int | None = None, stop: int | None = None):
  2405. """read an array for the specified node (off of group"""
  2406. import tables
  2407. node = getattr(self.group, key)
  2408. attrs = node._v_attrs
  2409. transposed = getattr(attrs, "transposed", False)
  2410. if isinstance(node, tables.VLArray):
  2411. ret = node[0][start:stop]
  2412. else:
  2413. dtype = _ensure_decoded(getattr(attrs, "value_type", None))
  2414. shape = getattr(attrs, "shape", None)
  2415. if shape is not None:
  2416. # length 0 axis
  2417. ret = np.empty(shape, dtype=dtype)
  2418. else:
  2419. ret = node[start:stop]
  2420. if dtype == "datetime64":
  2421. # reconstruct a timezone if indicated
  2422. tz = getattr(attrs, "tz", None)
  2423. ret = _set_tz(ret, tz, coerce=True)
  2424. elif dtype == "timedelta64":
  2425. ret = np.asarray(ret, dtype="m8[ns]")
  2426. if transposed:
  2427. return ret.T
  2428. else:
  2429. return ret
  2430. def read_index(
  2431. self, key: str, start: int | None = None, stop: int | None = None
  2432. ) -> Index:
  2433. variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
  2434. if variety == "multi":
  2435. return self.read_multi_index(key, start=start, stop=stop)
  2436. elif variety == "regular":
  2437. node = getattr(self.group, key)
  2438. index = self.read_index_node(node, start=start, stop=stop)
  2439. return index
  2440. else: # pragma: no cover
  2441. raise TypeError(f"unrecognized index variety: {variety}")
  2442. def write_index(self, key: str, index: Index):
  2443. if isinstance(index, MultiIndex):
  2444. setattr(self.attrs, f"{key}_variety", "multi")
  2445. self.write_multi_index(key, index)
  2446. else:
  2447. setattr(self.attrs, f"{key}_variety", "regular")
  2448. converted = _convert_index("index", index, self.encoding, self.errors)
  2449. self.write_array(key, converted.values)
  2450. node = getattr(self.group, key)
  2451. node._v_attrs.kind = converted.kind
  2452. node._v_attrs.name = index.name
  2453. if isinstance(index, (DatetimeIndex, PeriodIndex)):
  2454. node._v_attrs.index_class = self._class_to_alias(type(index))
  2455. if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
  2456. node._v_attrs.freq = index.freq
  2457. if isinstance(index, DatetimeIndex) and index.tz is not None:
  2458. node._v_attrs.tz = _get_tz(index.tz)
  2459. def write_multi_index(self, key: str, index: MultiIndex):
  2460. setattr(self.attrs, f"{key}_nlevels", index.nlevels)
  2461. for i, (lev, level_codes, name) in enumerate(
  2462. zip(index.levels, index.codes, index.names)
  2463. ):
  2464. # write the level
  2465. if is_extension_array_dtype(lev):
  2466. raise NotImplementedError(
  2467. "Saving a MultiIndex with an extension dtype is not supported."
  2468. )
  2469. level_key = f"{key}_level{i}"
  2470. conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
  2471. self.write_array(level_key, conv_level.values)
  2472. node = getattr(self.group, level_key)
  2473. node._v_attrs.kind = conv_level.kind
  2474. node._v_attrs.name = name
  2475. # write the name
  2476. setattr(node._v_attrs, f"{key}_name{name}", name)
  2477. # write the labels
  2478. label_key = f"{key}_label{i}"
  2479. self.write_array(label_key, level_codes)
  2480. def read_multi_index(
  2481. self, key: str, start: int | None = None, stop: int | None = None
  2482. ) -> MultiIndex:
  2483. nlevels = getattr(self.attrs, f"{key}_nlevels")
  2484. levels = []
  2485. codes = []
  2486. names: list[Hashable] = []
  2487. for i in range(nlevels):
  2488. level_key = f"{key}_level{i}"
  2489. node = getattr(self.group, level_key)
  2490. lev = self.read_index_node(node, start=start, stop=stop)
  2491. levels.append(lev)
  2492. names.append(lev.name)
  2493. label_key = f"{key}_label{i}"
  2494. level_codes = self.read_array(label_key, start=start, stop=stop)
  2495. codes.append(level_codes)
  2496. return MultiIndex(
  2497. levels=levels, codes=codes, names=names, verify_integrity=True
  2498. )
  2499. def read_index_node(
  2500. self, node: Node, start: int | None = None, stop: int | None = None
  2501. ) -> Index:
  2502. data = node[start:stop]
  2503. # If the index was an empty array write_array_empty() will
  2504. # have written a sentinel. Here we replace it with the original.
  2505. if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
  2506. data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
  2507. kind = _ensure_decoded(node._v_attrs.kind)
  2508. name = None
  2509. if "name" in node._v_attrs:
  2510. name = _ensure_str(node._v_attrs.name)
  2511. name = _ensure_decoded(name)
  2512. attrs = node._v_attrs
  2513. factory, kwargs = self._get_index_factory(attrs)
  2514. if kind == "date":
  2515. index = factory(
  2516. _unconvert_index(
  2517. data, kind, encoding=self.encoding, errors=self.errors
  2518. ),
  2519. dtype=object,
  2520. **kwargs,
  2521. )
  2522. else:
  2523. index = factory(
  2524. _unconvert_index(
  2525. data, kind, encoding=self.encoding, errors=self.errors
  2526. ),
  2527. **kwargs,
  2528. )
  2529. index.name = name
  2530. return index
  2531. def write_array_empty(self, key: str, value: ArrayLike):
  2532. """write a 0-len array"""
  2533. # ugly hack for length 0 axes
  2534. arr = np.empty((1,) * value.ndim)
  2535. self._handle.create_array(self.group, key, arr)
  2536. node = getattr(self.group, key)
  2537. node._v_attrs.value_type = str(value.dtype)
  2538. node._v_attrs.shape = value.shape
  2539. def write_array(
  2540. self, key: str, obj: DataFrame | Series, items: Index | None = None
  2541. ) -> None:
  2542. # TODO: we only have a few tests that get here, the only EA
  2543. # that gets passed is DatetimeArray, and we never have
  2544. # both self._filters and EA
  2545. value = extract_array(obj, extract_numpy=True)
  2546. if key in self.group:
  2547. self._handle.remove_node(self.group, key)
  2548. # Transform needed to interface with pytables row/col notation
  2549. empty_array = value.size == 0
  2550. transposed = False
  2551. if is_categorical_dtype(value.dtype):
  2552. raise NotImplementedError(
  2553. "Cannot store a category dtype in a HDF5 dataset that uses format="
  2554. '"fixed". Use format="table".'
  2555. )
  2556. if not empty_array:
  2557. if hasattr(value, "T"):
  2558. # ExtensionArrays (1d) may not have transpose.
  2559. value = value.T
  2560. transposed = True
  2561. atom = None
  2562. if self._filters is not None:
  2563. with suppress(ValueError):
  2564. # get the atom for this datatype
  2565. atom = _tables().Atom.from_dtype(value.dtype)
  2566. if atom is not None:
  2567. # We only get here if self._filters is non-None and
  2568. # the Atom.from_dtype call succeeded
  2569. # create an empty chunked array and fill it from value
  2570. if not empty_array:
  2571. ca = self._handle.create_carray(
  2572. self.group, key, atom, value.shape, filters=self._filters
  2573. )
  2574. ca[:] = value
  2575. else:
  2576. self.write_array_empty(key, value)
  2577. elif value.dtype.type == np.object_:
  2578. # infer the type, warn if we have a non-string type here (for
  2579. # performance)
  2580. inferred_type = lib.infer_dtype(value, skipna=False)
  2581. if empty_array:
  2582. pass
  2583. elif inferred_type == "string":
  2584. pass
  2585. else:
  2586. ws = performance_doc % (inferred_type, key, items)
  2587. warnings.warn(ws, PerformanceWarning, stacklevel=7)
  2588. vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
  2589. vlarr.append(value)
  2590. elif is_datetime64_dtype(value.dtype):
  2591. self._handle.create_array(self.group, key, value.view("i8"))
  2592. getattr(self.group, key)._v_attrs.value_type = "datetime64"
  2593. elif is_datetime64tz_dtype(value.dtype):
  2594. # store as UTC
  2595. # with a zone
  2596. # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
  2597. # attribute "asi8"
  2598. self._handle.create_array(
  2599. self.group, key, value.asi8 # type: ignore[union-attr]
  2600. )
  2601. node = getattr(self.group, key)
  2602. # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
  2603. # attribute "tz"
  2604. node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
  2605. node._v_attrs.value_type = "datetime64"
  2606. elif is_timedelta64_dtype(value.dtype):
  2607. self._handle.create_array(self.group, key, value.view("i8"))
  2608. getattr(self.group, key)._v_attrs.value_type = "timedelta64"
  2609. elif empty_array:
  2610. self.write_array_empty(key, value)
  2611. else:
  2612. self._handle.create_array(self.group, key, value)
  2613. getattr(self.group, key)._v_attrs.transposed = transposed
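# A short sketch of the datetime64tz branch above (path/key hypothetical):
# values are written as UTC i8 with the zone kept as a node attribute, so a
# round trip restores the original timezone.
#   >>> s = pd.Series(pd.date_range("2021-01-01", periods=3, tz="Europe/Paris"))
#   >>> s.to_hdf("store.h5", key="s", format="fixed")
#   >>> pd.read_hdf("store.h5", "s")  # tz-aware again after conversion from UTC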
  2614. class SeriesFixed(GenericFixed):
  2615. pandas_kind = "series"
  2616. attributes = ["name"]
  2617. name: Hashable
  2618. @property
  2619. def shape(self):
  2620. try:
  2621. return (len(self.group.values),)
  2622. except (TypeError, AttributeError):
  2623. return None
  2624. def read(
  2625. self,
  2626. where=None,
  2627. columns=None,
  2628. start: int | None = None,
  2629. stop: int | None = None,
  2630. ):
  2631. self.validate_read(columns, where)
  2632. index = self.read_index("index", start=start, stop=stop)
  2633. values = self.read_array("values", start=start, stop=stop)
  2634. return Series(values, index=index, name=self.name)
  2635. def write(self, obj, **kwargs):
  2636. super().write(obj, **kwargs)
  2637. self.write_index("index", obj.index)
  2638. self.write_array("values", obj)
  2639. self.attrs.name = obj.name
  2640. class BlockManagerFixed(GenericFixed):
  2641. attributes = ["ndim", "nblocks"]
  2642. nblocks: int
  2643. @property
  2644. def shape(self) -> Shape | None:
  2645. try:
  2646. ndim = self.ndim
  2647. # items
  2648. items = 0
  2649. for i in range(self.nblocks):
  2650. node = getattr(self.group, f"block{i}_items")
  2651. shape = getattr(node, "shape", None)
  2652. if shape is not None:
  2653. items += shape[0]
  2654. # data shape
  2655. node = self.group.block0_values
  2656. shape = getattr(node, "shape", None)
  2657. if shape is not None:
  2658. shape = list(shape[0 : (ndim - 1)])
  2659. else:
  2660. shape = []
  2661. shape.append(items)
  2662. return shape
  2663. except AttributeError:
  2664. return None
  2665. def read(
  2666. self,
  2667. where=None,
  2668. columns=None,
  2669. start: int | None = None,
  2670. stop: int | None = None,
  2671. ):
  2672. # start, stop applied to rows, so 0th axis only
  2673. self.validate_read(columns, where)
  2674. select_axis = self.obj_type()._get_block_manager_axis(0)
  2675. axes = []
  2676. for i in range(self.ndim):
  2677. _start, _stop = (start, stop) if i == select_axis else (None, None)
  2678. ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
  2679. axes.append(ax)
  2680. items = axes[0]
  2681. dfs = []
  2682. for i in range(self.nblocks):
  2683. blk_items = self.read_index(f"block{i}_items")
  2684. values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
  2685. columns = items[items.get_indexer(blk_items)]
  2686. df = DataFrame(values.T, columns=columns, index=axes[1])
  2687. dfs.append(df)
  2688. if len(dfs) > 0:
  2689. out = concat(dfs, axis=1)
  2690. out = out.reindex(columns=items, copy=False)
  2691. return out
  2692. return DataFrame(columns=axes[0], index=axes[1])
  2693. def write(self, obj, **kwargs):
  2694. super().write(obj, **kwargs)
  2695. # TODO(ArrayManager) HDFStore relies on accessing the blocks
  2696. if isinstance(obj._mgr, ArrayManager):
  2697. obj = obj._as_manager("block")
  2698. data = obj._mgr
  2699. if not data.is_consolidated():
  2700. data = data.consolidate()
  2701. self.attrs.ndim = data.ndim
  2702. for i, ax in enumerate(data.axes):
  2703. if i == 0 and (not ax.is_unique):
  2704. raise ValueError("Columns index has to be unique for fixed format")
  2705. self.write_index(f"axis{i}", ax)
  2706. # Supporting mixed-type DataFrame objects...nontrivial
  2707. self.attrs.nblocks = len(data.blocks)
  2708. for i, blk in enumerate(data.blocks):
  2709. # I have no idea why, but writing values before items fixed #2299
  2710. blk_items = data.items.take(blk.mgr_locs)
  2711. self.write_array(f"block{i}_values", blk.values, items=blk_items)
  2712. self.write_index(f"block{i}_items", blk_items)
  2713. class FrameFixed(BlockManagerFixed):
  2714. pandas_kind = "frame"
  2715. obj_type = DataFrame
  2716. class Table(Fixed):
  2717. """
  2718. represent a table:
  2719. facilitate read/write of various types of tables
  2720. Attrs in Table Node
  2721. -------------------
2722. These are attributes that are stored in the main table node; they are
  2723. necessary to recreate these tables when read back in.
  2724. index_axes : a list of tuples of the (original indexing axis and
  2725. index column)
  2726. non_index_axes: a list of tuples of the (original index axis and
  2727. columns on a non-indexing axis)
  2728. values_axes : a list of the columns which comprise the data of this
  2729. table
  2730. data_columns : a list of the columns that we are allowing indexing
  2731. (these become single columns in values_axes), or True to force all
  2732. columns
  2733. nan_rep : the string to use for nan representations for string
  2734. objects
  2735. levels : the names of levels
  2736. metadata : the names of the metadata columns
  2737. """
  2738. pandas_kind = "wide_table"
  2739. format_type: str = "table" # GH#30962 needed by dask
  2740. table_type: str
  2741. levels: int | list[Hashable] = 1
  2742. is_table = True
  2743. index_axes: list[IndexCol]
  2744. non_index_axes: list[tuple[int, Any]]
  2745. values_axes: list[DataCol]
  2746. data_columns: list
  2747. metadata: list
  2748. info: dict
  2749. def __init__(
  2750. self,
  2751. parent: HDFStore,
  2752. group: Node,
  2753. encoding=None,
  2754. errors: str = "strict",
  2755. index_axes=None,
  2756. non_index_axes=None,
  2757. values_axes=None,
  2758. data_columns=None,
  2759. info=None,
  2760. nan_rep=None,
  2761. ):
  2762. super().__init__(parent, group, encoding=encoding, errors=errors)
  2763. self.index_axes = index_axes or []
  2764. self.non_index_axes = non_index_axes or []
  2765. self.values_axes = values_axes or []
  2766. self.data_columns = data_columns or []
  2767. self.info = info or {}
  2768. self.nan_rep = nan_rep
  2769. @property
  2770. def table_type_short(self) -> str:
  2771. return self.table_type.split("_")[0]
  2772. def __repr__(self) -> str:
  2773. """return a pretty representation of myself"""
  2774. self.infer_axes()
  2775. jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
  2776. dc = f",dc->[{jdc}]"
  2777. ver = ""
  2778. if self.is_old_version:
  2779. jver = ".".join([str(x) for x in self.version])
  2780. ver = f"[{jver}]"
  2781. jindex_axes = ",".join([a.name for a in self.index_axes])
  2782. return (
  2783. f"{self.pandas_type:12.12}{ver} "
  2784. f"(typ->{self.table_type_short},nrows->{self.nrows},"
  2785. f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
  2786. )
  2787. def __getitem__(self, c: str):
  2788. """return the axis for c"""
  2789. for a in self.axes:
  2790. if c == a.name:
  2791. return a
  2792. return None
  2793. def validate(self, other):
  2794. """validate against an existing table"""
  2795. if other is None:
  2796. return
  2797. if other.table_type != self.table_type:
  2798. raise TypeError(
  2799. "incompatible table_type with existing "
  2800. f"[{other.table_type} - {self.table_type}]"
  2801. )
  2802. for c in ["index_axes", "non_index_axes", "values_axes"]:
  2803. sv = getattr(self, c, None)
  2804. ov = getattr(other, c, None)
  2805. if sv != ov:
  2806. # show the error for the specific axes
  2807. for i, sax in enumerate(sv):
  2808. oax = ov[i]
  2809. if sax != oax:
  2810. raise ValueError(
  2811. f"invalid combination of [{c}] on appending data "
  2812. f"[{sax}] vs current table [{oax}]"
  2813. )
  2814. # should never get here
  2815. raise Exception(
  2816. f"invalid combination of [{c}] on appending data [{sv}] vs "
  2817. f"current table [{ov}]"
  2818. )
  2819. @property
  2820. def is_multi_index(self) -> bool:
  2821. """the levels attribute is 1 or a list in the case of a multi-index"""
  2822. return isinstance(self.levels, list)
  2823. def validate_multiindex(
  2824. self, obj: DataFrame | Series
  2825. ) -> tuple[DataFrame, list[Hashable]]:
  2826. """
  2827. validate that we can store the multi-index; reset and return the
  2828. new object
  2829. """
  2830. levels = [
  2831. l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names)
  2832. ]
  2833. try:
  2834. reset_obj = obj.reset_index()
  2835. except ValueError as err:
  2836. raise ValueError(
  2837. "duplicate names/columns in the multi-index when storing as a table"
  2838. ) from err
  2839. assert isinstance(reset_obj, DataFrame) # for mypy
  2840. return reset_obj, levels
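# A hedged sketch of what the reset above means for users (names are
# hypothetical): the MultiIndex levels are written as ordinary columns and
# the index is reassembled from the recorded level names on read.
#   >>> df.set_index(["a", "b"]).to_hdf("store.h5", key="mi", format="table")
#   >>> pd.read_hdf("store.h5", "mi")  # index comes back as a MultiIndex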
  2841. @property
  2842. def nrows_expected(self) -> int:
  2843. """based on our axes, compute the expected nrows"""
  2844. return np.prod([i.cvalues.shape[0] for i in self.index_axes])
  2845. @property
  2846. def is_exists(self) -> bool:
  2847. """has this table been created"""
  2848. return "table" in self.group
  2849. @property
  2850. def storable(self):
  2851. return getattr(self.group, "table", None)
  2852. @property
  2853. def table(self):
  2854. """return the table group (this is my storable)"""
  2855. return self.storable
  2856. @property
  2857. def dtype(self):
  2858. return self.table.dtype
  2859. @property
  2860. def description(self):
  2861. return self.table.description
  2862. @property
  2863. def axes(self):
  2864. return itertools.chain(self.index_axes, self.values_axes)
  2865. @property
  2866. def ncols(self) -> int:
  2867. """the number of total columns in the values axes"""
  2868. return sum(len(a.values) for a in self.values_axes)
  2869. @property
  2870. def is_transposed(self) -> bool:
  2871. return False
  2872. @property
  2873. def data_orientation(self):
  2874. """return a tuple of my permutated axes, non_indexable at the front"""
  2875. return tuple(
  2876. itertools.chain(
  2877. [int(a[0]) for a in self.non_index_axes],
  2878. [int(a.axis) for a in self.index_axes],
  2879. )
  2880. )
  2881. def queryables(self) -> dict[str, Any]:
  2882. """return a dict of the kinds allowable columns for this object"""
  2883. # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
  2884. axis_names = {0: "index", 1: "columns"}
  2885. # compute the values_axes queryables
  2886. d1 = [(a.cname, a) for a in self.index_axes]
  2887. d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
  2888. d3 = [
  2889. (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
  2890. ]
  2891. # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" and
  2892. # "List[Tuple[str, None]]")
  2893. return dict(d1 + d2 + d3) # type: ignore[operator]
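# Only names returned here are legal in a ``where`` clause: in practice the
# index column(s) plus whatever was passed as data_columns. A hedged
# example (column "B" is hypothetical):
#   >>> store.select("df", where="index >= '2021-01-01' & B == 'foo'")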
  2894. def index_cols(self):
  2895. """return a list of my index cols"""
  2896. # Note: each `i.cname` below is assured to be a str.
  2897. return [(i.axis, i.cname) for i in self.index_axes]
  2898. def values_cols(self) -> list[str]:
  2899. """return a list of my values cols"""
  2900. return [i.cname for i in self.values_axes]
  2901. def _get_metadata_path(self, key: str) -> str:
  2902. """return the metadata pathname for this key"""
  2903. group = self.group._v_pathname
  2904. return f"{group}/meta/{key}/meta"
  2905. def write_metadata(self, key: str, values: np.ndarray):
  2906. """
2907. Write out a metadata array to the key as a table-format Series.
  2908. Parameters
  2909. ----------
  2910. key : str
  2911. values : ndarray
  2912. """
  2913. self.parent.put(
  2914. self._get_metadata_path(key),
  2915. Series(values),
  2916. format="table",
  2917. encoding=self.encoding,
  2918. errors=self.errors,
  2919. nan_rep=self.nan_rep,
  2920. )
  2921. def read_metadata(self, key: str):
  2922. """return the meta data array for this key"""
  2923. if getattr(getattr(self.group, "meta", None), key, None) is not None:
  2924. return self.parent.select(self._get_metadata_path(key))
  2925. return None
  2926. def set_attrs(self):
  2927. """set our table type & indexables"""
  2928. self.attrs.table_type = str(self.table_type)
  2929. self.attrs.index_cols = self.index_cols()
  2930. self.attrs.values_cols = self.values_cols()
  2931. self.attrs.non_index_axes = self.non_index_axes
  2932. self.attrs.data_columns = self.data_columns
  2933. self.attrs.nan_rep = self.nan_rep
  2934. self.attrs.encoding = self.encoding
  2935. self.attrs.errors = self.errors
  2936. self.attrs.levels = self.levels
  2937. self.attrs.info = self.info
  2938. def get_attrs(self):
  2939. """retrieve our attributes"""
  2940. self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
  2941. self.data_columns = getattr(self.attrs, "data_columns", None) or []
  2942. self.info = getattr(self.attrs, "info", None) or {}
  2943. self.nan_rep = getattr(self.attrs, "nan_rep", None)
  2944. self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
  2945. self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
  2946. self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
  2947. self.index_axes = [a for a in self.indexables if a.is_an_indexable]
  2948. self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
  2949. def validate_version(self, where=None):
  2950. """are we trying to operate on an old version?"""
  2951. if where is not None:
  2952. if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1:
  2953. ws = incompatibility_doc % ".".join([str(x) for x in self.version])
  2954. warnings.warn(ws, IncompatibilityWarning)
  2955. def validate_min_itemsize(self, min_itemsize):
  2956. """
  2957. validate the min_itemsize doesn't contain items that are not in the
2958. axes; this needs data_columns to be defined
  2959. """
  2960. if min_itemsize is None:
  2961. return
  2962. if not isinstance(min_itemsize, dict):
  2963. return
  2964. q = self.queryables()
  2965. for k in min_itemsize:
  2966. # ok, apply generally
  2967. if k == "values":
  2968. continue
  2969. if k not in q:
  2970. raise ValueError(
  2971. f"min_itemsize has the key [{k}] which is not an axis or "
  2972. "data_column"
  2973. )
  2974. @cache_readonly
  2975. def indexables(self):
  2976. """create/cache the indexables if they don't exist"""
  2977. _indexables = []
  2978. desc = self.description
  2979. table_attrs = self.table.attrs
  2980. # Note: each of the `name` kwargs below are str, ensured
  2981. # by the definition in index_cols.
  2982. # index columns
  2983. for i, (axis, name) in enumerate(self.attrs.index_cols):
  2984. atom = getattr(desc, name)
  2985. md = self.read_metadata(name)
  2986. meta = "category" if md is not None else None
  2987. kind_attr = f"{name}_kind"
  2988. kind = getattr(table_attrs, kind_attr, None)
  2989. index_col = IndexCol(
  2990. name=name,
  2991. axis=axis,
  2992. pos=i,
  2993. kind=kind,
  2994. typ=atom,
  2995. table=self.table,
  2996. meta=meta,
  2997. metadata=md,
  2998. )
  2999. _indexables.append(index_col)
  3000. # values columns
  3001. dc = set(self.data_columns)
  3002. base_pos = len(_indexables)
  3003. def f(i, c):
  3004. assert isinstance(c, str)
  3005. klass = DataCol
  3006. if c in dc:
  3007. klass = DataIndexableCol
  3008. atom = getattr(desc, c)
  3009. adj_name = _maybe_adjust_name(c, self.version)
  3010. # TODO: why kind_attr here?
  3011. values = getattr(table_attrs, f"{adj_name}_kind", None)
  3012. dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
  3013. kind = _dtype_to_kind(dtype)
  3014. md = self.read_metadata(c)
3015. # TODO: figure out why these two versions of `meta` don't always match.
  3016. # meta = "category" if md is not None else None
  3017. meta = getattr(table_attrs, f"{adj_name}_meta", None)
  3018. obj = klass(
  3019. name=adj_name,
  3020. cname=c,
  3021. values=values,
  3022. kind=kind,
  3023. pos=base_pos + i,
  3024. typ=atom,
  3025. table=self.table,
  3026. meta=meta,
  3027. metadata=md,
  3028. dtype=dtype,
  3029. )
  3030. return obj
  3031. # Note: the definition of `values_cols` ensures that each
  3032. # `c` below is a str.
  3033. _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
  3034. return _indexables
  3035. def create_index(self, columns=None, optlevel=None, kind: str | None = None):
  3036. """
  3037. Create a pytables index on the specified columns.
  3038. Parameters
  3039. ----------
  3040. columns : None, bool, or listlike[str]
  3041. Indicate which columns to create an index on.
  3042. * False : Do not create any indexes.
  3043. * True : Create indexes on all columns.
  3044. * None : Create indexes on all columns.
  3045. * listlike : Create indexes on the given columns.
  3046. optlevel : int or None, default None
  3047. Optimization level, if None, pytables defaults to 6.
  3048. kind : str or None, default None
  3049. Kind of index, if None, pytables defaults to "medium".
  3050. Raises
  3051. ------
  3052. TypeError if trying to create an index on a complex-type column.
  3053. Notes
  3054. -----
  3055. Cannot index Time64Col or ComplexCol.
  3056. Pytables must be >= 3.0.
  3057. """
  3058. if not self.infer_axes():
  3059. return
  3060. if columns is False:
  3061. return
  3062. # index all indexables and data_columns
  3063. if columns is None or columns is True:
  3064. columns = [a.cname for a in self.axes if a.is_data_indexable]
  3065. if not isinstance(columns, (tuple, list)):
  3066. columns = [columns]
  3067. kw = {}
  3068. if optlevel is not None:
  3069. kw["optlevel"] = optlevel
  3070. if kind is not None:
  3071. kw["kind"] = kind
  3072. table = self.table
  3073. for c in columns:
  3074. v = getattr(table.cols, c, None)
  3075. if v is not None:
  3076. # remove the index if the kind/optlevel have changed
  3077. if v.is_indexed:
  3078. index = v.index
  3079. cur_optlevel = index.optlevel
  3080. cur_kind = index.kind
  3081. if kind is not None and cur_kind != kind:
  3082. v.remove_index()
  3083. else:
  3084. kw["kind"] = cur_kind
  3085. if optlevel is not None and cur_optlevel != optlevel:
  3086. v.remove_index()
  3087. else:
  3088. kw["optlevel"] = cur_optlevel
  3089. # create the index
  3090. if not v.is_indexed:
  3091. if v.type.startswith("complex"):
  3092. raise TypeError(
  3093. "Columns containing complex values can be stored but "
  3094. "cannot be indexed when using table format. Either use "
  3095. "fixed format, set index=False, or do not include "
  3096. "the columns containing complex values to "
  3097. "data_columns when initializing the table."
  3098. )
  3099. v.create_index(**kw)
  3100. elif c in self.non_index_axes[0][1]:
  3101. # GH 28156
  3102. raise AttributeError(
  3103. f"column {c} is not a data_column.\n"
  3104. f"In order to read column {c} you must reload the dataframe \n"
  3105. f"into HDFStore and include {c} with the data_columns argument."
  3106. )
  3107. def _read_axes(
  3108. self, where, start: int | None = None, stop: int | None = None
  3109. ) -> list[tuple[ArrayLike, ArrayLike]]:
  3110. """
  3111. Create the axes sniffed from the table.
  3112. Parameters
  3113. ----------
  3114. where : ???
  3115. start : int or None, default None
  3116. stop : int or None, default None
  3117. Returns
  3118. -------
  3119. List[Tuple[index_values, column_values]]
  3120. """
  3121. # create the selection
  3122. selection = Selection(self, where=where, start=start, stop=stop)
  3123. values = selection.select()
  3124. results = []
  3125. # convert the data
  3126. for a in self.axes:
  3127. a.set_info(self.info)
  3128. res = a.convert(
  3129. values,
  3130. nan_rep=self.nan_rep,
  3131. encoding=self.encoding,
  3132. errors=self.errors,
  3133. )
  3134. results.append(res)
  3135. return results
  3136. @classmethod
  3137. def get_object(cls, obj, transposed: bool):
  3138. """return the data for this obj"""
  3139. return obj
  3140. def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
  3141. """
3142. take the input data_columns and min_itemsize and create a data
  3143. columns spec
  3144. """
  3145. if not len(non_index_axes):
  3146. return []
  3147. axis, axis_labels = non_index_axes[0]
  3148. info = self.info.get(axis, {})
  3149. if info.get("type") == "MultiIndex" and data_columns:
  3150. raise ValueError(
  3151. f"cannot use a multi-index on axis [{axis}] with "
  3152. f"data_columns {data_columns}"
  3153. )
  3154. # evaluate the passed data_columns, True == use all columns
  3155. # take only valid axis labels
  3156. if data_columns is True:
  3157. data_columns = list(axis_labels)
  3158. elif data_columns is None:
  3159. data_columns = []
  3160. # if min_itemsize is a dict, add the keys (exclude 'values')
  3161. if isinstance(min_itemsize, dict):
  3162. existing_data_columns = set(data_columns)
  3163. data_columns = list(data_columns) # ensure we do not modify
  3164. data_columns.extend(
  3165. [
  3166. k
  3167. for k in min_itemsize.keys()
  3168. if k != "values" and k not in existing_data_columns
  3169. ]
  3170. )
  3171. # return valid columns in the order of our axis
  3172. return [c for c in data_columns if c in axis_labels]
  3173. def _create_axes(
  3174. self,
  3175. axes,
  3176. obj: DataFrame,
  3177. validate: bool = True,
  3178. nan_rep=None,
  3179. data_columns=None,
  3180. min_itemsize=None,
  3181. ):
  3182. """
  3183. Create and return the axes.
  3184. Parameters
  3185. ----------
  3186. axes: list or None
  3187. The names or numbers of the axes to create.
  3188. obj : DataFrame
  3189. The object to create axes on.
  3190. validate: bool, default True
  3191. Whether to validate the obj against an existing object already written.
  3192. nan_rep :
  3193. A value to use for string column nan_rep.
  3194. data_columns : List[str], True, or None, default None
  3195. Specify the columns that we want to create to allow indexing on.
  3196. * True : Use all available columns.
  3197. * None : Use no columns.
  3198. * List[str] : Use the specified columns.
  3199. min_itemsize: Dict[str, int] or None, default None
  3200. The min itemsize for a column in bytes.
  3201. """
  3202. if not isinstance(obj, DataFrame):
  3203. group = self.group._v_name
  3204. raise TypeError(
  3205. f"cannot properly create the storer for: [group->{group},"
  3206. f"value->{type(obj)}]"
  3207. )
  3208. # set the default axes if needed
  3209. if axes is None:
  3210. axes = [0]
  3211. # map axes to numbers
  3212. axes = [obj._get_axis_number(a) for a in axes]
  3213. # do we have an existing table (if so, use its axes & data_columns)
  3214. if self.infer_axes():
  3215. table_exists = True
  3216. axes = [a.axis for a in self.index_axes]
  3217. data_columns = list(self.data_columns)
  3218. nan_rep = self.nan_rep
  3219. # TODO: do we always have validate=True here?
  3220. else:
  3221. table_exists = False
  3222. new_info = self.info
  3223. assert self.ndim == 2 # with next check, we must have len(axes) == 1
  3224. # currently only support indexing on ndim-1 axes
  3225. if len(axes) != self.ndim - 1:
  3226. raise ValueError(
  3227. "currently only support ndim-1 indexers in an AppendableTable"
  3228. )
  3229. # create according to the new data
  3230. new_non_index_axes: list = []
  3231. # nan_representation
  3232. if nan_rep is None:
  3233. nan_rep = "nan"
  3234. # We construct the non-index-axis first, since that alters new_info
  3235. idx = [x for x in [0, 1] if x not in axes][0]
  3236. a = obj.axes[idx]
  3237. # we might be able to change the axes on the appending data if necessary
  3238. append_axis = list(a)
  3239. if table_exists:
  3240. indexer = len(new_non_index_axes) # i.e. 0
  3241. exist_axis = self.non_index_axes[indexer][1]
  3242. if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
  3243. # ahah! -> reindex
  3244. if array_equivalent(
  3245. np.array(sorted(append_axis)), np.array(sorted(exist_axis))
  3246. ):
  3247. append_axis = exist_axis
  3248. # the non_index_axes info
  3249. info = new_info.setdefault(idx, {})
  3250. info["names"] = list(a.names)
  3251. info["type"] = type(a).__name__
  3252. new_non_index_axes.append((idx, append_axis))
  3253. # Now we can construct our new index axis
  3254. idx = axes[0]
  3255. a = obj.axes[idx]
  3256. axis_name = obj._get_axis_name(idx)
  3257. new_index = _convert_index(axis_name, a, self.encoding, self.errors)
  3258. new_index.axis = idx
  3259. # Because we are always 2D, there is only one new_index, so
  3260. # we know it will have pos=0
  3261. new_index.set_pos(0)
  3262. new_index.update_info(new_info)
  3263. new_index.maybe_set_size(min_itemsize) # check for column conflicts
  3264. new_index_axes = [new_index]
  3265. j = len(new_index_axes) # i.e. 1
  3266. assert j == 1
  3267. # reindex by our non_index_axes & compute data_columns
  3268. assert len(new_non_index_axes) == 1
  3269. for a in new_non_index_axes:
  3270. obj = _reindex_axis(obj, a[0], a[1])
  3271. transposed = new_index.axis == 1
  3272. # figure out data_columns and get out blocks
  3273. data_columns = self.validate_data_columns(
  3274. data_columns, min_itemsize, new_non_index_axes
  3275. )
  3276. frame = self.get_object(obj, transposed)._consolidate()
  3277. blocks, blk_items = self._get_blocks_and_items(
  3278. frame, table_exists, new_non_index_axes, self.values_axes, data_columns
  3279. )
  3280. # add my values
  3281. vaxes = []
  3282. for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
  3283. # the shape of a data column is given by the indexable axes
  3284. klass = DataCol
  3285. name = None
  3286. # we have a data_column
  3287. if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
  3288. klass = DataIndexableCol
  3289. name = b_items[0]
  3290. if not (name is None or isinstance(name, str)):
  3291. # TODO: should the message here be more specifically non-str?
  3292. raise ValueError("cannot have non-object label DataIndexableCol")
  3293. # make sure that we match up the existing columns
  3294. # if we have an existing table
  3295. existing_col: DataCol | None
  3296. if table_exists and validate:
  3297. try:
  3298. existing_col = self.values_axes[i]
  3299. except (IndexError, KeyError) as err:
  3300. raise ValueError(
  3301. f"Incompatible appended table [{blocks}]"
  3302. f"with existing table [{self.values_axes}]"
  3303. ) from err
  3304. else:
  3305. existing_col = None
  3306. new_name = name or f"values_block_{i}"
  3307. data_converted = _maybe_convert_for_string_atom(
  3308. new_name,
  3309. blk,
  3310. existing_col=existing_col,
  3311. min_itemsize=min_itemsize,
  3312. nan_rep=nan_rep,
  3313. encoding=self.encoding,
  3314. errors=self.errors,
  3315. columns=b_items,
  3316. )
  3317. adj_name = _maybe_adjust_name(new_name, self.version)
  3318. typ = klass._get_atom(data_converted)
  3319. kind = _dtype_to_kind(data_converted.dtype.name)
  3320. tz = None
  3321. if getattr(data_converted, "tz", None) is not None:
  3322. tz = _get_tz(data_converted.tz)
  3323. meta = metadata = ordered = None
  3324. if is_categorical_dtype(data_converted.dtype):
  3325. ordered = data_converted.ordered
  3326. meta = "category"
  3327. metadata = np.array(data_converted.categories, copy=False).ravel()
  3328. data, dtype_name = _get_data_and_dtype_name(data_converted)
  3329. col = klass(
  3330. name=adj_name,
  3331. cname=new_name,
  3332. values=list(b_items),
  3333. typ=typ,
  3334. pos=j,
  3335. kind=kind,
  3336. tz=tz,
  3337. ordered=ordered,
  3338. meta=meta,
  3339. metadata=metadata,
  3340. dtype=dtype_name,
  3341. data=data,
  3342. )
  3343. col.update_info(new_info)
  3344. vaxes.append(col)
  3345. j += 1
  3346. dcs = [col.name for col in vaxes if col.is_data_indexable]
  3347. new_table = type(self)(
  3348. parent=self.parent,
  3349. group=self.group,
  3350. encoding=self.encoding,
  3351. errors=self.errors,
  3352. index_axes=new_index_axes,
  3353. non_index_axes=new_non_index_axes,
  3354. values_axes=vaxes,
  3355. data_columns=dcs,
  3356. info=new_info,
  3357. nan_rep=nan_rep,
  3358. )
  3359. if hasattr(self, "levels"):
  3360. # TODO: get this into constructor, only for appropriate subclass
  3361. new_table.levels = self.levels
  3362. new_table.validate_min_itemsize(min_itemsize)
  3363. if validate and table_exists:
  3364. new_table.validate(self)
  3365. return new_table
  3366. @staticmethod
  3367. def _get_blocks_and_items(
  3368. frame: DataFrame,
  3369. table_exists: bool,
  3370. new_non_index_axes,
  3371. values_axes,
  3372. data_columns,
  3373. ):
  3374. # Helper to clarify non-state-altering parts of _create_axes
  3375. # TODO(ArrayManager) HDFStore relies on accessing the blocks
  3376. if isinstance(frame._mgr, ArrayManager):
  3377. frame = frame._as_manager("block")
  3378. def get_blk_items(mgr):
  3379. return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
  3380. mgr = frame._mgr
  3381. mgr = cast(BlockManager, mgr)
  3382. blocks: list[Block] = list(mgr.blocks)
  3383. blk_items: list[Index] = get_blk_items(mgr)
  3384. if len(data_columns):
  3385. axis, axis_labels = new_non_index_axes[0]
  3386. new_labels = Index(axis_labels).difference(Index(data_columns))
  3387. mgr = frame.reindex(new_labels, axis=axis)._mgr
  3388. # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no
  3389. # attribute "blocks"
  3390. blocks = list(mgr.blocks) # type: ignore[union-attr]
  3391. blk_items = get_blk_items(mgr)
  3392. for c in data_columns:
  3393. mgr = frame.reindex([c], axis=axis)._mgr
  3394. # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has
  3395. # no attribute "blocks"
  3396. blocks.extend(mgr.blocks) # type: ignore[union-attr]
  3397. blk_items.extend(get_blk_items(mgr))
  3398. # reorder the blocks in the same order as the existing table if we can
  3399. if table_exists:
  3400. by_items = {
  3401. tuple(b_items.tolist()): (b, b_items)
  3402. for b, b_items in zip(blocks, blk_items)
  3403. }
  3404. new_blocks: list[Block] = []
  3405. new_blk_items = []
  3406. for ea in values_axes:
  3407. items = tuple(ea.values)
  3408. try:
  3409. b, b_items = by_items.pop(items)
  3410. new_blocks.append(b)
  3411. new_blk_items.append(b_items)
  3412. except (IndexError, KeyError) as err:
  3413. jitems = ",".join([pprint_thing(item) for item in items])
  3414. raise ValueError(
  3415. f"cannot match existing table structure for [{jitems}] "
  3416. "on appending data"
  3417. ) from err
  3418. blocks = new_blocks
  3419. blk_items = new_blk_items
  3420. return blocks, blk_items
  3421. def process_axes(self, obj, selection: Selection, columns=None):
  3422. """process axes filters"""
  3423. # make a copy to avoid side effects
  3424. if columns is not None:
  3425. columns = list(columns)
  3426. # make sure to include levels if we have them
  3427. if columns is not None and self.is_multi_index:
  3428. assert isinstance(self.levels, list) # assured by is_multi_index
  3429. for n in self.levels:
  3430. if n not in columns:
  3431. columns.insert(0, n)
  3432. # reorder by any non_index_axes & limit to the select columns
  3433. for axis, labels in self.non_index_axes:
  3434. obj = _reindex_axis(obj, axis, labels, columns)
  3435. # apply the selection filters (but keep in the same order)
  3436. if selection.filter is not None:
  3437. for field, op, filt in selection.filter.format():
  3438. def process_filter(field, filt):
  3439. for axis_name in obj._AXIS_ORDERS:
  3440. axis_number = obj._get_axis_number(axis_name)
  3441. axis_values = obj._get_axis(axis_name)
  3442. assert axis_number is not None
  3443. # see if the field is the name of an axis
  3444. if field == axis_name:
  3445. # if we have a multi-index, then need to include
  3446. # the levels
  3447. if self.is_multi_index:
  3448. filt = filt.union(Index(self.levels))
  3449. takers = op(axis_values, filt)
  3450. return obj.loc(axis=axis_number)[takers]
  3451. # this might be the name of a field IN an axis
  3452. elif field in axis_values:
  3453. # we need to filter on this dimension
  3454. values = ensure_index(getattr(obj, field).values)
  3455. filt = ensure_index(filt)
  3456. # hack until we support reversed dim flags
  3457. if isinstance(obj, DataFrame):
  3458. axis_number = 1 - axis_number
  3459. takers = op(values, filt)
  3460. return obj.loc(axis=axis_number)[takers]
  3461. raise ValueError(f"cannot find the field [{field}] for filtering!")
  3462. obj = process_filter(field, filt)
  3463. return obj
  3464. def create_description(
  3465. self,
  3466. complib,
  3467. complevel: int | None,
  3468. fletcher32: bool,
  3469. expectedrows: int | None,
  3470. ) -> dict[str, Any]:
  3471. """create the description of the table from the axes & values"""
  3472. # use the provided expectedrows if it's passed
  3473. if expectedrows is None:
  3474. expectedrows = max(self.nrows_expected, 10000)
  3475. d = {"name": "table", "expectedrows": expectedrows}
  3476. # description from the axes & values
  3477. d["description"] = {a.cname: a.typ for a in self.axes}
  3478. if complib:
  3479. if complevel is None:
  3480. complevel = self._complevel or 9
  3481. filters = _tables().Filters(
  3482. complevel=complevel,
  3483. complib=complib,
  3484. fletcher32=fletcher32 or self._fletcher32,
  3485. )
  3486. d["filters"] = filters
  3487. elif self._filters is not None:
  3488. d["filters"] = self._filters
  3489. return d
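# Compression sketch (file names and settings are illustrative): complib and
# complevel normally reach create_description from the public writers; when a
# complib is given without a complevel, the code above falls back to the
# store-level complevel or 9.
#
#     import pandas as pd
#     df = pd.DataFrame({"A": range(1000)})
#     df.to_hdf("compressed.h5", "df", format="table", complib="blosc", complevel=5)
#     with pd.HDFStore("store.h5", complevel=9, complib="zlib") as store:
#         store.append("df", df)  # inherits the store-wide filters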
  3490. def read_coordinates(
  3491. self, where=None, start: int | None = None, stop: int | None = None
  3492. ):
  3493. """
  3494. select coordinates (row numbers) from a table; return the
  3495. coordinates object
  3496. """
  3497. # validate the version
  3498. self.validate_version(where)
  3499. # infer the data kind
  3500. if not self.infer_axes():
  3501. return False
  3502. # create the selection
  3503. selection = Selection(self, where=where, start=start, stop=stop)
  3504. coords = selection.select_coords()
  3505. if selection.filter is not None:
  3506. for field, op, filt in selection.filter.format():
  3507. data = self.read_column(
  3508. field, start=coords.min(), stop=coords.max() + 1
  3509. )
  3510. coords = coords[op(data.iloc[coords - coords.min()], filt).values]
  3511. return Index(coords)
  3512. def read_column(
  3513. self,
  3514. column: str,
  3515. where=None,
  3516. start: int | None = None,
  3517. stop: int | None = None,
  3518. ):
  3519. """
  3520. return a single column from the table, generally only indexables
  3521. are interesting
  3522. """
  3523. # validate the version
  3524. self.validate_version()
  3525. # infer the data kind
  3526. if not self.infer_axes():
  3527. return False
  3528. if where is not None:
  3529. raise TypeError("read_column does not currently accept a where clause")
  3530. # find the axes
  3531. for a in self.axes:
  3532. if column == a.name:
  3533. if not a.is_data_indexable:
  3534. raise ValueError(
  3535. f"column [{column}] can not be extracted individually; "
  3536. "it is not data indexable"
  3537. )
  3538. # column must be an indexable or a data column
  3539. c = getattr(self.table.cols, column)
  3540. a.set_info(self.info)
  3541. col_values = a.convert(
  3542. c[start:stop],
  3543. nan_rep=self.nan_rep,
  3544. encoding=self.encoding,
  3545. errors=self.errors,
  3546. )
  3547. return Series(_set_tz(col_values[1], a.tz), name=column)
  3548. raise KeyError(f"column [{column}] not found in the table")
  3549. class WORMTable(Table):
  3550. """
  3551. a write-once read-many table: this format DOES NOT ALLOW appending to a
  3552. table. Writing is a one-time operation; the data are stored in a format
  3553. that allows for searching the data on disk
  3554. """
  3555. table_type = "worm"
  3556. def read(
  3557. self,
  3558. where=None,
  3559. columns=None,
  3560. start: int | None = None,
  3561. stop: int | None = None,
  3562. ):
  3563. """
  3564. read the indices and the indexing array, calculate offset rows and return
  3565. """
  3566. raise NotImplementedError("WORMTable needs to implement read")
  3567. def write(self, **kwargs):
  3568. """
  3569. write in a format that we can search later on (but cannot append
  3570. to): write out the indices and the values using _write_array
  3571. (e.g. a CArray) create an indexing table so that we can search
  3572. """
  3573. raise NotImplementedError("WORMTable needs to implement write")
  3574. class AppendableTable(Table):
  3575. """support the new appendable table formats"""
  3576. table_type = "appendable"
  3577. def write(
  3578. self,
  3579. obj,
  3580. axes=None,
  3581. append=False,
  3582. complib=None,
  3583. complevel=None,
  3584. fletcher32=None,
  3585. min_itemsize=None,
  3586. chunksize=None,
  3587. expectedrows=None,
  3588. dropna=False,
  3589. nan_rep=None,
  3590. data_columns=None,
  3591. track_times=True,
  3592. ):
  3593. if not append and self.is_exists:
  3594. self._handle.remove_node(self.group, "table")
  3595. # create the axes
  3596. table = self._create_axes(
  3597. axes=axes,
  3598. obj=obj,
  3599. validate=append,
  3600. min_itemsize=min_itemsize,
  3601. nan_rep=nan_rep,
  3602. data_columns=data_columns,
  3603. )
  3604. for a in table.axes:
  3605. a.validate_names()
  3606. if not table.is_exists:
  3607. # create the table
  3608. options = table.create_description(
  3609. complib=complib,
  3610. complevel=complevel,
  3611. fletcher32=fletcher32,
  3612. expectedrows=expectedrows,
  3613. )
  3614. # set the table attributes
  3615. table.set_attrs()
  3616. options["track_times"] = track_times
  3617. # create the table
  3618. table._handle.create_table(table.group, **options)
  3619. # update my info
  3620. table.attrs.info = table.info
  3621. # validate the axes and set the kinds
  3622. for a in table.axes:
  3623. a.validate_and_set(table, append)
  3624. # add the rows
  3625. table.write_data(chunksize, dropna=dropna)
  3626. def write_data(self, chunksize: int | None, dropna: bool = False):
  3627. """
  3628. we form the data into a 2-d structure including indexes, values and mask, and write it chunk-by-chunk
  3629. """
  3630. names = self.dtype.names
  3631. nrows = self.nrows_expected
  3632. # if dropna==True, then drop ALL nan rows
  3633. masks = []
  3634. if dropna:
  3635. for a in self.values_axes:
  3636. # figure the mask: only do if we can successfully process this
  3637. # column, otherwise ignore the mask
  3638. mask = isna(a.data).all(axis=0)
  3639. if isinstance(mask, np.ndarray):
  3640. masks.append(mask.astype("u1", copy=False))
  3641. # consolidate masks
  3642. if len(masks):
  3643. mask = masks[0]
  3644. for m in masks[1:]:
  3645. mask = mask & m
  3646. mask = mask.ravel()
  3647. else:
  3648. mask = None
  3649. # broadcast the indexes if needed
  3650. indexes = [a.cvalues for a in self.index_axes]
  3651. nindexes = len(indexes)
  3652. assert nindexes == 1, nindexes # ensures we don't need to broadcast
  3653. # transpose the values so first dimension is last
  3654. # reshape the values if needed
  3655. values = [a.take_data() for a in self.values_axes]
  3656. values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
  3657. bvalues = []
  3658. for i, v in enumerate(values):
  3659. new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
  3660. bvalues.append(values[i].reshape(new_shape))
  3661. # write the chunks
  3662. if chunksize is None:
  3663. chunksize = 100000
  3664. rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
  3665. chunks = nrows // chunksize + 1
  3666. for i in range(chunks):
  3667. start_i = i * chunksize
  3668. end_i = min((i + 1) * chunksize, nrows)
  3669. if start_i >= end_i:
  3670. break
  3671. self.write_data_chunk(
  3672. rows,
  3673. indexes=[a[start_i:end_i] for a in indexes],
  3674. mask=mask[start_i:end_i] if mask is not None else None,
  3675. values=[v[start_i:end_i] for v in bvalues],
  3676. )
  3677. def write_data_chunk(
  3678. self,
  3679. rows: np.ndarray,
  3680. indexes: list[np.ndarray],
  3681. mask: np.ndarray | None,
  3682. values: list[np.ndarray],
  3683. ):
  3684. """
  3685. Parameters
  3686. ----------
  3687. rows : an empty memory space where we are putting the chunk
  3688. indexes : an array of the indexes
  3689. mask : an array of the masks
  3690. values : an array of the values
  3691. """
  3692. # 0 len
  3693. for v in values:
  3694. if not np.prod(v.shape):
  3695. return
  3696. nrows = indexes[0].shape[0]
  3697. if nrows != len(rows):
  3698. rows = np.empty(nrows, dtype=self.dtype)
  3699. names = self.dtype.names
  3700. nindexes = len(indexes)
  3701. # indexes
  3702. for i, idx in enumerate(indexes):
  3703. rows[names[i]] = idx
  3704. # values
  3705. for i, v in enumerate(values):
  3706. rows[names[i + nindexes]] = v
  3707. # mask
  3708. if mask is not None:
  3709. m = ~mask.ravel().astype(bool, copy=False)
  3710. if not m.all():
  3711. rows = rows[m]
  3712. if len(rows):
  3713. self.table.append(rows)
  3714. self.table.flush()
  3715. def delete(self, where=None, start: int | None = None, stop: int | None = None):
  3716. # delete all rows (and return the nrows)
  3717. if where is None or not len(where):
  3718. if start is None and stop is None:
  3719. nrows = self.nrows
  3720. self._handle.remove_node(self.group, recursive=True)
  3721. else:
  3722. # pytables<3.0 would remove a single row with stop=None
  3723. if stop is None:
  3724. stop = self.nrows
  3725. nrows = self.table.remove_rows(start=start, stop=stop)
  3726. self.table.flush()
  3727. return nrows
  3728. # infer the data kind
  3729. if not self.infer_axes():
  3730. return None
  3731. # create the selection
  3732. table = self.table
  3733. selection = Selection(self, where, start=start, stop=stop)
  3734. values = selection.select_coords()
  3735. # delete the rows in reverse order
  3736. sorted_series = Series(values).sort_values()
  3737. ln = len(sorted_series)
  3738. if ln:
  3739. # construct groups of consecutive rows
  3740. diff = sorted_series.diff()
  3741. groups = list(diff[diff > 1].index)
  3742. # 1 group
  3743. if not len(groups):
  3744. groups = [0]
  3745. # final element
  3746. if groups[-1] != ln:
  3747. groups.append(ln)
  3748. # initial element
  3749. if groups[0] != 0:
  3750. groups.insert(0, 0)
  3751. # we must remove in reverse order!
  3752. pg = groups.pop()
  3753. for g in reversed(groups):
  3754. rows = sorted_series.take(range(g, pg))
  3755. table.remove_rows(
  3756. start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
  3757. )
  3758. pg = g
  3759. self.table.flush()
  3760. # return the number of rows removed
  3761. return ln
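# Usage sketch via the public API (key and condition are assumptions): the
# delete() path above backs HDFStore.remove when a where clause is supplied;
# remove with no where drops the node outright.
#
#     import pandas as pd
#     df = pd.DataFrame({"A": range(10)})
#     with pd.HDFStore("data.h5") as store:
#         store.append("df", df, data_columns=["A"])
#         n_removed = store.remove("df", where="A > 6")  # row-wise delete
#         store.remove("df")                             # drop the whole node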
  3762. class AppendableFrameTable(AppendableTable):
  3763. """support the new appendable table formats"""
  3764. pandas_kind = "frame_table"
  3765. table_type = "appendable_frame"
  3766. ndim = 2
  3767. obj_type: type[DataFrame | Series] = DataFrame
  3768. @property
  3769. def is_transposed(self) -> bool:
  3770. return self.index_axes[0].axis == 1
  3771. @classmethod
  3772. def get_object(cls, obj, transposed: bool):
  3773. """these are written transposed"""
  3774. if transposed:
  3775. obj = obj.T
  3776. return obj
  3777. def read(
  3778. self,
  3779. where=None,
  3780. columns=None,
  3781. start: int | None = None,
  3782. stop: int | None = None,
  3783. ):
  3784. # validate the version
  3785. self.validate_version(where)
  3786. # infer the data kind
  3787. if not self.infer_axes():
  3788. return None
  3789. result = self._read_axes(where=where, start=start, stop=stop)
  3790. info = (
  3791. self.info.get(self.non_index_axes[0][0], {})
  3792. if len(self.non_index_axes)
  3793. else {}
  3794. )
  3795. inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
  3796. assert len(inds) == 1
  3797. ind = inds[0]
  3798. index = result[ind][0]
  3799. frames = []
  3800. for i, a in enumerate(self.axes):
  3801. if a not in self.values_axes:
  3802. continue
  3803. index_vals, cvalues = result[i]
  3804. # we could have a multi-index constructor here
  3805. # ensure_index doesn't recognize our list-of-tuples here
  3806. if info.get("type") != "MultiIndex":
  3807. cols = Index(index_vals)
  3808. else:
  3809. cols = MultiIndex.from_tuples(index_vals)
  3810. names = info.get("names")
  3811. if names is not None:
  3812. cols.set_names(names, inplace=True)
  3813. if self.is_transposed:
  3814. values = cvalues
  3815. index_ = cols
  3816. cols_ = Index(index, name=getattr(index, "name", None))
  3817. else:
  3818. values = cvalues.T
  3819. index_ = Index(index, name=getattr(index, "name", None))
  3820. cols_ = cols
  3821. # if we have a DataIndexableCol, its shape will only be 1 dim
  3822. if values.ndim == 1 and isinstance(values, np.ndarray):
  3823. values = values.reshape((1, values.shape[0]))
  3824. if isinstance(values, np.ndarray):
  3825. df = DataFrame(values.T, columns=cols_, index=index_)
  3826. elif isinstance(values, Index):
  3827. df = DataFrame(values, columns=cols_, index=index_)
  3828. else:
  3829. # Categorical
  3830. df = DataFrame._from_arrays([values], columns=cols_, index=index_)
  3831. assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
  3832. frames.append(df)
  3833. if len(frames) == 1:
  3834. df = frames[0]
  3835. else:
  3836. df = concat(frames, axis=1)
  3837. selection = Selection(self, where=where, start=start, stop=stop)
  3838. # apply the selection filters & axis orderings
  3839. df = self.process_axes(df, selection=selection, columns=columns)
  3840. return df
  3841. class AppendableSeriesTable(AppendableFrameTable):
  3842. """support the new appendable table formats"""
  3843. pandas_kind = "series_table"
  3844. table_type = "appendable_series"
  3845. ndim = 2
  3846. obj_type = Series
  3847. @property
  3848. def is_transposed(self) -> bool:
  3849. return False
  3850. @classmethod
  3851. def get_object(cls, obj, transposed: bool):
  3852. return obj
  3853. def write(self, obj, data_columns=None, **kwargs):
  3854. """we are going to write this as a frame table"""
  3855. if not isinstance(obj, DataFrame):
  3856. name = obj.name or "values"
  3857. obj = obj.to_frame(name)
  3858. return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
  3859. def read(
  3860. self,
  3861. where=None,
  3862. columns=None,
  3863. start: int | None = None,
  3864. stop: int | None = None,
  3865. ) -> Series:
  3866. is_multi_index = self.is_multi_index
  3867. if columns is not None and is_multi_index:
  3868. assert isinstance(self.levels, list) # needed for mypy
  3869. for n in self.levels:
  3870. if n not in columns:
  3871. columns.insert(0, n)
  3872. s = super().read(where=where, columns=columns, start=start, stop=stop)
  3873. if is_multi_index:
  3874. s.set_index(self.levels, inplace=True)
  3875. s = s.iloc[:, 0]
  3876. # remove the default name
  3877. if s.name == "values":
  3878. s.name = None
  3879. return s
  3880. class AppendableMultiSeriesTable(AppendableSeriesTable):
  3881. """support the new appendable table formats"""
  3882. pandas_kind = "series_table"
  3883. table_type = "appendable_multiseries"
  3884. def write(self, obj, **kwargs):
  3885. """we are going to write this as a frame table"""
  3886. name = obj.name or "values"
  3887. newobj, self.levels = self.validate_multiindex(obj)
  3888. assert isinstance(self.levels, list) # for mypy
  3889. cols = list(self.levels)
  3890. cols.append(name)
  3891. newobj.columns = Index(cols)
  3892. return super().write(obj=newobj, **kwargs)
  3893. class GenericTable(AppendableFrameTable):
  3894. """a table that read/writes the generic pytables table format"""
  3895. pandas_kind = "frame_table"
  3896. table_type = "generic_table"
  3897. ndim = 2
  3898. obj_type = DataFrame
  3899. levels: list[Hashable]
  3900. @property
  3901. def pandas_type(self) -> str:
  3902. return self.pandas_kind
  3903. @property
  3904. def storable(self):
  3905. return getattr(self.group, "table", None) or self.group
  3906. def get_attrs(self):
  3907. """retrieve our attributes"""
  3908. self.non_index_axes = []
  3909. self.nan_rep = None
  3910. self.levels = []
  3911. self.index_axes = [a for a in self.indexables if a.is_an_indexable]
  3912. self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
  3913. self.data_columns = [a.name for a in self.values_axes]
  3914. @cache_readonly
  3915. def indexables(self):
  3916. """create the indexables from the table description"""
  3917. d = self.description
  3918. # TODO: can we get a typ for this? AFAICT it is the only place
  3919. # where we aren't passing one
  3920. # the index columns is just a simple index
  3921. md = self.read_metadata("index")
  3922. meta = "category" if md is not None else None
  3923. index_col = GenericIndexCol(
  3924. name="index", axis=0, table=self.table, meta=meta, metadata=md
  3925. )
  3926. _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
  3927. for i, n in enumerate(d._v_names):
  3928. assert isinstance(n, str)
  3929. atom = getattr(d, n)
  3930. md = self.read_metadata(n)
  3931. meta = "category" if md is not None else None
  3932. dc = GenericDataIndexableCol(
  3933. name=n,
  3934. pos=i,
  3935. values=[n],
  3936. typ=atom,
  3937. table=self.table,
  3938. meta=meta,
  3939. metadata=md,
  3940. )
  3941. _indexables.append(dc)
  3942. return _indexables
  3943. def write(self, **kwargs):
  3944. raise NotImplementedError("cannot write on a generic table")
  3945. class AppendableMultiFrameTable(AppendableFrameTable):
  3946. """a frame with a multi-index"""
  3947. table_type = "appendable_multiframe"
  3948. obj_type = DataFrame
  3949. ndim = 2
  3950. _re_levels = re.compile(r"^level_\d+$")
  3951. @property
  3952. def table_type_short(self) -> str:
  3953. return "appendable_multi"
  3954. def write(self, obj, data_columns=None, **kwargs):
  3955. if data_columns is None:
  3956. data_columns = []
  3957. elif data_columns is True:
  3958. data_columns = obj.columns.tolist()
  3959. obj, self.levels = self.validate_multiindex(obj)
  3960. assert isinstance(self.levels, list) # for mypy
  3961. for n in self.levels:
  3962. if n not in data_columns:
  3963. data_columns.insert(0, n)
  3964. return super().write(obj=obj, data_columns=data_columns, **kwargs)
  3965. def read(
  3966. self,
  3967. where=None,
  3968. columns=None,
  3969. start: int | None = None,
  3970. stop: int | None = None,
  3971. ):
  3972. df = super().read(where=where, columns=columns, start=start, stop=stop)
  3973. df = df.set_index(self.levels)
  3974. # remove names for 'level_%d'
  3975. df.index = df.index.set_names(
  3976. [None if self._re_levels.search(name) else name for name in df.index.names]
  3977. )
  3978. return df
  3979. def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame:
  3980. ax = obj._get_axis(axis)
  3981. labels = ensure_index(labels)
  3982. # try not to reindex even if other is provided
  3983. # if it equals our current index
  3984. if other is not None:
  3985. other = ensure_index(other)
  3986. if (other is None or labels.equals(other)) and labels.equals(ax):
  3987. return obj
  3988. labels = ensure_index(labels.unique())
  3989. if other is not None:
  3990. labels = ensure_index(other.unique()).intersection(labels, sort=False)
  3991. if not labels.equals(ax):
  3992. slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
  3993. slicer[axis] = labels
  3994. obj = obj.loc[tuple(slicer)]
  3995. return obj
  3996. # tz to/from coercion
  3997. def _get_tz(tz: tzinfo) -> str | tzinfo:
  3998. """for a tz-aware type, return an encoded zone"""
  3999. zone = timezones.get_timezone(tz)
  4000. return zone
  4001. def _set_tz(
  4002. values: np.ndarray | Index,
  4003. tz: str | tzinfo | None,
  4004. coerce: bool = False,
  4005. ) -> np.ndarray | DatetimeIndex:
  4006. """
  4007. coerce the values to a DatetimeIndex if tz is set
  4008. preserve the input shape if possible
  4009. Parameters
  4010. ----------
  4011. values : ndarray or Index
  4012. tz : str or tzinfo
  4013. coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
  4014. """
  4015. if isinstance(values, DatetimeIndex):
  4016. # If values is tzaware, the tz gets dropped in the values.ravel()
  4017. # call below (which returns an ndarray). So we are only non-lossy
  4018. # if `tz` matches `values.tz`.
  4019. assert values.tz is None or values.tz == tz
  4020. if tz is not None:
  4021. if isinstance(values, DatetimeIndex):
  4022. name = values.name
  4023. values = values.asi8
  4024. else:
  4025. name = None
  4026. values = values.ravel()
  4027. tz = _ensure_decoded(tz)
  4028. values = DatetimeIndex(values, name=name)
  4029. values = values.tz_localize("UTC").tz_convert(tz)
  4030. elif coerce:
  4031. values = np.asarray(values, dtype="M8[ns]")
  4032. # error: Incompatible return value type (got "Union[ndarray, Index]",
  4033. # expected "Union[ndarray, DatetimeIndex]")
  4034. return values # type: ignore[return-value]
  4035. def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
  4036. assert isinstance(name, str)
  4037. index_name = index.name
  4038. # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
  4039. # expected "Union[ExtensionArray, ndarray]"
  4040. converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
  4041. kind = _dtype_to_kind(dtype_name)
  4042. atom = DataIndexableCol._get_atom(converted)
  4043. if isinstance(index, Int64Index) or needs_i8_conversion(index.dtype):
  4044. # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
  4045. # in which case "kind" is "integer", "integer", "datetime64",
  4046. # "timedelta64", and "integer", respectively.
  4047. return IndexCol(
  4048. name,
  4049. values=converted,
  4050. kind=kind,
  4051. typ=atom,
  4052. freq=getattr(index, "freq", None),
  4053. tz=getattr(index, "tz", None),
  4054. index_name=index_name,
  4055. )
  4056. if isinstance(index, MultiIndex):
  4057. raise TypeError("MultiIndex not supported here!")
  4058. inferred_type = lib.infer_dtype(index, skipna=False)
  4059. # we won't get inferred_type of "datetime64" or "timedelta64" as these
  4060. # would go through the DatetimeIndex/TimedeltaIndex paths above
  4061. values = np.asarray(index)
  4062. if inferred_type == "date":
  4063. converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
  4064. return IndexCol(
  4065. name, converted, "date", _tables().Time32Col(), index_name=index_name
  4066. )
  4067. elif inferred_type == "string":
  4068. converted = _convert_string_array(values, encoding, errors)
  4069. itemsize = converted.dtype.itemsize
  4070. return IndexCol(
  4071. name,
  4072. converted,
  4073. "string",
  4074. _tables().StringCol(itemsize),
  4075. index_name=index_name,
  4076. )
  4077. elif inferred_type in ["integer", "floating"]:
  4078. return IndexCol(
  4079. name, values=converted, kind=kind, typ=atom, index_name=index_name
  4080. )
  4081. else:
  4082. assert isinstance(converted, np.ndarray) and converted.dtype == object
  4083. assert kind == "object", kind
  4084. atom = _tables().ObjectAtom()
  4085. return IndexCol(name, converted, kind, atom, index_name=index_name)
  4086. def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
  4087. index: Index | np.ndarray
  4088. if kind == "datetime64":
  4089. index = DatetimeIndex(data)
  4090. elif kind == "timedelta64":
  4091. index = TimedeltaIndex(data)
  4092. elif kind == "date":
  4093. try:
  4094. index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
  4095. except (ValueError):
  4096. index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
  4097. elif kind in ("integer", "float"):
  4098. index = np.asarray(data)
  4099. elif kind in ("string"):
  4100. index = _unconvert_string_array(
  4101. data, nan_rep=None, encoding=encoding, errors=errors
  4102. )
  4103. elif kind == "object":
  4104. index = np.asarray(data[0])
  4105. else: # pragma: no cover
  4106. raise ValueError(f"unrecognized index type {kind}")
  4107. return index
  4108. def _maybe_convert_for_string_atom(
  4109. name: str,
  4110. block: Block,
  4111. existing_col,
  4112. min_itemsize,
  4113. nan_rep,
  4114. encoding,
  4115. errors,
  4116. columns: list[str],
  4117. ):
  4118. bvalues = block.values
  4119. if bvalues.dtype != object:
  4120. return bvalues
  4121. dtype_name = bvalues.dtype.name
  4122. inferred_type = lib.infer_dtype(bvalues, skipna=False)
  4123. if inferred_type == "date":
  4124. raise TypeError("[date] is not implemented as a table column")
  4125. elif inferred_type == "datetime":
  4126. # after GH#8260
  4127. # this only would be hit for a multi-timezone dtype which is an error
  4128. raise TypeError(
  4129. "too many timezones in this block, create separate data columns"
  4130. )
  4131. elif not (inferred_type == "string" or dtype_name == "object"):
  4132. return bvalues
  4133. blocks: list[Block] = block.fillna(nan_rep, downcast=False)
  4134. # Note: because block is always object dtype, fillna goes
  4135. # through a path such that the result is always a 1-element list
  4136. assert len(blocks) == 1
  4137. block = blocks[0]
  4138. data = block.values
  4139. # see if we have a valid string type
  4140. inferred_type = lib.infer_dtype(data, skipna=False)
  4141. if inferred_type != "string":
  4142. # we cannot serialize this data, so report an exception on a column
  4143. # by column basis
  4144. # expected behaviour:
  4145. # search block for a non-string object column by column
  4146. for i in range(data.shape[0]):
  4147. col = block.iget(i)
  4148. inferred_type = lib.infer_dtype(col, skipna=False)
  4149. if inferred_type != "string":
  4150. error_column_label = columns[i] if len(columns) > i else f"No.{i}"
  4151. raise TypeError(
  4152. f"Cannot serialize the column [{error_column_label}]\n"
  4153. f"because its data contents are not [string] but "
  4154. f"[{inferred_type}] object dtype"
  4155. )
  4156. # itemsize is the maximum length of a string (along any dimension)
  4157. # error: Argument 1 to "_convert_string_array" has incompatible type "Union[ndarray,
  4158. # ExtensionArray]"; expected "ndarray"
  4159. data_converted = _convert_string_array(
  4160. data, encoding, errors # type: ignore[arg-type]
  4161. ).reshape(data.shape)
  4162. itemsize = data_converted.itemsize
  4163. # specified min_itemsize?
  4164. if isinstance(min_itemsize, dict):
  4165. min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
  4166. itemsize = max(min_itemsize or 0, itemsize)
  4167. # check for column in the values conflicts
  4168. if existing_col is not None:
  4169. eci = existing_col.validate_col(itemsize)
  4170. if eci is not None and eci > itemsize:
  4171. itemsize = eci
  4172. data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
  4173. return data_converted
  4174. def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
  4175. """
  4176. Take a string-like that is object dtype and coerce to a fixed size string type.
  4177. Parameters
  4178. ----------
  4179. data : np.ndarray[object]
  4180. encoding : str
  4181. errors : str
  4182. Handler for encoding errors.
  4183. Returns
  4184. -------
  4185. np.ndarray[fixed-length-string]
  4186. """
  4187. # encode if needed
  4188. if len(data):
  4189. data = (
  4190. Series(data.ravel())
  4191. .str.encode(encoding, errors)
  4192. ._values.reshape(data.shape)
  4193. )
  4194. # create the sized dtype
  4195. ensured = ensure_object(data.ravel())
  4196. itemsize = max(1, libwriters.max_len_string_array(ensured))
  4197. data = np.asarray(data, dtype=f"S{itemsize}")
  4198. return data
  4199. def _unconvert_string_array(
  4200. data: np.ndarray, nan_rep, encoding: str, errors: str
  4201. ) -> np.ndarray:
  4202. """
  4203. Inverse of _convert_string_array.
  4204. Parameters
  4205. ----------
  4206. data : np.ndarray[fixed-length-string]
  4207. nan_rep : the storage repr of NaN
  4208. encoding : str
  4209. errors : str
  4210. Handler for encoding errors.
  4211. Returns
  4212. -------
  4213. np.ndarray[object]
  4214. Decoded data.
  4215. """
  4216. shape = data.shape
  4217. data = np.asarray(data.ravel(), dtype=object)
  4218. if len(data):
  4219. itemsize = libwriters.max_len_string_array(ensure_object(data))
  4220. dtype = f"U{itemsize}"
  4221. if isinstance(data[0], bytes):
  4222. data = Series(data).str.decode(encoding, errors=errors)._values
  4223. else:
  4224. data = data.astype(dtype, copy=False).astype(object, copy=False)
  4225. if nan_rep is None:
  4226. nan_rep = "nan"
  4227. libwriters.string_array_replace_from_nan_rep(data, nan_rep)
  4228. return data.reshape(shape)
  4229. def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
  4230. assert isinstance(val_kind, str), type(val_kind)
  4231. if _need_convert(val_kind):
  4232. conv = _get_converter(val_kind, encoding, errors)
  4233. values = conv(values)
  4234. return values
  4235. def _get_converter(kind: str, encoding: str, errors: str):
  4236. if kind == "datetime64":
  4237. return lambda x: np.asarray(x, dtype="M8[ns]")
  4238. elif kind == "string":
  4239. return lambda x: _unconvert_string_array(
  4240. x, nan_rep=None, encoding=encoding, errors=errors
  4241. )
  4242. else: # pragma: no cover
  4243. raise ValueError(f"invalid kind {kind}")
  4244. def _need_convert(kind: str) -> bool:
  4245. if kind in ("datetime64", "string"):
  4246. return True
  4247. return False
  4248. def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
  4249. """
  4250. Prior to 0.10.1, values blocks were named like values_0 rather than
  4251. values_block_0; adjust the given name if necessary.
  4252. Parameters
  4253. ----------
  4254. name : str
  4255. version : Tuple[int, int, int]
  4256. Returns
  4257. -------
  4258. str
  4259. """
  4260. if isinstance(version, str) or len(version) < 3:
  4261. raise ValueError("Version is incorrect, expected sequence of 3 integers.")
  4262. if version[0] == 0 and version[1] <= 10 and version[2] == 0:
  4263. m = re.search(r"values_block_(\d+)", name)
  4264. if m:
  4265. grp = m.groups()[0]
  4266. name = f"values_{grp}"
  4267. return name
  4268. def _dtype_to_kind(dtype_str: str) -> str:
  4269. """
  4270. Find the "kind" string describing the given dtype name.
  4271. """
  4272. dtype_str = _ensure_decoded(dtype_str)
  4273. if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
  4274. kind = "string"
  4275. elif dtype_str.startswith("float"):
  4276. kind = "float"
  4277. elif dtype_str.startswith("complex"):
  4278. kind = "complex"
  4279. elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
  4280. kind = "integer"
  4281. elif dtype_str.startswith("datetime64"):
  4282. kind = "datetime64"
  4283. elif dtype_str.startswith("timedelta"):
  4284. kind = "timedelta64"
  4285. elif dtype_str.startswith("bool"):
  4286. kind = "bool"
  4287. elif dtype_str.startswith("category"):
  4288. kind = "category"
  4289. elif dtype_str.startswith("period"):
  4290. # We store the `freq` attr so we can restore from integers
  4291. kind = "integer"
  4292. elif dtype_str == "object":
  4293. kind = "object"
  4294. else:
  4295. raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
  4296. return kind
  4297. def _get_data_and_dtype_name(data: ArrayLike):
  4298. """
  4299. Convert the passed data into a storable form and a dtype string.
  4300. """
  4301. if isinstance(data, Categorical):
  4302. data = data.codes
  4303. # For datetime64tz we need to drop the TZ in tests TODO: why?
  4304. dtype_name = data.dtype.name.split("[")[0]
  4305. if data.dtype.kind in ["m", "M"]:
  4306. data = np.asarray(data.view("i8"))
  4307. # TODO: we used to reshape for the dt64tz case, but no longer
  4308. # doing that doesn't seem to break anything. why?
  4309. elif isinstance(data, PeriodIndex):
  4310. data = data.asi8
  4311. data = np.asarray(data)
  4312. return data, dtype_name
  4313. class Selection:
  4314. """
  4315. Carries out a selection operation on a tables.Table object.
  4316. Parameters
  4317. ----------
  4318. table : a Table object
  4319. where : list of Terms (or convertible to)
  4320. start, stop: indices to start and/or stop selection
  4321. """
  4322. def __init__(
  4323. self,
  4324. table: Table,
  4325. where=None,
  4326. start: int | None = None,
  4327. stop: int | None = None,
  4328. ):
  4329. self.table = table
  4330. self.where = where
  4331. self.start = start
  4332. self.stop = stop
  4333. self.condition = None
  4334. self.filter = None
  4335. self.terms = None
  4336. self.coordinates = None
  4337. if is_list_like(where):
  4338. # see if we have a passed coordinate like
  4339. with suppress(ValueError):
  4340. inferred = lib.infer_dtype(where, skipna=False)
  4341. if inferred == "integer" or inferred == "boolean":
  4342. where = np.asarray(where)
  4343. if where.dtype == np.bool_:
  4344. start, stop = self.start, self.stop
  4345. if start is None:
  4346. start = 0
  4347. if stop is None:
  4348. stop = self.table.nrows
  4349. self.coordinates = np.arange(start, stop)[where]
  4350. elif issubclass(where.dtype.type, np.integer):
  4351. if (self.start is not None and (where < self.start).any()) or (
  4352. self.stop is not None and (where >= self.stop).any()
  4353. ):
  4354. raise ValueError(
  4355. "where must have index locations >= start and < stop"
  4356. )
  4357. self.coordinates = where
  4358. if self.coordinates is None:
  4359. self.terms = self.generate(where)
  4360. # create the numexpr & the filter
  4361. if self.terms is not None:
  4362. self.condition, self.filter = self.terms.evaluate()
  4363. def generate(self, where):
  4364. """where can be a : dict,list,tuple,string"""
  4365. if where is None:
  4366. return None
  4367. q = self.table.queryables()
  4368. try:
  4369. return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
  4370. except NameError as err:
  4371. # raise a nice message, suggesting that the user should use
  4372. # data_columns
  4373. qkeys = ",".join(q.keys())
  4374. msg = dedent(
  4375. f"""\
  4376. The passed where expression: {where}
  4377. contains an invalid variable reference
  4378. all of the variable references must be a reference to
  4379. an axis (e.g. 'index' or 'columns'), or a data_column
  4380. The currently defined references are: {qkeys}
  4381. """
  4382. )
  4383. raise ValueError(msg) from err
  4384. def select(self):
  4385. """
  4386. generate the selection
  4387. """
  4388. if self.condition is not None:
  4389. return self.table.table.read_where(
  4390. self.condition.format(), start=self.start, stop=self.stop
  4391. )
  4392. elif self.coordinates is not None:
  4393. return self.table.table.read_coordinates(self.coordinates)
  4394. return self.table.table.read(start=self.start, stop=self.stop)
  4395. def select_coords(self):
  4396. """
  4397. generate the selection
  4398. """
  4399. start, stop = self.start, self.stop
  4400. nrows = self.table.nrows
  4401. if start is None:
  4402. start = 0
  4403. elif start < 0:
  4404. start += nrows
  4405. if stop is None:
  4406. stop = nrows
  4407. elif stop < 0:
  4408. stop += nrows
  4409. if self.condition is not None:
  4410. return self.table.table.get_where_list(
  4411. self.condition.format(), start=start, stop=stop, sort=True
  4412. )
  4413. elif self.coordinates is not None:
  4414. return self.coordinates
  4415. return np.arange(start, stop)