/pandas/io/tests/test_pytables.py
Python | 5436 lines | 5393 code | 30 blank | 13 comment | 23 complexity | 069156a1606495477320752b993ba0f0 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- import nose
- import sys
- import os
- import warnings
- import tempfile
- from contextlib import contextmanager
- import datetime
- import numpy as np
- import pandas
- import pandas as pd
- from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index,
- RangeIndex, Categorical, bdate_range,
- date_range, timedelta_range, Index, DatetimeIndex,
- isnull)
- from pandas.compat import is_platform_windows, PY3, PY35
- from pandas.formats.printing import pprint_thing
- from pandas.io.pytables import _tables, TableIterator
- try:
- _tables()
- except ImportError as e:
- raise nose.SkipTest(e)
- from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf,
- IncompatibilityWarning, PerformanceWarning,
- AttributeConflictWarning, DuplicateWarning,
- PossibleDataLossError, ClosedFileError)
- from pandas.io import pytables as pytables
- import pandas.util.testing as tm
- from pandas.util.testing import (assert_panel4d_equal,
- assert_panel_equal,
- assert_frame_equal,
- assert_series_equal,
- assert_produces_warning,
- set_timezone)
- from pandas import concat, Timestamp
- from pandas import compat
- from pandas.compat import range, lrange, u
- try:
- import tables
- except ImportError:
- raise nose.SkipTest('no pytables')
- from distutils.version import LooseVersion
- _default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2'
- else 'zlib')
- _multiprocess_can_split_ = False
- # testing on windows/py3 seems to fault
- # for using compression
- skip_compression = PY3 and is_platform_windows()
- # contextmanager to ensure the file cleanup
- def safe_remove(path):
- if path is not None:
- try:
- os.remove(path)
- except:
- pass
- def safe_close(store):
- try:
- if store is not None:
- store.close()
- except:
- pass
- def create_tempfile(path):
- """ create an unopened named temporary file """
- return os.path.join(tempfile.gettempdir(), path)
- @contextmanager
- def ensure_clean_store(path, mode='a', complevel=None, complib=None,
- fletcher32=False):
- try:
- # put in the temporary path if we don't have one already
- if not len(os.path.dirname(path)):
- path = create_tempfile(path)
- store = HDFStore(path, mode=mode, complevel=complevel,
- complib=complib, fletcher32=False)
- yield store
- finally:
- safe_close(store)
- if mode == 'w' or mode == 'a':
- safe_remove(path)
- @contextmanager
- def ensure_clean_path(path):
- """
- return essentially a named temporary file that is not opened
- and deleted on existing; if path is a list, then create and
- return list of filenames
- """
- try:
- if isinstance(path, list):
- filenames = [create_tempfile(p) for p in path]
- yield filenames
- else:
- filenames = [create_tempfile(path)]
- yield filenames[0]
- finally:
- for f in filenames:
- safe_remove(f)
- # set these parameters so we don't have file sharing
- tables.parameters.MAX_NUMEXPR_THREADS = 1
- tables.parameters.MAX_BLOSC_THREADS = 1
- tables.parameters.MAX_THREADS = 1
- def _maybe_remove(store, key):
- """For tests using tables, try removing the table to be sure there is
- no content from previous tests using the same table name."""
- try:
- store.remove(key)
- except:
- pass
- @contextmanager
- def compat_assert_produces_warning(w):
- """ don't produce a warning under PY3 """
- if compat.PY3:
- yield
- else:
- with tm.assert_produces_warning(expected_warning=w,
- check_stacklevel=False):
- yield
- class Base(tm.TestCase):
- @classmethod
- def setUpClass(cls):
- super(Base, cls).setUpClass()
- # Pytables 3.0.0 deprecates lots of things
- tm.reset_testing_mode()
- @classmethod
- def tearDownClass(cls):
- super(Base, cls).tearDownClass()
- # Pytables 3.0.0 deprecates lots of things
- tm.set_testing_mode()
- def setUp(self):
- warnings.filterwarnings(action='ignore', category=FutureWarning)
- self.path = 'tmp.__%s__.h5' % tm.rands(10)
- def tearDown(self):
- pass
- class TestHDFStore(Base, tm.TestCase):
- def test_factory_fun(self):
- path = create_tempfile(self.path)
- try:
- with get_store(path) as tbl:
- raise ValueError('blah')
- except ValueError:
- pass
- finally:
- safe_remove(path)
- try:
- with get_store(path) as tbl:
- tbl['a'] = tm.makeDataFrame()
- with get_store(path) as tbl:
- self.assertEqual(len(tbl), 1)
- self.assertEqual(type(tbl['a']), DataFrame)
- finally:
- safe_remove(self.path)
- def test_context(self):
- path = create_tempfile(self.path)
- try:
- with HDFStore(path) as tbl:
- raise ValueError('blah')
- except ValueError:
- pass
- finally:
- safe_remove(path)
- try:
- with HDFStore(path) as tbl:
- tbl['a'] = tm.makeDataFrame()
- with HDFStore(path) as tbl:
- self.assertEqual(len(tbl), 1)
- self.assertEqual(type(tbl['a']), DataFrame)
- finally:
- safe_remove(path)
- def test_conv_read_write(self):
- path = create_tempfile(self.path)
- try:
- def roundtrip(key, obj, **kwargs):
- obj.to_hdf(path, key, **kwargs)
- return read_hdf(path, key)
- o = tm.makeTimeSeries()
- assert_series_equal(o, roundtrip('series', o))
- o = tm.makeStringSeries()
- assert_series_equal(o, roundtrip('string_series', o))
- o = tm.makeDataFrame()
- assert_frame_equal(o, roundtrip('frame', o))
- o = tm.makePanel()
- assert_panel_equal(o, roundtrip('panel', o))
- # table
- df = DataFrame(dict(A=lrange(5), B=lrange(5)))
- df.to_hdf(path, 'table', append=True)
- result = read_hdf(path, 'table', where=['index>2'])
- assert_frame_equal(df[df.index > 2], result)
- finally:
- safe_remove(path)
- def test_long_strings(self):
- # GH6166
- # unconversion of long strings was being chopped in earlier
- # versions of numpy < 1.7.2
- df = DataFrame({'a': tm.rands_array(100, size=10)},
- index=tm.rands_array(100, size=10))
- with ensure_clean_store(self.path) as store:
- store.append('df', df, data_columns=['a'])
- result = store.select('df')
- assert_frame_equal(df, result)
- def test_api(self):
- # GH4584
- # API issue when to_hdf doesn't acdept append AND format args
- with ensure_clean_path(self.path) as path:
- df = tm.makeDataFrame()
- df.iloc[:10].to_hdf(path, 'df', append=True, format='table')
- df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
- assert_frame_equal(read_hdf(path, 'df'), df)
- # append to False
- df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
- df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
- assert_frame_equal(read_hdf(path, 'df'), df)
- with ensure_clean_path(self.path) as path:
- df = tm.makeDataFrame()
- df.iloc[:10].to_hdf(path, 'df', append=True)
- df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
- assert_frame_equal(read_hdf(path, 'df'), df)
- # append to False
- df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
- df.iloc[10:].to_hdf(path, 'df', append=True)
- assert_frame_equal(read_hdf(path, 'df'), df)
- with ensure_clean_path(self.path) as path:
- df = tm.makeDataFrame()
- df.to_hdf(path, 'df', append=False, format='fixed')
- assert_frame_equal(read_hdf(path, 'df'), df)
- df.to_hdf(path, 'df', append=False, format='f')
- assert_frame_equal(read_hdf(path, 'df'), df)
- df.to_hdf(path, 'df', append=False)
- assert_frame_equal(read_hdf(path, 'df'), df)
- df.to_hdf(path, 'df')
- assert_frame_equal(read_hdf(path, 'df'), df)
- with ensure_clean_store(self.path) as store:
- path = store._path
- df = tm.makeDataFrame()
- _maybe_remove(store, 'df')
- store.append('df', df.iloc[:10], append=True, format='table')
- store.append('df', df.iloc[10:], append=True, format='table')
- assert_frame_equal(store.select('df'), df)
- # append to False
- _maybe_remove(store, 'df')
- store.append('df', df.iloc[:10], append=False, format='table')
- store.append('df', df.iloc[10:], append=True, format='table')
- assert_frame_equal(store.select('df'), df)
- # formats
- _maybe_remove(store, 'df')
- store.append('df', df.iloc[:10], append=False, format='table')
- store.append('df', df.iloc[10:], append=True, format='table')
- assert_frame_equal(store.select('df'), df)
- _maybe_remove(store, 'df')
- store.append('df', df.iloc[:10], append=False, format='table')
- store.append('df', df.iloc[10:], append=True, format=None)
- assert_frame_equal(store.select('df'), df)
- with ensure_clean_path(self.path) as path:
- # invalid
- df = tm.makeDataFrame()
- self.assertRaises(ValueError, df.to_hdf, path,
- 'df', append=True, format='f')
- self.assertRaises(ValueError, df.to_hdf, path,
- 'df', append=True, format='fixed')
- self.assertRaises(TypeError, df.to_hdf, path,
- 'df', append=True, format='foo')
- self.assertRaises(TypeError, df.to_hdf, path,
- 'df', append=False, format='bar')
- # File path doesn't exist
- path = ""
- self.assertRaises(IOError, read_hdf, path, 'df')
- def test_api_default_format(self):
- # default_format option
- with ensure_clean_store(self.path) as store:
- df = tm.makeDataFrame()
- pandas.set_option('io.hdf.default_format', 'fixed')
- _maybe_remove(store, 'df')
- store.put('df', df)
- self.assertFalse(store.get_storer('df').is_table)
- self.assertRaises(ValueError, store.append, 'df2', df)
- pandas.set_option('io.hdf.default_format', 'table')
- _maybe_remove(store, 'df')
- store.put('df', df)
- self.assertTrue(store.get_storer('df').is_table)
- _maybe_remove(store, 'df2')
- store.append('df2', df)
- self.assertTrue(store.get_storer('df').is_table)
- pandas.set_option('io.hdf.default_format', None)
- with ensure_clean_path(self.path) as path:
- df = tm.makeDataFrame()
- pandas.set_option('io.hdf.default_format', 'fixed')
- df.to_hdf(path, 'df')
- with get_store(path) as store:
- self.assertFalse(store.get_storer('df').is_table)
- self.assertRaises(ValueError, df.to_hdf, path, 'df2', append=True)
- pandas.set_option('io.hdf.default_format', 'table')
- df.to_hdf(path, 'df3')
- with HDFStore(path) as store:
- self.assertTrue(store.get_storer('df3').is_table)
- df.to_hdf(path, 'df4', append=True)
- with HDFStore(path) as store:
- self.assertTrue(store.get_storer('df4').is_table)
- pandas.set_option('io.hdf.default_format', None)
- def test_keys(self):
- with ensure_clean_store(self.path) as store:
- store['a'] = tm.makeTimeSeries()
- store['b'] = tm.makeStringSeries()
- store['c'] = tm.makeDataFrame()
- store['d'] = tm.makePanel()
- store['foo/bar'] = tm.makePanel()
- self.assertEqual(len(store), 5)
- expected = set(['/a', '/b', '/c', '/d', '/foo/bar'])
- self.assertTrue(set(store.keys()) == expected)
- self.assertTrue(set(store) == expected)
- def test_iter_empty(self):
- with ensure_clean_store(self.path) as store:
- # GH 12221
- self.assertTrue(list(store) == [])
- def test_repr(self):
- with ensure_clean_store(self.path) as store:
- repr(store)
- store['a'] = tm.makeTimeSeries()
- store['b'] = tm.makeStringSeries()
- store['c'] = tm.makeDataFrame()
- store['d'] = tm.makePanel()
- store['foo/bar'] = tm.makePanel()
- store.append('e', tm.makePanel())
- df = tm.makeDataFrame()
- df['obj1'] = 'foo'
- df['obj2'] = 'bar'
- df['bool1'] = df['A'] > 0
- df['bool2'] = df['B'] > 0
- df['bool3'] = True
- df['int1'] = 1
- df['int2'] = 2
- df['timestamp1'] = Timestamp('20010102')
- df['timestamp2'] = Timestamp('20010103')
- df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
- df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
- df.ix[3:6, ['obj1']] = np.nan
- df = df.consolidate()._convert(datetime=True)
- warnings.filterwarnings('ignore', category=PerformanceWarning)
- store['df'] = df
- warnings.filterwarnings('always', category=PerformanceWarning)
- # make a random group in hdf space
- store._handle.create_group(store._handle.root, 'bah')
- repr(store)
- str(store)
- # storers
- with ensure_clean_store(self.path) as store:
- df = tm.makeDataFrame()
- store.append('df', df)
- s = store.get_storer('df')
- repr(s)
- str(s)
- def test_contains(self):
- with ensure_clean_store(self.path) as store:
- store['a'] = tm.makeTimeSeries()
- store['b'] = tm.makeDataFrame()
- store['foo/bar'] = tm.makeDataFrame()
- self.assertIn('a', store)
- self.assertIn('b', store)
- self.assertNotIn('c', store)
- self.assertIn('foo/bar', store)
- self.assertIn('/foo/bar', store)
- self.assertNotIn('/foo/b', store)
- self.assertNotIn('bar', store)
- # GH 2694
- warnings.filterwarnings(
- 'ignore', category=tables.NaturalNameWarning)
- store['node())'] = tm.makeDataFrame()
- self.assertIn('node())', store)
- def test_versioning(self):
- with ensure_clean_store(self.path) as store:
- store['a'] = tm.makeTimeSeries()
- store['b'] = tm.makeDataFrame()
- df = tm.makeTimeDataFrame()
- _maybe_remove(store, 'df1')
- store.append('df1', df[:10])
- store.append('df1', df[10:])
- self.assertEqual(store.root.a._v_attrs.pandas_version, '0.15.2')
- self.assertEqual(store.root.b._v_attrs.pandas_version, '0.15.2')
- self.assertEqual(store.root.df1._v_attrs.pandas_version, '0.15.2')
- # write a file and wipe its versioning
- _maybe_remove(store, 'df2')
- store.append('df2', df)
- # this is an error because its table_type is appendable, but no
- # version info
- store.get_node('df2')._v_attrs.pandas_version = None
- self.assertRaises(Exception, store.select, 'df2')
- def test_mode(self):
- df = tm.makeTimeDataFrame()
- def check(mode):
- with ensure_clean_path(self.path) as path:
- # constructor
- if mode in ['r', 'r+']:
- self.assertRaises(IOError, HDFStore, path, mode=mode)
- else:
- store = HDFStore(path, mode=mode)
- self.assertEqual(store._handle.mode, mode)
- store.close()
- with ensure_clean_path(self.path) as path:
- # context
- if mode in ['r', 'r+']:
- def f():
- with HDFStore(path, mode=mode) as store: # noqa
- pass
- self.assertRaises(IOError, f)
- else:
- with HDFStore(path, mode=mode) as store:
- self.assertEqual(store._handle.mode, mode)
- with ensure_clean_path(self.path) as path:
- # conv write
- if mode in ['r', 'r+']:
- self.assertRaises(IOError, df.to_hdf,
- path, 'df', mode=mode)
- df.to_hdf(path, 'df', mode='w')
- else:
- df.to_hdf(path, 'df', mode=mode)
- # conv read
- if mode in ['w']:
- self.assertRaises(ValueError, read_hdf,
- path, 'df', mode=mode)
- else:
- result = read_hdf(path, 'df', mode=mode)
- assert_frame_equal(result, df)
- def check_default_mode():
- # read_hdf uses default mode
- with ensure_clean_path(self.path) as path:
- df.to_hdf(path, 'df', mode='w')
- result = read_hdf(path, 'df')
- assert_frame_equal(result, df)
- check('r')
- check('r+')
- check('a')
- check('w')
- check_default_mode()
- def test_reopen_handle(self):
- with ensure_clean_path(self.path) as path:
- store = HDFStore(path, mode='a')
- store['a'] = tm.makeTimeSeries()
- # invalid mode change
- self.assertRaises(PossibleDataLossError, store.open, 'w')
- store.close()
- self.assertFalse(store.is_open)
- # truncation ok here
- store.open('w')
- self.assertTrue(store.is_open)
- self.assertEqual(len(store), 0)
- store.close()
- self.assertFalse(store.is_open)
- store = HDFStore(path, mode='a')
- store['a'] = tm.makeTimeSeries()
- # reopen as read
- store.open('r')
- self.assertTrue(store.is_open)
- self.assertEqual(len(store), 1)
- self.assertEqual(store._mode, 'r')
- store.close()
- self.assertFalse(store.is_open)
- # reopen as append
- store.open('a')
- self.assertTrue(store.is_open)
- self.assertEqual(len(store), 1)
- self.assertEqual(store._mode, 'a')
- store.close()
- self.assertFalse(store.is_open)
- # reopen as append (again)
- store.open('a')
- self.assertTrue(store.is_open)
- self.assertEqual(len(store), 1)
- self.assertEqual(store._mode, 'a')
- store.close()
- self.assertFalse(store.is_open)
- def test_open_args(self):
- with ensure_clean_path(self.path) as path:
- df = tm.makeDataFrame()
- # create an in memory store
- store = HDFStore(path, mode='a', driver='H5FD_CORE',
- driver_core_backing_store=0)
- store['df'] = df
- store.append('df2', df)
- tm.assert_frame_equal(store['df'], df)
- tm.assert_frame_equal(store['df2'], df)
- store.close()
- # the file should not have actually been written
- self.assertFalse(os.path.exists(path))
- def test_flush(self):
- with ensure_clean_store(self.path) as store:
- store['a'] = tm.makeTimeSeries()
- store.flush()
- store.flush(fsync=True)
- def test_get(self):
- with ensure_clean_store(self.path) as store:
- store['a'] = tm.makeTimeSeries()
- left = store.get('a')
- right = store['a']
- tm.assert_series_equal(left, right)
- left = store.get('/a')
- right = store['/a']
- tm.assert_series_equal(left, right)
- self.assertRaises(KeyError, store.get, 'b')
- def test_getattr(self):
- with ensure_clean_store(self.path) as store:
- s = tm.makeTimeSeries()
- store['a'] = s
- # test attribute access
- result = store.a
- tm.assert_series_equal(result, s)
- result = getattr(store, 'a')
- tm.assert_series_equal(result, s)
- df = tm.makeTimeDataFrame()
- store['df'] = df
- result = store.df
- tm.assert_frame_equal(result, df)
- # errors
- self.assertRaises(AttributeError, getattr, store, 'd')
- for x in ['mode', 'path', 'handle', 'complib']:
- self.assertRaises(AttributeError, getattr, store, x)
- # not stores
- for x in ['mode', 'path', 'handle', 'complib']:
- getattr(store, "_%s" % x)
- def test_put(self):
- with ensure_clean_store(self.path) as store:
- ts = tm.makeTimeSeries()
- df = tm.makeTimeDataFrame()
- store['a'] = ts
- store['b'] = df[:10]
- store['foo/bar/bah'] = df[:10]
- store['foo'] = df[:10]
- store['/foo'] = df[:10]
- store.put('c', df[:10], format='table')
- # not OK, not a table
- self.assertRaises(
- ValueError, store.put, 'b', df[10:], append=True)
- # node does not currently exist, test _is_table_type returns False
- # in this case
- # _maybe_remove(store, 'f')
- # self.assertRaises(ValueError, store.put, 'f', df[10:],
- # append=True)
- # can't put to a table (use append instead)
- self.assertRaises(ValueError, store.put, 'c', df[10:], append=True)
- # overwrite table
- store.put('c', df[:10], format='table', append=False)
- tm.assert_frame_equal(df[:10], store['c'])
- def test_put_string_index(self):
- with ensure_clean_store(self.path) as store:
- index = Index(
- ["I am a very long string index: %s" % i for i in range(20)])
- s = Series(np.arange(20), index=index)
- df = DataFrame({'A': s, 'B': s})
- store['a'] = s
- tm.assert_series_equal(store['a'], s)
- store['b'] = df
- tm.assert_frame_equal(store['b'], df)
- # mixed length
- index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] +
- ["I am a very long string index: %s" % i
- for i in range(20)])
- s = Series(np.arange(21), index=index)
- df = DataFrame({'A': s, 'B': s})
- store['a'] = s
- tm.assert_series_equal(store['a'], s)
- store['b'] = df
- tm.assert_frame_equal(store['b'], df)
- def test_put_compression(self):
- with ensure_clean_store(self.path) as store:
- df = tm.makeTimeDataFrame()
- store.put('c', df, format='table', complib='zlib')
- tm.assert_frame_equal(store['c'], df)
- # can't compress if format='fixed'
- self.assertRaises(ValueError, store.put, 'b', df,
- format='fixed', complib='zlib')
- def test_put_compression_blosc(self):
- tm.skip_if_no_package('tables', '2.2', app='blosc support')
- if skip_compression:
- raise nose.SkipTest("skipping on windows/PY3")
- df = tm.makeTimeDataFrame()
- with ensure_clean_store(self.path) as store:
- # can't compress if format='fixed'
- self.assertRaises(ValueError, store.put, 'b', df,
- format='fixed', complib='blosc')
- store.put('c', df, format='table', complib='blosc')
- tm.assert_frame_equal(store['c'], df)
- def test_put_integer(self):
- # non-date, non-string index
- df = DataFrame(np.random.randn(50, 100))
- self._check_roundtrip(df, tm.assert_frame_equal)
- def test_put_mixed_type(self):
- df = tm.makeTimeDataFrame()
- df['obj1'] = 'foo'
- df['obj2'] = 'bar'
- df['bool1'] = df['A'] > 0
- df['bool2'] = df['B'] > 0
- df['bool3'] = True
- df['int1'] = 1
- df['int2'] = 2
- df['timestamp1'] = Timestamp('20010102')
- df['timestamp2'] = Timestamp('20010103')
- df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
- df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
- df.ix[3:6, ['obj1']] = np.nan
- df = df.consolidate()._convert(datetime=True)
- with ensure_clean_store(self.path) as store:
- _maybe_remove(store, 'df')
- # cannot use assert_produces_warning here for some reason
- # a PendingDeprecationWarning is also raised?
- warnings.filterwarnings('ignore', category=PerformanceWarning)
- store.put('df', df)
- warnings.filterwarnings('always', category=PerformanceWarning)
- expected = store.get('df')
- tm.assert_frame_equal(expected, df)
- def test_append(self):
- with ensure_clean_store(self.path) as store:
- df = tm.makeTimeDataFrame()
- _maybe_remove(store, 'df1')
- store.append('df1', df[:10])
- store.append('df1', df[10:])
- tm.assert_frame_equal(store['df1'], df)
- _maybe_remove(store, 'df2')
- store.put('df2', df[:10], format='table')
- store.append('df2', df[10:])
- tm.assert_frame_equal(store['df2'], df)
- _maybe_remove(store, 'df3')
- store.append('/df3', df[:10])
- store.append('/df3', df[10:])
- tm.assert_frame_equal(store['df3'], df)
- # this is allowed by almost always don't want to do it
- with tm.assert_produces_warning(
- expected_warning=tables.NaturalNameWarning):
- _maybe_remove(store, '/df3 foo')
- store.append('/df3 foo', df[:10])
- store.append('/df3 foo', df[10:])
- tm.assert_frame_equal(store['df3 foo'], df)
- # panel
- wp = tm.makePanel()
- _maybe_remove(store, 'wp1')
- store.append('wp1', wp.ix[:, :10, :])
- store.append('wp1', wp.ix[:, 10:, :])
- assert_panel_equal(store['wp1'], wp)
- # ndim
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- p4d = tm.makePanel4D()
- _maybe_remove(store, 'p4d')
- store.append('p4d', p4d.ix[:, :, :10, :])
- store.append('p4d', p4d.ix[:, :, 10:, :])
- assert_panel4d_equal(store['p4d'], p4d)
- # test using axis labels
- _maybe_remove(store, 'p4d')
- store.append('p4d', p4d.ix[:, :, :10, :], axes=[
- 'items', 'major_axis', 'minor_axis'])
- store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
- 'items', 'major_axis', 'minor_axis'])
- assert_panel4d_equal(store['p4d'], p4d)
- # test using differnt number of items on each axis
- p4d2 = p4d.copy()
- p4d2['l4'] = p4d['l1']
- p4d2['l5'] = p4d['l1']
- _maybe_remove(store, 'p4d2')
- store.append(
- 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis'])
- assert_panel4d_equal(store['p4d2'], p4d2)
- # test using differt order of items on the non-index axes
- _maybe_remove(store, 'wp1')
- wp_append1 = wp.ix[:, :10, :]
- store.append('wp1', wp_append1)
- wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1])
- store.append('wp1', wp_append2)
- assert_panel_equal(store['wp1'], wp)
- # dtype issues - mizxed type in a single object column
- df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
- df['mixed_column'] = 'testing'
- df.ix[2, 'mixed_column'] = np.nan
- _maybe_remove(store, 'df')
- store.append('df', df)
- tm.assert_frame_equal(store['df'], df)
- # uints - test storage of uints
- uint_data = DataFrame({
- 'u08': Series(np.random.randint(0, high=255, size=5),
- dtype=np.uint8),
- 'u16': Series(np.random.randint(0, high=65535, size=5),
- dtype=np.uint16),
- 'u32': Series(np.random.randint(0, high=2**30, size=5),
- dtype=np.uint32),
- 'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62],
- dtype=np.uint64)}, index=np.arange(5))
- _maybe_remove(store, 'uints')
- store.append('uints', uint_data)
- tm.assert_frame_equal(store['uints'], uint_data)
- # uints - test storage of uints in indexable columns
- _maybe_remove(store, 'uints')
- # 64-bit indices not yet supported
- store.append('uints', uint_data, data_columns=[
- 'u08', 'u16', 'u32'])
- tm.assert_frame_equal(store['uints'], uint_data)
- def test_append_series(self):
- with ensure_clean_store(self.path) as store:
- # basic
- ss = tm.makeStringSeries()
- ts = tm.makeTimeSeries()
- ns = Series(np.arange(100))
- store.append('ss', ss)
- result = store['ss']
- tm.assert_series_equal(result, ss)
- self.assertIsNone(result.name)
- store.append('ts', ts)
- result = store['ts']
- tm.assert_series_equal(result, ts)
- self.assertIsNone(result.name)
- ns.name = 'foo'
- store.append('ns', ns)
- result = store['ns']
- tm.assert_series_equal(result, ns)
- self.assertEqual(result.name, ns.name)
- # select on the values
- expected = ns[ns > 60]
- result = store.select('ns', Term('foo>60'))
- tm.assert_series_equal(result, expected)
- # select on the index and values
- expected = ns[(ns > 70) & (ns.index < 90)]
- result = store.select('ns', [Term('foo>70'), Term('index<90')])
- tm.assert_series_equal(result, expected)
- # multi-index
- mi = DataFrame(np.random.randn(5, 1), columns=['A'])
- mi['B'] = np.arange(len(mi))
- mi['C'] = 'foo'
- mi.loc[3:5, 'C'] = 'bar'
- mi.set_index(['C', 'B'], inplace=True)
- s = mi.stack()
- s.index = s.index.droplevel(2)
- store.append('mi', s)
- tm.assert_series_equal(store['mi'], s)
- def test_store_index_types(self):
- # GH5386
- # test storing various index types
- with ensure_clean_store(self.path) as store:
- def check(format, index):
- df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
- df.index = index(len(df))
- _maybe_remove(store, 'df')
- store.put('df', df, format=format)
- assert_frame_equal(df, store['df'])
- for index in [tm.makeFloatIndex, tm.makeStringIndex,
- tm.makeIntIndex, tm.makeDateIndex]:
- check('table', index)
- check('fixed', index)
- # period index currently broken for table
- # seee GH7796 FIXME
- check('fixed', tm.makePeriodIndex)
- # check('table',tm.makePeriodIndex)
- # unicode
- index = tm.makeUnicodeIndex
- if compat.PY3:
- check('table', index)
- check('fixed', index)
- else:
- # only support for fixed types (and they have a perf warning)
- self.assertRaises(TypeError, check, 'table', index)
- with tm.assert_produces_warning(
- expected_warning=PerformanceWarning):
- check('fixed', index)
- def test_encoding(self):
- if sys.byteorder != 'little':
- raise nose.SkipTest('system byteorder is not little')
- with ensure_clean_store(self.path) as store:
- df = DataFrame(dict(A='foo', B='bar'), index=range(5))
- df.loc[2, 'A'] = np.nan
- df.loc[3, 'B'] = np.nan
- _maybe_remove(store, 'df')
- store.append('df', df, encoding='ascii')
- tm.assert_frame_equal(store['df'], df)
- expected = df.reindex(columns=['A'])
- result = store.select('df', Term('columns=A', encoding='ascii'))
- tm.assert_frame_equal(result, expected)
- def test_latin_encoding(self):
- if compat.PY2:
- self.assertRaisesRegexp(
- TypeError, '\[unicode\] is not implemented as a table column')
- return
- values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
- [b'E\xc9, 17', b'a', b'b', b'c'],
- [b'EE, 17', b'', b'a', b'b', b'c'],
- [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
- [b'', b'a', b'b', b'c'],
- [b'\xf8\xfc', b'a', b'b', b'c'],
- [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
- [np.nan, b'', b'b', b'c'],
- [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
- def _try_decode(x, encoding='latin-1'):
- try:
- return x.decode(encoding)
- except AttributeError:
- return x
- # not sure how to remove latin-1 from code in python 2 and 3
- values = [[_try_decode(x) for x in y] for y in values]
- examples = []
- for dtype in ['category', object]:
- for val in values:
- examples.append(pandas.Series(val, dtype=dtype))
- def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
- with ensure_clean_path(self.path) as store:
- s.to_hdf(store, key, format='table', encoding=encoding,
- nan_rep=nan_rep)
- retr = read_hdf(store, key)
- s_nan = s.replace(nan_rep, np.nan)
- assert_series_equal(s_nan, retr, check_categorical=False)
- for s in examples:
- roundtrip(s)
- # fails:
- # for x in examples:
- # roundtrip(s, nan_rep=b'\xf8\xfc')
- def test_append_some_nans(self):
- with ensure_clean_store(self.path) as store:
- df = DataFrame({'A': Series(np.random.randn(20)).astype('int32'),
- 'A1': np.random.randn(20),
- 'A2': np.random.randn(20),
- 'B': 'foo', 'C': 'bar',
- 'D': Timestamp("20010101"),
- 'E': datetime.datetime(2001, 1, 2, 0, 0)},
- index=np.arange(20))
- # some nans
- _maybe_remove(store, 'df1')
- df.ix[0:15, ['A1', 'B', 'D', 'E']] = np.nan
- store.append('df1', df[:10])
- store.append('df1', df[10:])
- tm.assert_frame_equal(store['df1'], df)
- # first column
- df1 = df.copy()
- df1.ix[:, 'A1'] = np.nan
- _maybe_remove(store, 'df1')
- store.append('df1', df1[:10])
- store.append('df1', df1[10:])
- tm.assert_frame_equal(store['df1'], df1)
- # 2nd column
- df2 = df.copy()
- df2.ix[:, 'A2'] = np.nan
- _maybe_remove(store, 'df2')
- store.append('df2', df2[:10])
- store.append('df2', df2[10:])
- tm.assert_frame_equal(store['df2'], df2)
- # datetimes
- df3 = df.copy()
- df3.ix[:, 'E'] = np.nan
- _maybe_remove(store, 'df3')
- store.append('df3', df3[:10])
- store.append('df3', df3[10:])
- tm.assert_frame_equal(store['df3'], df3)
- def test_append_all_nans(self):
- with ensure_clean_store(self.path) as store:
- df = DataFrame({'A1': np.random.randn(20),
- 'A2': np.random.randn(20)},
- index=np.arange(20))
- df.ix[0:15, :] = np.nan
- # nan some entire rows (dropna=True)
- _maybe_remove(store, 'df')
- store.append('df', df[:10], dropna=True)
- store.append('df', df[10:], dropna=True)
- tm.assert_frame_equal(store['df'], df[-4:])
- # nan some entire rows (dropna=False)
- _maybe_remove(store, 'df2')
- store.append('df2', df[:10], dropna=False)
- store.append('df2', df[10:], dropna=False)
- tm.assert_frame_equal(store['df2'], df)
- # tests the option io.hdf.dropna_table
- pandas.set_option('io.hdf.dropna_table', False)
- _maybe_remove(store, 'df3')
- store.append('df3', df[:10])
- store.append('df3', df[10:])
- tm.assert_frame_equal(store['df3'], df)
- pandas.set_option('io.hdf.dropna_table', True)
- _maybe_remove(store, 'df4')
- store.append('df4', df[:10])
- store.append('df4', df[10:])
- tm.assert_frame_equal(store['df4'], df[-4:])
- # nan some entire rows (string are still written!)
- df = DataFrame({'A1': np.random.randn(20),
- 'A2': np.random.randn(20),
- 'B': 'foo', 'C': 'bar'},
- index=np.arange(20))
- df.ix[0:15, :] = np.nan
- _maybe_remove(store, 'df')
- store.append('df', df[:10], dropna=True)
- store.append('df', df[10:], dropna=True)
- tm.assert_frame_equal(store['df'], df)
- _maybe_remove(store, 'df2')
- store.append('df2', df[:10], dropna=False)
- store.append('df2', df[10:], dropna=False)
- tm.assert_frame_equal(store['df2'], df)
- # nan some entire rows (but since we have dates they are still
- # written!)
- df = DataFrame({'A1': np.random.randn(20),
- 'A2': np.random.randn(20),
- 'B': 'foo', 'C': 'bar',
- 'D': Timestamp("20010101"),
- 'E': datetime.datetime(2001, 1, 2, 0, 0)},
- index=np.arange(20))
- df.ix[0:15, :] = np.nan
- _maybe_remove(store, 'df')
- store.append('df', df[:10], dropna=True)
- store.append('df', df[10:], dropna=True)
- tm.assert_frame_equal(store['df'], df)
- _maybe_remove(store, 'df2')
- store.append('df2', df[:10], dropna=False)
- store.append('df2', df[10:], dropna=False)
- tm.assert_frame_equal(store['df2'], df)
- # Test to make sure defaults are to not drop.
- # Corresponding to Issue 9382
- df_with_missing = DataFrame(
- {'col1': [0, np.nan, 2], 'col2': [1, np.nan, np.nan]})
- with ensure_clean_path(self.path) as path:
- df_with_missing.to_hdf(path, 'df_with_missing', format='table')
- reloaded = read_hdf(path, 'df_with_missing')
- tm.assert_frame_equal(df_with_missing, reloaded)
- matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]],
- [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
- [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]
- panel_with_missing = Panel(matrix, items=['Item1', 'Item2', 'Item3'],
- major_axis=[1, 2],
- minor_axis=['A', 'B', 'C'])
- with ensure_clean_path(self.path) as path:
- panel_with_missing.to_hdf(
- path, 'panel_with_missing', format='table')
- reloaded_panel = read_hdf(path, 'panel_with_missing')
- tm.assert_panel_equal(panel_with_missing, reloaded_panel)
def test_append_frame_column_oriented(self):
    """Append a frame along the column axis, then query both the
    indexable ('columns') and the non-indexable ('index') axes."""
    with ensure_clean_store(self.path) as store:
        # NOTE: the Term strings below reference the *local* name `df`
        # (pytables resolves it from the caller's scope), so this local
        # must keep that exact name.
        df = tm.makeTimeDataFrame()
        key = 'df1'
        _maybe_remove(store, key)

        # write the frame in two column-wise chunks
        left, right = df.ix[:, :2], df.ix[:, 2:]
        store.append(key, left, axes=['columns'])
        store.append(key, right)
        tm.assert_frame_equal(store[key], df)

        # selection on the indexable axis
        result = store.select(key, 'columns=A')
        tm.assert_frame_equal(df.reindex(columns=['A']), result)

        # selection on the non-indexable axis as well
        result = store.select(
            key, ('columns=A', Term('index=df.index[0:4]')))
        tm.assert_frame_equal(
            df.reindex(columns=['A'], index=df.index[0:4]), result)

        # comparison queries on the non-indexable axis aren't supported
        self.assertRaises(TypeError, store.select, key,
                          ('columns=A', Term('index>df.index[4]')))
def test_append_with_different_block_ordering(self):
    """GH 4096: appends with an identical schema must succeed even when
    the frames carry their internal blocks in different orders; extra
    columns (a genuinely different schema) must still be rejected."""
    with ensure_clean_store(self.path) as store:
        for chunk in range(10):
            frame = DataFrame(np.random.randn(10, 2), columns=list('AB'))
            frame['index'] = range(10)
            frame['index'] += chunk * 10
            frame['int64'] = Series([1] * len(frame), dtype='int64')
            frame['int16'] = Series([1] * len(frame), dtype='int16')

            # shuffle the block layout on some iterations by deleting
            # and re-adding columns
            if chunk % 2 == 0:
                del frame['int64']
                frame['int64'] = Series([1] * len(frame), dtype='int64')
            if chunk % 3 == 0:
                frame['A'] = frame.pop('A')

            frame.set_index('index', inplace=True)
            store.append('df', frame)

    # a different ordering combined with *additional* fields is an
    # invalid combination and must raise
    with ensure_clean_store(self.path) as store:
        frame = DataFrame(np.random.randn(10, 2),
                          columns=list('AB'), dtype='float64')
        frame['int64'] = Series([1] * len(frame), dtype='int64')
        frame['int16'] = Series([1] * len(frame), dtype='int16')
        store.append('df', frame)

        # one additional field in a different block -> reject
        frame['int16_2'] = Series([1] * len(frame), dtype='int16')
        self.assertRaises(ValueError, store.append, 'df', frame)

        # multiple additional fields in different blocks -> still reject
        frame['float_3'] = Series([1.] * len(frame), dtype='float64')
        self.assertRaises(ValueError, store.append, 'df', frame)
def test_ndim_indexables(self):
    """Exercise 4-dim (Panel4D) tables with custom ``axes=`` indexer
    choices: appends honoring an existing schema, mismatched/invalid
    axes, and partial selections against the chosen indexables."""
    # Panel4D is deprecated, so the whole test runs under the warning
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        with ensure_clean_store(self.path) as store:
            p4d = tm.makePanel4D()

            def check_indexers(key, indexers):
                # each requested indexer must occupy its position (_v_pos)
                # in the underlying PyTables description, in order
                for i, idx in enumerate(indexers):
                    descr = getattr(store.root, key).table.description
                    self.assertTrue(getattr(descr, idx)._v_pos == i)

            # append then change (will take existing schema)
            indexers = ['items', 'major_axis', 'minor_axis']
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :])
            assert_panel4d_equal(store.select('p4d'), p4d)
            check_indexers('p4d', indexers)

            # same as above, but try to append with different axes;
            # the second append's axes are ignored in favor of the
            # schema already on disk
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
                'labels', 'items', 'major_axis'])
            assert_panel4d_equal(store.select('p4d'), p4d)
            check_indexers('p4d', indexers)

            # pass incorrect number of axes (2 for a 4-dim object)
            _maybe_remove(store, 'p4d')
            self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[
                :, :, :10, :], axes=['major_axis', 'minor_axis'])

            # different than default indexables #1
            indexers = ['labels', 'major_axis', 'minor_axis']
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :])
            assert_panel4d_equal(store['p4d'], p4d)
            check_indexers('p4d', indexers)

            # different than default indexables #2
            indexers = ['major_axis', 'labels', 'minor_axis']
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :])
            assert_panel4d_equal(store['p4d'], p4d)
            check_indexers('p4d', indexers)

            # partial selection on a single indexable
            result = store.select('p4d', ['labels=l1'])
            expected = p4d.reindex(labels=['l1'])
            assert_panel4d_equal(result, expected)

            # partial selection across several indexables
            result = store.select('p4d', [Term(
                'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')])
            expected = p4d.reindex(
                labels=['l1'], items=['ItemA'], minor_axis=['B'])
            assert_panel4d_equal(result, expected)

            # non-existent partial selection ('Item1' matches nothing)
            result = store.select('p4d', [Term(
                'labels=l1'), Term('items=Item1'), Term('minor_axis=B')])
            expected = p4d.reindex(labels=['l1'], items=[],
                                   minor_axis=['B'])
            assert_panel4d_equal(result, expected)
- def test_append_with_strings(self):
- with ensure_clean_store(self.path) as store:
- wp = tm.makePanel()
- wp2 = wp.rename_axis(
- dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2)
- def check_col(key, name, size):
- self.assertEqual(getattr(store.get_storer(
- key).table.description, name).itemsize, size)
- store.append('s1', wp, min_itemsize=20)
- store.append('s1', wp2)
- expected = concat([wp, wp2], axis=2)
- expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
- assert_panel_equal(store['s1'], expected)
- check_col('s1', 'minor_axis', 20)
- # test dict format
- store.append('s2', wp, min_itemsize={'minor_axis': 20})
- store.append('s2', wp2)
- expected = concat([wp, wp2], axis=2)
- expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
- assert_panel_equal(store['s2'], expected)
- check_col('s2', 'minor_axis', 20)
- # apply the wrong field (similar to #1)
- store.append('s3', wp, min_itemsize={'major_axis': 20})
- self.assertRaises(ValueError, store.append, 's3', wp2)
- # test truncation of bigger strings
- store.append('s4', wp)
- self.assertRaises(ValueError, store.append, 's4', wp2)
- # avoid truncation on elements
- df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
- store.append('df_big', df)
- tm.assert_frame_equal(store.select('df_big'), df)
- check_col('df_big', 'values_block_1', 15)
- # appending smaller string ok
- df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']])
- store.append('df_big', df2)
- expected = concat([df, df2])
- tm.assert_frame_equal(store.select('df_big'), expected)
- check_col('df_big', 'values_block_1', 15)
- # avoid truncation on elements
- df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
- store.append('df_big2', df, min_itemsize={'values': 50})
- tm.assert_frame_equal(store.select('df_big2'), df)
- check_col('df_big2', 'values_block_1', 50)
- # bigger string on next append
- store.append('df_new', df)
- df_new = DataFrame(
- [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']])
- self.assertRaises(ValueError, store.append, 'df_new', df_new)
- # with nans
- _maybe_remove(store, 'df')
- df = tm.makeTimeDataFrame()
- df['string'] = 'foo'
- df.ix[1:4, 'string'] = np.nan
- df['string2'] = 'bar'
- df.ix[4:8, 'string2'] = np.nan
- df['string3'] = 'bah'
- df.ix[1:, 'string3'] = np.nan
- store.append('df', df)
- result = store.select('df')
- tm.assert_frame_equal(result, df)
- with ensure_clean_store(self.path) as store:
- def check_col(key, name, size):
- self.assertEqual(getattr(store.get_storer(
- key).table.description, name).itemsize, size)
- d…
Large files files are truncated, but you can click here to view the full file