/pandas/io/tests/test_pytables.py
Python | 4364 lines | 4124 code | 177 blank | 63 comment | 86 complexity | 59b462ea4b46ff347b15ad7d8b8f3338 MD5 | raw file
Possible license(s): BSD-3-Clause, Apache-2.0
Large files are truncated in this view; click here to view the full file
- import nose
- import sys
- import os
- import warnings
- import tempfile
- from contextlib import contextmanager
- import datetime
- import numpy as np
- import pandas
- from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
- date_range, Index, DatetimeIndex, isnull)
- from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf,
- IncompatibilityWarning, PerformanceWarning,
- AttributeConflictWarning, DuplicateWarning,
- PossibleDataLossError, ClosedFileError)
- from pandas.io import pytables as pytables
- import pandas.util.testing as tm
- from pandas.util.testing import (assert_panel4d_equal,
- assert_panel_equal,
- assert_frame_equal,
- assert_series_equal)
- from pandas import concat, Timestamp
- from pandas import compat, _np_version_under1p7
- from pandas.compat import range, lrange, u
- from pandas.util.testing import assert_produces_warning
# PyTables is a hard requirement for this entire test module: skip the
# whole module (nose-style) when it is not installed.
try:
    import tables
except ImportError:
    raise nose.SkipTest('no pytables')
from distutils.version import LooseVersion

# Default compression library for the tests: blosc is only available with
# PyTables >= 2.2, otherwise fall back to zlib.
# (Rewritten from the fragile ``cond and a or b`` idiom to an explicit
# conditional expression; behavior is unchanged since 'blosc' is truthy.)
_default_compressor = ('blosc'
                       if LooseVersion(tables.__version__) >= '2.2'
                       else 'zlib')

# nose: these tests share HDF5 files, so never split them across processes.
_multiprocess_can_split_ = False
def safe_remove(path):
    """Best-effort removal of the file at *path*.

    A ``None`` path and a missing/undeletable file are both silently
    ignored: this is cleanup code, failure here must not mask a test
    failure.
    """
    if path is not None:
        try:
            os.remove(path)
        except OSError:
            # narrowed from a bare ``except:`` — os.remove only raises
            # OSError, and a bare except would also swallow
            # KeyboardInterrupt/SystemExit
            pass
def safe_close(store):
    """Close an HDFStore, suppressing any error raised by ``close``.

    ``None`` is accepted and ignored, so callers can pass a store variable
    that may never have been assigned.
    """
    try:
        if store is not None:
            store.close()
    except Exception:
        # narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; a store may already be closed or half-open
        pass
def create_tempfile(path):
    """Return *path* placed inside the system temp directory (not opened)."""
    tmpdir = tempfile.gettempdir()
    return os.path.join(tmpdir, path)
@contextmanager
def ensure_clean_store(path, mode='a', complevel=None, complib=None,
                       fletcher32=False):
    """Context manager yielding an open HDFStore at *path*.

    If *path* has no directory component it is placed in the system temp
    directory.  On exit the store is closed and, for write/append modes,
    the file is removed.

    Fixes two defects in the original:
    - ``fletcher32`` was accepted but hard-coded to ``False`` in the
      HDFStore call, silently ignoring the argument.
    - ``store`` could be unbound in the ``finally`` block if the HDFStore
      constructor raised, turning the real error into a NameError.
    """
    store = None
    try:
        # put in the temporary path if we don't have one already
        if not len(os.path.dirname(path)):
            path = create_tempfile(path)

        store = HDFStore(path, mode=mode, complevel=complevel,
                         complib=complib, fletcher32=fletcher32)
        yield store
    finally:
        safe_close(store)
        if mode == 'w' or mode == 'a':
            safe_remove(path)
@contextmanager
def ensure_clean_path(path):
    """
    return essentially a named temporary file that is not opened
    and deleted on exiting; if path is a list, then create and
    return list of filenames
    """
    # initialize before the try so the finally block never sees an
    # unbound name if create_tempfile raises
    filenames = []
    try:
        if isinstance(path, list):
            filenames = [create_tempfile(p) for p in path]
            yield filenames
        else:
            filenames = [create_tempfile(path)]
            yield filenames[0]
    finally:
        for f in filenames:
            safe_remove(f)
# set these parameters so we don't have file sharing
# (force PyTables to single-threaded operation for test isolation)
tables.parameters.MAX_NUMEXPR_THREADS = 1
tables.parameters.MAX_BLOSC_THREADS = 1
tables.parameters.MAX_THREADS = 1
- def _maybe_remove(store, key):
- """For tests using tables, try removing the table to be sure there is
- no content from previous tests using the same table name."""
- try:
- store.remove(key)
- except:
- pass
def compat_assert_produces_warning(w, f):
    """ don't produce a warning under PY3 """
    # On Python 3 just call f; on Python 2 require that the warning fires.
    if not compat.PY3:
        with tm.assert_produces_warning(expected_warning=w):
            f()
    else:
        f()
class TestHDFStore(tm.TestCase):
    # Integration tests for HDFStore / to_hdf / read_hdf round-trips.

    @classmethod
    def setUpClass(cls):
        super(TestHDFStore, cls).setUpClass()

        # Pytables 3.0.0 deprecates lots of things
        tm.reset_testing_mode()

    @classmethod
    def tearDownClass(cls):
        super(TestHDFStore, cls).tearDownClass()

        # Pytables 3.0.0 deprecates lots of things
        tm.set_testing_mode()
    def setUp(self):
        # silence FutureWarnings from the deprecated APIs exercised below
        warnings.filterwarnings(action='ignore', category=FutureWarning)
        # unique per-test HDF5 filename; the file itself is created lazily
        self.path = 'tmp.__%s__.h5' % tm.rands(10)

    def tearDown(self):
        # individual tests clean up their own files (via ensure_clean_*)
        pass
    def test_factory_fun(self):
        # get_store must release the file handle even when the managed
        # body raises
        try:
            with get_store(self.path) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass
        finally:
            safe_remove(self.path)

        try:
            with get_store(self.path) as tbl:
                tbl['a'] = tm.makeDataFrame()

            with get_store(self.path) as tbl:
                self.assertEqual(len(tbl), 1)
                self.assertEqual(type(tbl['a']), DataFrame)
        finally:
            safe_remove(self.path)
    def test_conv_read_write(self):
        # round-trip each major pandas container through to_hdf/read_hdf
        try:
            def roundtrip(key, obj, **kwargs):
                obj.to_hdf(self.path, key, **kwargs)
                return read_hdf(self.path, key)

            o = tm.makeTimeSeries()
            assert_series_equal(o, roundtrip('series', o))

            o = tm.makeStringSeries()
            assert_series_equal(o, roundtrip('string_series', o))

            o = tm.makeDataFrame()
            assert_frame_equal(o, roundtrip('frame', o))

            o = tm.makePanel()
            assert_panel_equal(o, roundtrip('panel', o))

            # table
            df = DataFrame(dict(A=lrange(5), B=lrange(5)))
            df.to_hdf(self.path, 'table', append=True)
            result = read_hdf(self.path, 'table', where=['index>2'])
            assert_frame_equal(df[df.index > 2], result)

        finally:
            safe_remove(self.path)
    def test_long_strings(self):
        # GH6166
        # unconversion of long strings was being chopped in earlier
        # versions of numpy < 1.7.2
        df = DataFrame({'a': [tm.rands(100) for _ in range(10)]},
                       index=[tm.rands(100) for _ in range(10)])

        with ensure_clean_store(self.path) as store:
            store.append('df', df, data_columns=['a'])

            result = store.select('df')
            assert_frame_equal(df, result)
    def test_api(self):
        # GH4584
        # API issue when to_hdf doesn't accept append AND format args
        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()
            df.iloc[:10].to_hdf(path, 'df', append=True, format='table')
            df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
            assert_frame_equal(read_hdf(path, 'df'), df)

            # append to False
            df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
            df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
            assert_frame_equal(read_hdf(path, 'df'), df)

        with ensure_clean_path(self.path) as path:

            # format is inferred from the existing table when omitted
            df = tm.makeDataFrame()
            df.iloc[:10].to_hdf(path, 'df', append=True)
            df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
            assert_frame_equal(read_hdf(path, 'df'), df)

            # append to False
            df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
            df.iloc[10:].to_hdf(path, 'df', append=True)
            assert_frame_equal(read_hdf(path, 'df'), df)

        with ensure_clean_path(self.path) as path:

            # 'fixed', its abbreviation 'f', and the default are equivalent
            df = tm.makeDataFrame()
            df.to_hdf(path, 'df', append=False, format='fixed')
            assert_frame_equal(read_hdf(path, 'df'), df)

            df.to_hdf(path, 'df', append=False, format='f')
            assert_frame_equal(read_hdf(path, 'df'), df)

            df.to_hdf(path, 'df', append=False)
            assert_frame_equal(read_hdf(path, 'df'), df)

            df.to_hdf(path, 'df')
            assert_frame_equal(read_hdf(path, 'df'), df)

        with ensure_clean_store(self.path) as store:

            # same combinations through the HDFStore.append API
            path = store._path
            df = tm.makeDataFrame()

            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=True, format='table')
            store.append('df', df.iloc[10:], append=True, format='table')
            assert_frame_equal(store.select('df'), df)

            # append to False
            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=False, format='table')
            store.append('df', df.iloc[10:], append=True, format='table')
            assert_frame_equal(store.select('df'), df)

            # formats
            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=False, format='table')
            store.append('df', df.iloc[10:], append=True, format='table')
            assert_frame_equal(store.select('df'), df)

            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=False, format='table')
            store.append('df', df.iloc[10:], append=True, format=None)
            assert_frame_equal(store.select('df'), df)

        with ensure_clean_path(self.path) as path:

            # invalid: append=True is incompatible with the fixed format,
            # and unknown format strings raise TypeError
            df = tm.makeDataFrame()
            self.assertRaises(ValueError, df.to_hdf, path, 'df',
                              append=True, format='f')
            self.assertRaises(ValueError, df.to_hdf, path, 'df',
                              append=True, format='fixed')

            self.assertRaises(TypeError, df.to_hdf, path, 'df',
                              append=True, format='foo')
            self.assertRaises(TypeError, df.to_hdf, path, 'df',
                              append=False, format='bar')
    def test_api_default_format(self):
        # default_format option controls put/to_hdf when no format is given
        with ensure_clean_store(self.path) as store:
            df = tm.makeDataFrame()

            pandas.set_option('io.hdf.default_format', 'fixed')
            _maybe_remove(store, 'df')
            store.put('df', df)
            self.assertFalse(store.get_storer('df').is_table)
            # appending requires a table; fixed default makes this invalid
            self.assertRaises(ValueError, store.append, 'df2', df)

            pandas.set_option('io.hdf.default_format', 'table')
            _maybe_remove(store, 'df')
            store.put('df', df)
            self.assertTrue(store.get_storer('df').is_table)
            _maybe_remove(store, 'df2')
            store.append('df2', df)
            self.assertTrue(store.get_storer('df').is_table)

            pandas.set_option('io.hdf.default_format', None)

        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()

            pandas.set_option('io.hdf.default_format', 'fixed')
            df.to_hdf(path, 'df')
            with get_store(path) as store:
                self.assertFalse(store.get_storer('df').is_table)
            self.assertRaises(ValueError, df.to_hdf, path, 'df2', append=True)

            pandas.set_option('io.hdf.default_format', 'table')
            df.to_hdf(path, 'df3')
            with get_store(path) as store:
                self.assertTrue(store.get_storer('df3').is_table)
            df.to_hdf(path, 'df4', append=True)
            with get_store(path) as store:
                self.assertTrue(store.get_storer('df4').is_table)

            pandas.set_option('io.hdf.default_format', None)
    def test_keys(self):
        # keys() returns the absolute ('/'-prefixed) pathnames of all nodes
        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeStringSeries()
            store['c'] = tm.makeDataFrame()
            store['d'] = tm.makePanel()
            store['foo/bar'] = tm.makePanel()
            self.assertEqual(len(store), 5)
            self.assertTrue(set(
                store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar']))
    def test_repr(self):
        # repr()/str() must not raise on stores holding a variety of node
        # types, including non-pandas groups
        with ensure_clean_store(self.path) as store:
            repr(store)
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeStringSeries()
            store['c'] = tm.makeDataFrame()
            store['d'] = tm.makePanel()
            store['foo/bar'] = tm.makePanel()
            store.append('e', tm.makePanel())

            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['bool3'] = True
            df['int1'] = 1
            df['int2'] = 2
            df['timestamp1'] = Timestamp('20010102')
            df['timestamp2'] = Timestamp('20010103')
            df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
            df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
            df.ix[3:6, ['obj1']] = np.nan
            df = df.consolidate().convert_objects()

            # storing the mixed frame emits a PerformanceWarning; suppress it
            warnings.filterwarnings('ignore', category=PerformanceWarning)
            store['df'] = df
            warnings.filterwarnings('always', category=PerformanceWarning)

            # make a random group in hdf space
            store._handle.createGroup(store._handle.root, 'bah')

            repr(store)
            str(store)

        # storers
        with ensure_clean_store(self.path) as store:

            df = tm.makeDataFrame()
            store.append('df', df)

            s = store.get_storer('df')
            repr(s)
            str(s)
    def test_contains(self):
        # membership supports both 'key' and '/key' spellings, but not
        # partial path prefixes
        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeDataFrame()
            store['foo/bar'] = tm.makeDataFrame()
            self.assertIn('a', store)
            self.assertIn('b', store)
            self.assertNotIn('c', store)
            self.assertIn('foo/bar', store)
            self.assertIn('/foo/bar', store)
            self.assertNotIn('/foo/b', store)
            self.assertNotIn('bar', store)

            # GH 2694
            warnings.filterwarnings(
                'ignore', category=tables.NaturalNameWarning)
            store['node())'] = tm.makeDataFrame()
            self.assertIn('node())', store)
    def test_versioning(self):
        # each written node records the pandas storage-format version
        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeDataFrame()
            df = tm.makeTimeDataFrame()
            _maybe_remove(store, 'df1')
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            self.assertEqual(store.root.a._v_attrs.pandas_version, '0.10.1')
            self.assertEqual(store.root.b._v_attrs.pandas_version, '0.10.1')
            self.assertEqual(store.root.df1._v_attrs.pandas_version, '0.10.1')

            # write a file and wipe its versioning
            _maybe_remove(store, 'df2')
            store.append('df2', df)

            # this is an error because its table_type is appendable, but no
            # version info
            store.get_node('df2')._v_attrs.pandas_version = None
            self.assertRaises(Exception, store.select, 'df2')
    def test_mode(self):
        # open-mode semantics across the constructor, the get_store context
        # manager, and the to_hdf/read_hdf conveniences
        df = tm.makeTimeDataFrame()

        def check(mode):

            with ensure_clean_path(self.path) as path:

                # constructor: read modes fail on a non-existent file
                if mode in ['r', 'r+']:
                    self.assertRaises(IOError, HDFStore, path, mode=mode)

                else:
                    store = HDFStore(path, mode=mode)
                    self.assertEqual(store._handle.mode, mode)
                    store.close()

            with ensure_clean_path(self.path) as path:

                # context
                if mode in ['r', 'r+']:
                    def f():
                        with get_store(path, mode=mode) as store:
                            pass
                    self.assertRaises(IOError, f)
                else:
                    with get_store(path, mode=mode) as store:
                        self.assertEqual(store._handle.mode, mode)

            with ensure_clean_path(self.path) as path:

                # conv write
                if mode in ['r', 'r+']:
                    self.assertRaises(
                        IOError, df.to_hdf, path, 'df', mode=mode)
                    df.to_hdf(path, 'df', mode='w')
                else:
                    df.to_hdf(path, 'df', mode=mode)

                # conv read: 'w' truncates, so the key is gone
                if mode in ['w']:
                    self.assertRaises(KeyError, read_hdf, path, 'df',
                                      mode=mode)
                else:
                    result = read_hdf(path, 'df', mode=mode)
                    assert_frame_equal(result, df)

        check('r')
        check('r+')
        check('a')
        check('w')
    def test_reopen_handle(self):
        # reopening an already-open store: mode changes, truncation, and
        # the is_open flag
        with ensure_clean_path(self.path) as path:

            store = HDFStore(path, mode='a')
            store['a'] = tm.makeTimeSeries()

            # invalid mode change
            self.assertRaises(PossibleDataLossError, store.open, 'w')
            store.close()
            self.assertFalse(store.is_open)

            # truncation ok here
            store.open('w')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 0)
            store.close()
            self.assertFalse(store.is_open)

            store = HDFStore(path, mode='a')
            store['a'] = tm.makeTimeSeries()

            # reopen as read
            store.open('r')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 1)
            self.assertEqual(store._mode, 'r')
            store.close()
            self.assertFalse(store.is_open)

            # reopen as append
            store.open('a')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 1)
            self.assertEqual(store._mode, 'a')
            store.close()
            self.assertFalse(store.is_open)

            # reopen as append (again)
            store.open('a')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 1)
            self.assertEqual(store._mode, 'a')
            store.close()
            self.assertFalse(store.is_open)
    def test_open_args(self):
        # extra keyword args are forwarded to the underlying pytables open
        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()

            # create an in memory store
            store = HDFStore(path, mode='a', driver='H5FD_CORE',
                             driver_core_backing_store=0)
            store['df'] = df
            store.append('df2', df)

            tm.assert_frame_equal(store['df'], df)
            tm.assert_frame_equal(store['df2'], df)

            store.close()

            # only supported on pytable >= 3.0.0
            if LooseVersion(tables.__version__) >= '3.0.0':

                # the file should not have actually been written
                self.assertFalse(os.path.exists(path))
    def test_flush(self):
        # flush (with and without fsync) must not raise on an open store
        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store.flush()
            store.flush(fsync=True)
    def test_get(self):
        # get() and __getitem__ agree, for both 'key' and '/key' spellings;
        # a missing key raises KeyError
        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            left = store.get('a')
            right = store['a']
            tm.assert_series_equal(left, right)

            left = store.get('/a')
            right = store['/a']
            tm.assert_series_equal(left, right)

            self.assertRaises(KeyError, store.get, 'b')
    def test_getattr(self):
        # stored keys are reachable as attributes; private attributes are
        # not shadowed by stored keys
        with ensure_clean_store(self.path) as store:

            s = tm.makeTimeSeries()
            store['a'] = s

            # test attribute access
            result = store.a
            tm.assert_series_equal(result, s)
            result = getattr(store, 'a')
            tm.assert_series_equal(result, s)

            df = tm.makeTimeDataFrame()
            store['df'] = df
            result = store.df
            tm.assert_frame_equal(result, df)

            # errors
            self.assertRaises(AttributeError, getattr, store, 'd')

            for x in ['mode', 'path', 'handle', 'complib']:
                self.assertRaises(AttributeError, getattr, store, x)

            # not stores: the underscore-prefixed internals are accessible
            for x in ['mode', 'path', 'handle', 'complib']:
                getattr(store, "_%s" % x)
    def test_put(self):
        # basic put semantics: fixed vs table formats and the append flag
        with ensure_clean_store(self.path) as store:

            ts = tm.makeTimeSeries()
            df = tm.makeTimeDataFrame()
            store['a'] = ts
            store['b'] = df[:10]
            store['foo/bar/bah'] = df[:10]
            store['foo'] = df[:10]
            store['/foo'] = df[:10]
            store.put('c', df[:10], format='table')

            # not OK, not a table
            self.assertRaises(
                ValueError, store.put, 'b', df[10:], append=True)

            # node does not currently exist, test _is_table_type returns
            # False in this case
            # _maybe_remove(store, 'f')
            # self.assertRaises(ValueError, store.put, 'f', df[10:],
            #                   append=True)

            # can't put to a table (use append instead)
            self.assertRaises(ValueError, store.put, 'c', df[10:],
                              append=True)

            # overwrite table
            store.put('c', df[:10], format='table', append=False)
            tm.assert_frame_equal(df[:10], store['c'])
    def test_put_string_index(self):
        # long (and mixed-length) string indexes round-trip through put
        with ensure_clean_store(self.path) as store:

            index = Index(
                ["I am a very long string index: %s" % i for i in range(20)])
            s = Series(np.arange(20), index=index)
            df = DataFrame({'A': s, 'B': s})

            store['a'] = s
            tm.assert_series_equal(store['a'], s)

            store['b'] = df
            tm.assert_frame_equal(store['b'], df)

            # mixed length
            index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] +
                          ["I am a very long string index: %s" % i
                           for i in range(20)])
            s = Series(np.arange(21), index=index)
            df = DataFrame({'A': s, 'B': s})
            store['a'] = s
            tm.assert_series_equal(store['a'], s)

            store['b'] = df
            tm.assert_frame_equal(store['b'], df)
    def test_put_compression(self):
        # zlib compression requires the table format
        with ensure_clean_store(self.path) as store:
            df = tm.makeTimeDataFrame()

            store.put('c', df, format='table', complib='zlib')
            tm.assert_frame_equal(store['c'], df)

            # can't compress if format='fixed'
            self.assertRaises(ValueError, store.put, 'b', df,
                              format='fixed', complib='zlib')
    def test_put_compression_blosc(self):
        # blosc compression requires the table format and PyTables >= 2.2
        tm.skip_if_no_package('tables', '2.2', app='blosc support')

        df = tm.makeTimeDataFrame()

        with ensure_clean_store(self.path) as store:

            # can't compress if format='fixed'
            self.assertRaises(ValueError, store.put, 'b', df,
                              format='fixed', complib='blosc')

            store.put('c', df, format='table', complib='blosc')
            tm.assert_frame_equal(store['c'], df)
    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)
    def test_put_mixed_type(self):
        # a frame mixing object/bool/int/timestamp/datetime columns
        # round-trips through put (fixed format)
        df = tm.makeTimeDataFrame()
        df['obj1'] = 'foo'
        df['obj2'] = 'bar'
        df['bool1'] = df['A'] > 0
        df['bool2'] = df['B'] > 0
        df['bool3'] = True
        df['int1'] = 1
        df['int2'] = 2
        df['timestamp1'] = Timestamp('20010102')
        df['timestamp2'] = Timestamp('20010103')
        df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
        df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
        df.ix[3:6, ['obj1']] = np.nan
        df = df.consolidate().convert_objects()

        with ensure_clean_store(self.path) as store:
            _maybe_remove(store, 'df')

            # cannot use assert_produces_warning here for some reason
            # a PendingDeprecationWarning is also raised?
            warnings.filterwarnings('ignore', category=PerformanceWarning)
            store.put('df', df)
            warnings.filterwarnings('always', category=PerformanceWarning)

            expected = store.get('df')
            tm.assert_frame_equal(expected, df)
    def test_append(self):
        # append semantics across frames, panels, 4D panels, custom axes,
        # odd node names, mixed dtypes and unsigned ints
        with ensure_clean_store(self.path) as store:
            df = tm.makeTimeDataFrame()
            _maybe_remove(store, 'df1')
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            _maybe_remove(store, 'df2')
            store.put('df2', df[:10], format='table')
            store.append('df2', df[10:])
            tm.assert_frame_equal(store['df2'], df)

            _maybe_remove(store, 'df3')
            store.append('/df3', df[:10])
            store.append('/df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)

            # this is allowed by almost always don't want to do it
            with tm.assert_produces_warning(
                    expected_warning=tables.NaturalNameWarning):
                _maybe_remove(store, '/df3 foo')
                store.append('/df3 foo', df[:10])
                store.append('/df3 foo', df[10:])
                tm.assert_frame_equal(store['df3 foo'], df)

            # panel
            wp = tm.makePanel()
            _maybe_remove(store, 'wp1')
            store.append('wp1', wp.ix[:, :10, :])
            store.append('wp1', wp.ix[:, 10:, :])
            assert_panel_equal(store['wp1'], wp)

            # ndim
            p4d = tm.makePanel4D()
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :])
            store.append('p4d', p4d.ix[:, :, 10:, :])
            assert_panel4d_equal(store['p4d'], p4d)

            # test using axis labels
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=[
                'items', 'major_axis', 'minor_axis'])
            store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
                'items', 'major_axis', 'minor_axis'])
            assert_panel4d_equal(store['p4d'], p4d)

            # test using different number of items on each axis
            p4d2 = p4d.copy()
            p4d2['l4'] = p4d['l1']
            p4d2['l5'] = p4d['l1']
            _maybe_remove(store, 'p4d2')
            store.append(
                'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis'])
            assert_panel4d_equal(store['p4d2'], p4d2)

            # test using different order of items on the non-index axes
            _maybe_remove(store, 'wp1')
            wp_append1 = wp.ix[:, :10, :]
            store.append('wp1', wp_append1)
            wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1])
            store.append('wp1', wp_append2)
            assert_panel_equal(store['wp1'], wp)

            # dtype issues - mixed type in a single object column
            df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
            df['mixed_column'] = 'testing'
            df.ix[2, 'mixed_column'] = np.nan
            _maybe_remove(store, 'df')
            store.append('df', df)
            tm.assert_frame_equal(store['df'], df)

            # uints - test storage of uints
            uint_data = DataFrame({
                'u08': Series(np.random.random_integers(0, high=255, size=5),
                              dtype=np.uint8),
                'u16': Series(np.random.random_integers(0, high=65535,
                                                        size=5),
                              dtype=np.uint16),
                'u32': Series(np.random.random_integers(0, high=2**30,
                                                        size=5),
                              dtype=np.uint32),
                'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62],
                              dtype=np.uint64)},
                index=np.arange(5))
            _maybe_remove(store, 'uints')
            store.append('uints', uint_data)
            tm.assert_frame_equal(store['uints'], uint_data)

            # uints - test storage of uints in indexable columns
            _maybe_remove(store, 'uints')
            # 64-bit indices not yet supported
            store.append('uints', uint_data,
                         data_columns=['u08', 'u16', 'u32'])
            tm.assert_frame_equal(store['uints'], uint_data)
    def test_append_series(self):
        # Series append: name preservation, value/index selection, and
        # MultiIndex round-trip
        with ensure_clean_store(self.path) as store:

            # basic
            ss = tm.makeStringSeries()
            ts = tm.makeTimeSeries()
            ns = Series(np.arange(100))

            store.append('ss', ss)
            result = store['ss']
            tm.assert_series_equal(result, ss)
            self.assertIsNone(result.name)

            store.append('ts', ts)
            result = store['ts']
            tm.assert_series_equal(result, ts)
            self.assertIsNone(result.name)

            ns.name = 'foo'
            store.append('ns', ns)
            result = store['ns']
            tm.assert_series_equal(result, ns)
            self.assertEqual(result.name, ns.name)

            # select on the values
            expected = ns[ns > 60]
            result = store.select('ns', Term('foo>60'))
            tm.assert_series_equal(result, expected)

            # select on the index and values
            expected = ns[(ns > 70) & (ns.index < 90)]
            result = store.select('ns', [Term('foo>70'), Term('index<90')])
            tm.assert_series_equal(result, expected)

            # multi-index
            mi = DataFrame(np.random.randn(5, 1), columns=['A'])
            mi['B'] = np.arange(len(mi))
            mi['C'] = 'foo'
            mi.loc[3:5, 'C'] = 'bar'
            mi.set_index(['C', 'B'], inplace=True)
            s = mi.stack()
            s.index = s.index.droplevel(2)
            store.append('mi', s)
            tm.assert_series_equal(store['mi'], s)
    def test_store_index_types(self):
        # GH5386
        # test storing various index types
        with ensure_clean_store(self.path) as store:

            def check(format, index):
                df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
                df.index = index(len(df))
                _maybe_remove(store, 'df')
                store.put('df', df, format=format)
                assert_frame_equal(df, store['df'])

            for index in [tm.makeFloatIndex, tm.makeStringIndex,
                          tm.makeIntIndex, tm.makeDateIndex,
                          tm.makePeriodIndex]:

                check('table', index)
                check('fixed', index)

            # unicode
            index = tm.makeUnicodeIndex
            if compat.PY3:
                check('table', index)
                check('fixed', index)
            else:

                # only support for fixed types (and they have a perf warning)
                self.assertRaises(TypeError, check, 'table', index)
                with tm.assert_produces_warning(
                        expected_warning=PerformanceWarning):
                    check('fixed', index)
    def test_encoding(self):
        # explicit encodings on append/select require PyTables >= 3.0.0
        # and a little-endian platform
        if LooseVersion(tables.__version__) < '3.0.0':
            raise nose.SkipTest('tables version does not support proper encoding')
        if sys.byteorder != 'little':
            raise nose.SkipTest('system byteorder is not little')

        with ensure_clean_store(self.path) as store:
            df = DataFrame(dict(A='foo', B='bar'), index=range(5))
            df.loc[2, 'A'] = np.nan
            df.loc[3, 'B'] = np.nan
            _maybe_remove(store, 'df')
            store.append('df', df, encoding='ascii')
            tm.assert_frame_equal(store['df'], df)

            expected = df.reindex(columns=['A'])
            result = store.select('df', Term('columns=A', encoding='ascii'))
            tm.assert_frame_equal(result, expected)
    def test_append_some_nans(self):
        # rows that are only partially NaN must survive a split append
        with ensure_clean_store(self.path) as store:
            df = DataFrame({'A': Series(np.random.randn(20)).astype('int32'),
                            'A1': np.random.randn(20),
                            'A2': np.random.randn(20),
                            'B': 'foo', 'C': 'bar',
                            'D': Timestamp("20010101"),
                            'E': datetime.datetime(2001, 1, 2, 0, 0)},
                           index=np.arange(20))
            # some nans
            _maybe_remove(store, 'df1')
            df.ix[0:15, ['A1', 'B', 'D', 'E']] = np.nan
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            # first column
            df1 = df.copy()
            df1.ix[:, 'A1'] = np.nan
            _maybe_remove(store, 'df1')
            store.append('df1', df1[:10])
            store.append('df1', df1[10:])
            tm.assert_frame_equal(store['df1'], df1)

            # 2nd column
            df2 = df.copy()
            df2.ix[:, 'A2'] = np.nan
            _maybe_remove(store, 'df2')
            store.append('df2', df2[:10])
            store.append('df2', df2[10:])
            tm.assert_frame_equal(store['df2'], df2)

            # datetimes
            df3 = df.copy()
            df3.ix[:, 'E'] = np.nan
            _maybe_remove(store, 'df3')
            store.append('df3', df3[:10])
            store.append('df3', df3[10:])
            tm.assert_frame_equal(store['df3'], df3)
    def test_append_all_nans(self):
        # all-NaN rows: dropped with dropna=True (unless strings or dates
        # force the row to be written), kept with dropna=False
        with ensure_clean_store(self.path) as store:

            df = DataFrame({'A1': np.random.randn(20),
                            'A2': np.random.randn(20)},
                           index=np.arange(20))
            df.ix[0:15, :] = np.nan

            # nan some entire rows (dropna=True)
            _maybe_remove(store, 'df')
            store.append('df', df[:10], dropna=True)
            store.append('df', df[10:], dropna=True)
            tm.assert_frame_equal(store['df'], df[-4:])

            # nan some entire rows (dropna=False)
            _maybe_remove(store, 'df2')
            store.append('df2', df[:10], dropna=False)
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)

            # tests the option io.hdf.dropna_table
            pandas.set_option('io.hdf.dropna_table', False)
            _maybe_remove(store, 'df3')
            store.append('df3', df[:10])
            store.append('df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)

            pandas.set_option('io.hdf.dropna_table', True)
            _maybe_remove(store, 'df4')
            store.append('df4', df[:10])
            store.append('df4', df[10:])
            tm.assert_frame_equal(store['df4'], df[-4:])

            # nan some entire rows (string are still written!)
            df = DataFrame({'A1': np.random.randn(20),
                            'A2': np.random.randn(20),
                            'B': 'foo', 'C': 'bar'},
                           index=np.arange(20))

            df.ix[0:15, :] = np.nan

            _maybe_remove(store, 'df')
            store.append('df', df[:10], dropna=True)
            store.append('df', df[10:], dropna=True)
            tm.assert_frame_equal(store['df'], df)

            _maybe_remove(store, 'df2')
            store.append('df2', df[:10], dropna=False)
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)

            # nan some entire rows (but since we have dates they are still
            # written!)
            df = DataFrame({'A1': np.random.randn(20),
                            'A2': np.random.randn(20),
                            'B': 'foo', 'C': 'bar',
                            'D': Timestamp("20010101"),
                            'E': datetime.datetime(2001, 1, 2, 0, 0)},
                           index=np.arange(20))

            df.ix[0:15, :] = np.nan

            _maybe_remove(store, 'df')
            store.append('df', df[:10], dropna=True)
            store.append('df', df[10:], dropna=True)
            tm.assert_frame_equal(store['df'], df)

            _maybe_remove(store, 'df2')
            store.append('df2', df[:10], dropna=False)
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)
    def test_append_frame_column_oriented(self):
        # appending along the columns axis, then selecting by column and
        # by (non-indexable) index values
        with ensure_clean_store(self.path) as store:

            # column oriented
            df = tm.makeTimeDataFrame()
            _maybe_remove(store, 'df1')
            store.append('df1', df.ix[:, :2], axes=['columns'])
            store.append('df1', df.ix[:, 2:])
            tm.assert_frame_equal(store['df1'], df)

            result = store.select('df1', 'columns=A')
            expected = df.reindex(columns=['A'])
            tm.assert_frame_equal(expected, result)

            # selection on the non-indexable
            result = store.select(
                'df1', ('columns=A', Term('index=df.index[0:4]')))
            expected = df.reindex(columns=['A'], index=df.index[0:4])
            tm.assert_frame_equal(expected, result)

            # this isn't supported
            self.assertRaises(TypeError, store.select, 'df1', (
                'columns=A', Term('index>df.index[4]')))
    def test_append_with_different_block_ordering(self):
        # GH 4096; using same frames, but different block orderings
        with ensure_clean_store(self.path) as store:

            for i in range(10):

                df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
                df['index'] = range(10)
                df['index'] += i * 10
                df['int64'] = Series([1] * len(df), dtype='int64')
                df['int16'] = Series([1] * len(df), dtype='int16')

                # shuffle block ordering between iterations
                if i % 2 == 0:
                    del df['int64']
                    df['int64'] = Series([1] * len(df), dtype='int64')
                if i % 3 == 0:
                    a = df.pop('A')
                    df['A'] = a

                df.set_index('index', inplace=True)

                store.append('df', df)

        # test a different ordering but with more fields (like invalid
        # combinations)
        with ensure_clean_store(self.path) as store:

            df = DataFrame(np.random.randn(10, 2),
                           columns=list('AB'), dtype='float64')
            df['int64'] = Series([1] * len(df), dtype='int64')
            df['int16'] = Series([1] * len(df), dtype='int16')
            store.append('df', df)

            # store additional fields in different blocks
            df['int16_2'] = Series([1] * len(df), dtype='int16')
            self.assertRaises(ValueError, store.append, 'df', df)

            # store multiple additional fields in different blocks
            df['float_3'] = Series([1.] * len(df), dtype='float64')
            self.assertRaises(ValueError, store.append, 'df', df)
    def test_ndim_indexables(self):
        """ test using ndim tables in new ways"""
        with ensure_clean_store(self.path) as store:

            p4d = tm.makePanel4D()

            def check_indexers(key, indexers):
                # verify each indexer column sits at its expected position
                # in the underlying pytables description
                for i, idx in enumerate(indexers):
                    self.assertTrue(getattr(getattr(
                        store.root, key).table.description, idx)._v_pos == i)

            # append then change (will take existing schema)
            indexers = ['items', 'major_axis', 'minor_axis']

            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :])
            assert_panel4d_equal(store.select('p4d'), p4d)
            check_indexers('p4d', indexers)

            # same as above, but try to append with different axes
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
                'labels', 'items', 'major_axis'])
            assert_panel4d_equal(store.select('p4d'), p4d)
            check_indexers('p4d', indexers)

            # pass incorrect number of axes
            _maybe_remove(store, 'p4d')
            self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[
                :, :, :10, :], axes=['major_axis', 'minor_axis'])

            # different than default indexables #1
            indexers = ['labels', 'major_axis', 'minor_axis']
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :])
            assert_panel4d_equal(store['p4d'], p4d)
            check_indexers('p4d', indexers)

            # different than default indexables #2
            indexers = ['major_axis', 'labels', 'minor_axis']
            _maybe_remove(store, 'p4d')
            store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
            store.append('p4d', p4d.ix[:, :, 10:, :])
            assert_panel4d_equal(store['p4d'], p4d)
            check_indexers('p4d', indexers)

            # partial selection
            result = store.select('p4d', ['labels=l1'])
            expected = p4d.reindex(labels=['l1'])
            assert_panel4d_equal(result, expected)

            # partial selection2
            result = store.select('p4d', [Term(
                'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')])
            expected = p4d.reindex(
                labels=['l1'], items=['ItemA'], minor_axis=['B'])
            assert_panel4d_equal(result, expected)

            # non-existent partial selection
            result = store.select('p4d', [Term(
                'labels=l1'), Term('items=Item1'), Term('minor_axis=B')])
            expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B'])
            assert_panel4d_equal(result, expected)
def test_append_with_strings(self):
    """Exercise string handling in ``HDFStore.append``: ``min_itemsize``
    (scalar and dict forms), truncation errors when a later append carries
    strings wider than the stored column, NaN-bearing string columns, and
    the interaction of ``min_itemsize`` with ``data_columns``.
    """

    with ensure_clean_store(self.path) as store:
        wp = tm.makePanel()
        # wp2 has the same shape but longer minor_axis labels
        # ("<label>_extra"), used to probe itemsize limits below.
        wp2 = wp.rename_axis(
            dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2)

        def check_col(key, name, size):
            # Assert the on-disk PyTables column `name` of stored object
            # `key` was created with the expected string itemsize.
            self.assertEqual(getattr(store.get_storer(key).table.description, name).itemsize, size)

        # scalar min_itemsize: reserves width 20 so the wider wp2 labels
        # fit on the second append
        store.append('s1', wp, min_itemsize=20)
        store.append('s1', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        assert_panel_equal(store['s1'], expected)
        check_col('s1', 'minor_axis', 20)

        # test dict format
        store.append('s2', wp, min_itemsize={'minor_axis': 20})
        store.append('s2', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        assert_panel_equal(store['s2'], expected)
        check_col('s2', 'minor_axis', 20)

        # apply the wrong field (similar to #1): widening major_axis does
        # not help minor_axis, so appending wp2 must raise
        store.append('s3', wp, min_itemsize={'major_axis': 20})
        self.assertRaises(ValueError, store.append, 's3', wp2)

        # test truncation of bigger strings: no min_itemsize at all, so
        # the longer wp2 labels cannot fit
        store.append('s4', wp)
        self.assertRaises(ValueError, store.append, 's4', wp2)

        # avoid truncation on elements: column width is inferred from the
        # longest value ('dggnhebbsdfbdfb' -> 15)
        df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
        store.append('df_big', df)
        tm.assert_frame_equal(store.select('df_big'), df)
        check_col('df_big', 'values_block_1', 15)

        # appending smaller string ok (fits in the existing 15-wide column)
        df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']])
        store.append('df_big', df2)
        expected = concat([df, df2])
        tm.assert_frame_equal(store.select('df_big'), expected)
        check_col('df_big', 'values_block_1', 15)

        # avoid truncation on elements: explicit 'values' min_itemsize
        # overrides the inferred width
        df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
        store.append('df_big2', df, min_itemsize={'values': 50})
        tm.assert_frame_equal(store.select('df_big2'), df)
        check_col('df_big2', 'values_block_1', 50)

        # bigger string on next append: second frame's 26-char string
        # exceeds the width inferred from the first append, so it raises
        store.append('df_new', df)
        df_new = DataFrame(
            [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']])
        self.assertRaises(ValueError, store.append, 'df_new', df_new)

        # with nans: string columns containing NaN round-trip unchanged
        _maybe_remove(store, 'df')
        df = tm.makeTimeDataFrame()
        df['string'] = 'foo'
        df.ix[1:4, 'string'] = np.nan
        df['string2'] = 'bar'
        df.ix[4:8, 'string2'] = np.nan
        df['string3'] = 'bah'
        df.ix[1:, 'string3'] = np.nan
        store.append('df', df)
        result = store.select('df')
        tm.assert_frame_equal(result, df)

    with ensure_clean_store(self.path) as store:

        def check_col(key, name, size):
            # Same helper as above, re-bound to this fresh store.
            self.assertEqual(getattr(store.get_storer(key).table.description, name).itemsize, size)

        df = DataFrame(dict(A = 'foo', B = 'bar'), index=range(10))

        # a min_itemsize that creates a data_column: naming 'A' in the
        # dict implicitly promotes it to a queryable data column
        _maybe_remove(store, 'df')
        store.append('df', df, min_itemsize={'A' : 200 })
        check_col('df', 'A', 200)
        self.assertEqual(store.get_storer('df').data_columns, ['A'])

        # a min_itemsize that creates a data_column2: explicit
        # data_columns=['B'] plus min_itemsize on 'A' yields both,
        # explicit ones first
        _maybe_remove(store, 'df')
        store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 })
        check_col('df', 'A', 200)
        self.assertEqual(store.get_storer('df').data_columns, ['B','A'])

        # a min_itemsize that creates a data_column2: the 'values' key
        # sizes the data column 'B' AND the remaining values block, but
        # adds no new data columns
        _maybe_remove(store, 'df')
        store.append('df', df, data_columns = ['B'], min_itemsize={'values' : 200 })
        check_col('df', 'B', 200)
        check_col('df', 'values_block_0', 200)
        self.assertEqual(store.get_storer('df').data_columns, ['B'])

        # infer the .typ on subsequent appends
        _maybe_remove(store, 'df')
        store.append('df', df[:5], min_itemsize=200)
        store.append('df', df[5:], min_itemsize=200)
        tm.assert_frame_equal(store['df'], df)

        # invalid min_itemsize keys: names that match no column/axis raise
        df = DataFrame(['foo','foo','foo','barh','barh','barh'], columns=['A'])
        _maybe_remove(store, 'df')
        self.assertRaises(ValueError, store.append, 'df', df, min_itemsize={'foo' : 20, 'foobar' : 20})
- def test_append_with_data_columns(self):
- with ensure_clean_store(self.path) as store:
- df = tm.makeTimeDataFrame()
- df.loc[:,'B'].iloc[0] = 1.
- _maybe_remove(store, 'df')
- store.append('df', df[:2], data_columns=['B'])
- store.append('df', df[2:])
- tm.assert_frame_equal(store['df'], df)
- # check that we have indicies created
- assert(store._handle.root.df.table.cols.index.is_indexed is True)
- assert(store._handle.root.df.table.cols.B.is_indexed is True)
- # data column searching
- result = store.select('df', [Term('B>0')])
- expected = df[df.B > 0]
- tm.assert_frame_equal(result, expected)
- # data column searching (with an indexable and a data_columns)
- result = store.select(
- 'df', [Term('B>0'), Term('index>df.index[3]')])
- df_new = df.reindex(index=df.index[4:])
- expected = df_new[df_new.B > 0]
- tm.assert_frame_equal(result, expected)
- # data column selection with a string data_column
- df_new = df.copy()
- df_new['string'] = 'foo'
- df_new['string'][1:4] = np.nan
- df_new['string'][5:6] = 'bar'
- _maybe_remove(store, 'df')
- store.append('df', df_new, data_columns=['string'])
- result = store.select('df', [Term('string=foo')])
- expected = df_new[df_new.string == 'foo']
- tm.assert_frame_equal(result, expected)
- # using min_itemsize and a data column
- def check_col(key,name,size):
- self.assertEqual(getattr(store.get_storer(key).table.description,name).itemsize, size)
- with ensure_clean_store(self.path) as store:
- _maybe_remove(store, 'df')
- store.append('df', df_new, data_columns=['string'],
- min_itemsize={'string': 30})
- check_col('df', 'string', 30)
- _maybe_remove(store, 'df')
- store.append(
- 'df', df_new, data_columns=['string'], min_itemsize=30)
- check_col('df', 'string', 30)
- _maybe_remove(store, 'df')
- store.append('df', df_new, data_columns=['string'],
- min_itemsize={'values': 30})
- check_col('df', 'string', 30)
- with ensure_clean_store(self.path) as store:
- df_new['string2'] = 'foobarbah'
- df_new['string_block1'] = 'foobarbah1'
- df_new['string_block2'] = 'foobarbah2'
- _maybe_remove(store, 'df')
- store.append('df', df_new, data_columns=['string', 'string2'], min_itemsize={'string': 30, 'string2': 40, 'values': 50})
- check_col('df', 'string', 30)
- check_col('df', 'string2', 40)
- check_col('df', 'values_block_1', 50)
- with ensure_clean_store(self.path) as store:
- # multiple data columns
- df_new = df.copy()
- df_new.loc[:,'A'].iloc[0] = 1.
- df_new.loc[:,'B'].iloc[0] = -1.
- df_new['string'] = 'foo'
- df_new['string'][1:4] = np.nan
- df_new['string'][5:6] = 'bar'
- df_new['string2'] = 'foo'
- df_new['string2'][2:5] = np.nan
- df_new['string2'][7:8] = 'bar'
- _maybe_remove(store, 'df')
- store.append(
- 'df', df_new, data_columns=['A', 'B', 'string', 'string2'])
- result = store.select('df', [Term('string=foo'), Term(
- 'string2=foo'), Term('A>0'), Term('B<0')])
- expected = df_new[(df_new.string == 'foo') & (
- df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)]
- tm.assert_frame_equal(result, expected, check_index_type=False)
- # yield an empty frame
- result = store.select('df', [Term('string=foo'), Term(
- 'string2=cool')])
- expected = df_new[(df_new.string == 'foo') & (
- df_new.string2 == 'cool')]
- tm.assert_frame_equal(result, expected, check_index_type=False)
- with ensure_clean_store(self.path) as store:
- # doc example
- df_dc = df.copy()
- df_dc['string'] = 'foo'
- df_dc.ix[4:6, 'string'] = np.nan
- df_dc.ix[7:9, 'string'] = 'bar'
- df_dc['string2'] = 'cool'
- df_dc['datetime'] = Timestamp('20010102')
- df_dc = df_dc.conver…
Large files files are truncated, but you can click here to view the full file