
/pandas/io/tests/test_pytables.py

http://github.com/wesm/pandas
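"""Tests for HDFStore and the PyTables-backed HDF5 IO in pandas.io.pytables."""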
import nose
import sys
import os
import warnings
import tempfile
from contextlib import contextmanager
import datetime

import numpy as np

import pandas
import pandas as pd
from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index,
                    RangeIndex, Categorical, bdate_range,
                    date_range, timedelta_range, Index, DatetimeIndex,
                    isnull)

from pandas.compat import is_platform_windows, PY3, PY35
from pandas.formats.printing import pprint_thing
from pandas.io.pytables import _tables, TableIterator
try:
    _tables()
except ImportError as e:
    raise nose.SkipTest(e)

from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf,
                                IncompatibilityWarning, PerformanceWarning,
                                AttributeConflictWarning, DuplicateWarning,
                                PossibleDataLossError, ClosedFileError)

from pandas.io import pytables as pytables
import pandas.util.testing as tm
from pandas.util.testing import (assert_panel4d_equal,
                                 assert_panel_equal,
                                 assert_frame_equal,
                                 assert_series_equal,
                                 assert_produces_warning,
                                 set_timezone)
from pandas import concat, Timestamp
from pandas import compat
from pandas.compat import range, lrange, u

try:
    import tables
except ImportError:
    raise nose.SkipTest('no pytables')

from distutils.version import LooseVersion

_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2'
                       else 'zlib')
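# NOTE: blosc is generally the faster compressor, but it is assumed here to be
# usable only with PyTables >= 2.2; older installs fall back to zlib.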
_multiprocess_can_split_ = False

# testing on windows/py3 seems to fault
# for using compression
skip_compression = PY3 and is_platform_windows()


# file cleanup helpers, used directly and by the contextmanagers below
def safe_remove(path):
    if path is not None:
        try:
            os.remove(path)
        except:
            pass


def safe_close(store):
    try:
        if store is not None:
            store.close()
    except:
        pass


def create_tempfile(path):
    """ create an unopened named temporary file """
    return os.path.join(tempfile.gettempdir(), path)


@contextmanager
def ensure_clean_store(path, mode='a', complevel=None, complib=None,
                       fletcher32=False):

    try:

        # put in the temporary path if we don't have one already
        if not len(os.path.dirname(path)):
            path = create_tempfile(path)

        store = HDFStore(path, mode=mode, complevel=complevel,
                         complib=complib, fletcher32=fletcher32)
        yield store
    finally:
        safe_close(store)
        if mode == 'w' or mode == 'a':
            safe_remove(path)


@contextmanager
def ensure_clean_path(path):
    """
    return essentially a named temporary file that is not opened
    and deleted on exiting; if path is a list, then create and
    return a list of filenames
    """
    try:
        if isinstance(path, list):
            filenames = [create_tempfile(p) for p in path]
            yield filenames
        else:
            filenames = [create_tempfile(path)]
            yield filenames[0]
    finally:
        for f in filenames:
            safe_remove(f)


# set these parameters so we don't have file sharing
tables.parameters.MAX_NUMEXPR_THREADS = 1
tables.parameters.MAX_BLOSC_THREADS = 1
tables.parameters.MAX_THREADS = 1
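# NOTE: forcing single-threaded numexpr/blosc/PyTables operation above keeps
# concurrent test processes from faulting while sharing HDF5 files.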
def _maybe_remove(store, key):
    """For tests using tables, try removing the table to be sure there is
    no content from previous tests using the same table name."""
    try:
        store.remove(key)
    except:
        pass


@contextmanager
def compat_assert_produces_warning(w):
    """ don't produce a warning under PY3 """
    if compat.PY3:
        yield
    else:
        with tm.assert_produces_warning(expected_warning=w,
                                        check_stacklevel=False):
            yield
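# NOTE: under PY3 some of the legacy warnings are presumably not emitted at
# all, so asserting on them would fail; the shim above simply skips the check.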
class Base(tm.TestCase):

    @classmethod
    def setUpClass(cls):
        super(Base, cls).setUpClass()

        # Pytables 3.0.0 deprecates lots of things
        tm.reset_testing_mode()

    @classmethod
    def tearDownClass(cls):
        super(Base, cls).tearDownClass()

        # Pytables 3.0.0 deprecates lots of things
        tm.set_testing_mode()

    def setUp(self):
        warnings.filterwarnings(action='ignore', category=FutureWarning)
        self.path = 'tmp.__%s__.h5' % tm.rands(10)

    def tearDown(self):
        pass
class TestHDFStore(Base, tm.TestCase):

    def test_factory_fun(self):
        path = create_tempfile(self.path)
        try:
            with get_store(path) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass
        finally:
            safe_remove(path)

        try:
            with get_store(path) as tbl:
                tbl['a'] = tm.makeDataFrame()

            with get_store(path) as tbl:
                self.assertEqual(len(tbl), 1)
                self.assertEqual(type(tbl['a']), DataFrame)
        finally:
            safe_remove(path)

    def test_context(self):
        path = create_tempfile(self.path)
        try:
            with HDFStore(path) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass
        finally:
            safe_remove(path)

        try:
            with HDFStore(path) as tbl:
                tbl['a'] = tm.makeDataFrame()

            with HDFStore(path) as tbl:
                self.assertEqual(len(tbl), 1)
                self.assertEqual(type(tbl['a']), DataFrame)
        finally:
            safe_remove(path)

    def test_conv_read_write(self):
        path = create_tempfile(self.path)
        try:
            def roundtrip(key, obj, **kwargs):
                obj.to_hdf(path, key, **kwargs)
                return read_hdf(path, key)

            o = tm.makeTimeSeries()
            assert_series_equal(o, roundtrip('series', o))

            o = tm.makeStringSeries()
            assert_series_equal(o, roundtrip('string_series', o))

            o = tm.makeDataFrame()
            assert_frame_equal(o, roundtrip('frame', o))

            o = tm.makePanel()
            assert_panel_equal(o, roundtrip('panel', o))

            # table
            df = DataFrame(dict(A=lrange(5), B=lrange(5)))
            df.to_hdf(path, 'table', append=True)
            result = read_hdf(path, 'table', where=['index>2'])
            assert_frame_equal(df[df.index > 2], result)
        finally:
            safe_remove(path)

    def test_long_strings(self):

        # GH6166
        # unconversion of long strings was being chopped in earlier
        # versions of numpy < 1.7.2
        df = DataFrame({'a': tm.rands_array(100, size=10)},
                       index=tm.rands_array(100, size=10))

        with ensure_clean_store(self.path) as store:
            store.append('df', df, data_columns=['a'])

            result = store.select('df')
            assert_frame_equal(df, result)

    def test_api(self):

        # GH4584
        # API issue when to_hdf doesn't accept append AND format args
        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()
            df.iloc[:10].to_hdf(path, 'df', append=True, format='table')
            df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
            assert_frame_equal(read_hdf(path, 'df'), df)

            # append to False
            df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
            df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
            assert_frame_equal(read_hdf(path, 'df'), df)

        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()
            df.iloc[:10].to_hdf(path, 'df', append=True)
            df.iloc[10:].to_hdf(path, 'df', append=True, format='table')
            assert_frame_equal(read_hdf(path, 'df'), df)

            # append to False
            df.iloc[:10].to_hdf(path, 'df', append=False, format='table')
            df.iloc[10:].to_hdf(path, 'df', append=True)
            assert_frame_equal(read_hdf(path, 'df'), df)

        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()
            df.to_hdf(path, 'df', append=False, format='fixed')
            assert_frame_equal(read_hdf(path, 'df'), df)

            df.to_hdf(path, 'df', append=False, format='f')
            assert_frame_equal(read_hdf(path, 'df'), df)

            df.to_hdf(path, 'df', append=False)
            assert_frame_equal(read_hdf(path, 'df'), df)

            df.to_hdf(path, 'df')
            assert_frame_equal(read_hdf(path, 'df'), df)

        with ensure_clean_store(self.path) as store:

            path = store._path
            df = tm.makeDataFrame()

            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=True, format='table')
            store.append('df', df.iloc[10:], append=True, format='table')
            assert_frame_equal(store.select('df'), df)

            # append to False
            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=False, format='table')
            store.append('df', df.iloc[10:], append=True, format='table')
            assert_frame_equal(store.select('df'), df)

            # formats
            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=False, format='table')
            store.append('df', df.iloc[10:], append=True, format='table')
            assert_frame_equal(store.select('df'), df)

            _maybe_remove(store, 'df')
            store.append('df', df.iloc[:10], append=False, format='table')
            store.append('df', df.iloc[10:], append=True, format=None)
            assert_frame_equal(store.select('df'), df)

        with ensure_clean_path(self.path) as path:

            # invalid
            df = tm.makeDataFrame()
            self.assertRaises(ValueError, df.to_hdf, path,
                              'df', append=True, format='f')
            self.assertRaises(ValueError, df.to_hdf, path,
                              'df', append=True, format='fixed')

            self.assertRaises(TypeError, df.to_hdf, path,
                              'df', append=True, format='foo')
            self.assertRaises(TypeError, df.to_hdf, path,
                              'df', append=False, format='bar')

        # File path doesn't exist
        path = ""
        self.assertRaises(IOError, read_hdf, path, 'df')
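    # NOTE: 'io.hdf.default_format' globally picks the on-disk layout when
    # none is given: 'fixed' (fast, write-once) or 'table'
    # (appendable/queryable).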
    def test_api_default_format(self):

        # default_format option
        with ensure_clean_store(self.path) as store:
            df = tm.makeDataFrame()

            pandas.set_option('io.hdf.default_format', 'fixed')
            _maybe_remove(store, 'df')
            store.put('df', df)
            self.assertFalse(store.get_storer('df').is_table)
            self.assertRaises(ValueError, store.append, 'df2', df)

            pandas.set_option('io.hdf.default_format', 'table')
            _maybe_remove(store, 'df')
            store.put('df', df)
            self.assertTrue(store.get_storer('df').is_table)
            _maybe_remove(store, 'df2')
            store.append('df2', df)
            self.assertTrue(store.get_storer('df2').is_table)

            pandas.set_option('io.hdf.default_format', None)

        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()

            pandas.set_option('io.hdf.default_format', 'fixed')
            df.to_hdf(path, 'df')
            with get_store(path) as store:
                self.assertFalse(store.get_storer('df').is_table)
            self.assertRaises(ValueError, df.to_hdf, path, 'df2', append=True)

            pandas.set_option('io.hdf.default_format', 'table')
            df.to_hdf(path, 'df3')
            with HDFStore(path) as store:
                self.assertTrue(store.get_storer('df3').is_table)
            df.to_hdf(path, 'df4', append=True)
            with HDFStore(path) as store:
                self.assertTrue(store.get_storer('df4').is_table)

            pandas.set_option('io.hdf.default_format', None)
    def test_keys(self):

        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeStringSeries()
            store['c'] = tm.makeDataFrame()
            store['d'] = tm.makePanel()
            store['foo/bar'] = tm.makePanel()
            self.assertEqual(len(store), 5)
            expected = set(['/a', '/b', '/c', '/d', '/foo/bar'])
            self.assertTrue(set(store.keys()) == expected)
            self.assertTrue(set(store) == expected)

    def test_iter_empty(self):

        with ensure_clean_store(self.path) as store:
            # GH 12221
            self.assertTrue(list(store) == [])

    def test_repr(self):

        with ensure_clean_store(self.path) as store:
            repr(store)
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeStringSeries()
            store['c'] = tm.makeDataFrame()
            store['d'] = tm.makePanel()
            store['foo/bar'] = tm.makePanel()
            store.append('e', tm.makePanel())

            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['bool3'] = True
            df['int1'] = 1
            df['int2'] = 2
            df['timestamp1'] = Timestamp('20010102')
            df['timestamp2'] = Timestamp('20010103')
            df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
            df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
            df.ix[3:6, ['obj1']] = np.nan
            df = df.consolidate()._convert(datetime=True)

            warnings.filterwarnings('ignore', category=PerformanceWarning)
            store['df'] = df
            warnings.filterwarnings('always', category=PerformanceWarning)

            # make a random group in hdf space
            store._handle.create_group(store._handle.root, 'bah')

            repr(store)
            str(store)

        # storers
        with ensure_clean_store(self.path) as store:

            df = tm.makeDataFrame()
            store.append('df', df)

            s = store.get_storer('df')
            repr(s)
            str(s)
    def test_contains(self):

        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeDataFrame()
            store['foo/bar'] = tm.makeDataFrame()
            self.assertIn('a', store)
            self.assertIn('b', store)
            self.assertNotIn('c', store)
            self.assertIn('foo/bar', store)
            self.assertIn('/foo/bar', store)
            self.assertNotIn('/foo/b', store)
            self.assertNotIn('bar', store)

            # GH 2694
            warnings.filterwarnings(
                'ignore', category=tables.NaturalNameWarning)
            store['node())'] = tm.makeDataFrame()
            self.assertIn('node())', store)

    def test_versioning(self):

        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store['b'] = tm.makeDataFrame()
            df = tm.makeTimeDataFrame()
            _maybe_remove(store, 'df1')
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            self.assertEqual(store.root.a._v_attrs.pandas_version, '0.15.2')
            self.assertEqual(store.root.b._v_attrs.pandas_version, '0.15.2')
            self.assertEqual(store.root.df1._v_attrs.pandas_version, '0.15.2')

            # write a file and wipe its versioning
            _maybe_remove(store, 'df2')
            store.append('df2', df)

            # this is an error because its table_type is appendable, but no
            # version info
            store.get_node('df2')._v_attrs.pandas_version = None
            self.assertRaises(Exception, store.select, 'df2')
    def test_mode(self):

        df = tm.makeTimeDataFrame()

        def check(mode):

            with ensure_clean_path(self.path) as path:

                # constructor
                if mode in ['r', 'r+']:
                    self.assertRaises(IOError, HDFStore, path, mode=mode)

                else:
                    store = HDFStore(path, mode=mode)
                    self.assertEqual(store._handle.mode, mode)
                    store.close()

            with ensure_clean_path(self.path) as path:

                # context
                if mode in ['r', 'r+']:
                    def f():
                        with HDFStore(path, mode=mode) as store:  # noqa
                            pass
                    self.assertRaises(IOError, f)
                else:
                    with HDFStore(path, mode=mode) as store:
                        self.assertEqual(store._handle.mode, mode)

            with ensure_clean_path(self.path) as path:

                # conv write
                if mode in ['r', 'r+']:
                    self.assertRaises(IOError, df.to_hdf,
                                      path, 'df', mode=mode)
                    df.to_hdf(path, 'df', mode='w')
                else:
                    df.to_hdf(path, 'df', mode=mode)

                # conv read
                if mode in ['w']:
                    self.assertRaises(ValueError, read_hdf,
                                      path, 'df', mode=mode)
                else:
                    result = read_hdf(path, 'df', mode=mode)
                    assert_frame_equal(result, df)

        def check_default_mode():

            # read_hdf uses default mode
            with ensure_clean_path(self.path) as path:
                df.to_hdf(path, 'df', mode='w')
                result = read_hdf(path, 'df')
                assert_frame_equal(result, df)

        check('r')
        check('r+')
        check('a')
        check('w')
        check_default_mode()
    def test_reopen_handle(self):

        with ensure_clean_path(self.path) as path:

            store = HDFStore(path, mode='a')
            store['a'] = tm.makeTimeSeries()

            # invalid mode change
            self.assertRaises(PossibleDataLossError, store.open, 'w')
            store.close()
            self.assertFalse(store.is_open)

            # truncation ok here
            store.open('w')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 0)
            store.close()
            self.assertFalse(store.is_open)

            store = HDFStore(path, mode='a')
            store['a'] = tm.makeTimeSeries()

            # reopen as read
            store.open('r')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 1)
            self.assertEqual(store._mode, 'r')
            store.close()
            self.assertFalse(store.is_open)

            # reopen as append
            store.open('a')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 1)
            self.assertEqual(store._mode, 'a')
            store.close()
            self.assertFalse(store.is_open)

            # reopen as append (again)
            store.open('a')
            self.assertTrue(store.is_open)
            self.assertEqual(len(store), 1)
            self.assertEqual(store._mode, 'a')
            store.close()
            self.assertFalse(store.is_open)

    def test_open_args(self):

        with ensure_clean_path(self.path) as path:

            df = tm.makeDataFrame()

            # create an in memory store
            store = HDFStore(path, mode='a', driver='H5FD_CORE',
                             driver_core_backing_store=0)
            store['df'] = df
            store.append('df2', df)

            tm.assert_frame_equal(store['df'], df)
            tm.assert_frame_equal(store['df2'], df)

            store.close()

            # the file should not have actually been written
            self.assertFalse(os.path.exists(path))

    def test_flush(self):

        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            store.flush()
            store.flush(fsync=True)
    def test_get(self):

        with ensure_clean_store(self.path) as store:
            store['a'] = tm.makeTimeSeries()
            left = store.get('a')
            right = store['a']
            tm.assert_series_equal(left, right)

            left = store.get('/a')
            right = store['/a']
            tm.assert_series_equal(left, right)

            self.assertRaises(KeyError, store.get, 'b')

    def test_getattr(self):

        with ensure_clean_store(self.path) as store:

            s = tm.makeTimeSeries()
            store['a'] = s

            # test attribute access
            result = store.a
            tm.assert_series_equal(result, s)
            result = getattr(store, 'a')
            tm.assert_series_equal(result, s)

            df = tm.makeTimeDataFrame()
            store['df'] = df
            result = store.df
            tm.assert_frame_equal(result, df)

            # errors
            self.assertRaises(AttributeError, getattr, store, 'd')

            for x in ['mode', 'path', 'handle', 'complib']:
                self.assertRaises(AttributeError, getattr, store, x)

            # not stores
            for x in ['mode', 'path', 'handle', 'complib']:
                getattr(store, "_%s" % x)
    def test_put(self):

        with ensure_clean_store(self.path) as store:

            ts = tm.makeTimeSeries()
            df = tm.makeTimeDataFrame()
            store['a'] = ts
            store['b'] = df[:10]
            store['foo/bar/bah'] = df[:10]
            store['foo'] = df[:10]
            store['/foo'] = df[:10]
            store.put('c', df[:10], format='table')

            # not OK, not a table
            self.assertRaises(
                ValueError, store.put, 'b', df[10:], append=True)

            # node does not currently exist, test _is_table_type returns False
            # in this case
            # _maybe_remove(store, 'f')
            # self.assertRaises(ValueError, store.put, 'f', df[10:],
            #                   append=True)

            # can't put to a table (use append instead)
            self.assertRaises(ValueError, store.put, 'c', df[10:],
                              append=True)

            # overwrite table
            store.put('c', df[:10], format='table', append=False)
            tm.assert_frame_equal(df[:10], store['c'])

    def test_put_string_index(self):

        with ensure_clean_store(self.path) as store:

            index = Index(
                ["I am a very long string index: %s" % i for i in range(20)])
            s = Series(np.arange(20), index=index)
            df = DataFrame({'A': s, 'B': s})

            store['a'] = s
            tm.assert_series_equal(store['a'], s)

            store['b'] = df
            tm.assert_frame_equal(store['b'], df)

            # mixed length
            index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] +
                          ["I am a very long string index: %s" % i
                           for i in range(20)])
            s = Series(np.arange(21), index=index)
            df = DataFrame({'A': s, 'B': s})
            store['a'] = s
            tm.assert_series_equal(store['a'], s)

            store['b'] = df
            tm.assert_frame_equal(store['b'], df)
    def test_put_compression(self):

        with ensure_clean_store(self.path) as store:
            df = tm.makeTimeDataFrame()

            store.put('c', df, format='table', complib='zlib')
            tm.assert_frame_equal(store['c'], df)

            # can't compress if format='fixed'
            self.assertRaises(ValueError, store.put, 'b', df,
                              format='fixed', complib='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')

        if skip_compression:
            raise nose.SkipTest("skipping on windows/PY3")

        df = tm.makeTimeDataFrame()

        with ensure_clean_store(self.path) as store:

            # can't compress if format='fixed'
            self.assertRaises(ValueError, store.put, 'b', df,
                              format='fixed', complib='blosc')

            store.put('c', df, format='table', complib='blosc')
            tm.assert_frame_equal(store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_put_mixed_type(self):
        df = tm.makeTimeDataFrame()
        df['obj1'] = 'foo'
        df['obj2'] = 'bar'
        df['bool1'] = df['A'] > 0
        df['bool2'] = df['B'] > 0
        df['bool3'] = True
        df['int1'] = 1
        df['int2'] = 2
        df['timestamp1'] = Timestamp('20010102')
        df['timestamp2'] = Timestamp('20010103')
        df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
        df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
        df.ix[3:6, ['obj1']] = np.nan
        df = df.consolidate()._convert(datetime=True)

        with ensure_clean_store(self.path) as store:
            _maybe_remove(store, 'df')

            # cannot use assert_produces_warning here for some reason
            # a PendingDeprecationWarning is also raised?
            warnings.filterwarnings('ignore', category=PerformanceWarning)
            store.put('df', df)
            warnings.filterwarnings('always', category=PerformanceWarning)

            expected = store.get('df')
            tm.assert_frame_equal(expected, df)
    def test_append(self):

        with ensure_clean_store(self.path) as store:
            df = tm.makeTimeDataFrame()
            _maybe_remove(store, 'df1')
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            _maybe_remove(store, 'df2')
            store.put('df2', df[:10], format='table')
            store.append('df2', df[10:])
            tm.assert_frame_equal(store['df2'], df)

            _maybe_remove(store, 'df3')
            store.append('/df3', df[:10])
            store.append('/df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)

            # this is allowed, but you almost always don't want to do it
            with tm.assert_produces_warning(
                    expected_warning=tables.NaturalNameWarning):
                _maybe_remove(store, '/df3 foo')
                store.append('/df3 foo', df[:10])
                store.append('/df3 foo', df[10:])
                tm.assert_frame_equal(store['df3 foo'], df)

            # panel
            wp = tm.makePanel()
            _maybe_remove(store, 'wp1')
            store.append('wp1', wp.ix[:, :10, :])
            store.append('wp1', wp.ix[:, 10:, :])
            assert_panel_equal(store['wp1'], wp)

            # ndim
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                p4d = tm.makePanel4D()
                _maybe_remove(store, 'p4d')
                store.append('p4d', p4d.ix[:, :, :10, :])
                store.append('p4d', p4d.ix[:, :, 10:, :])
                assert_panel4d_equal(store['p4d'], p4d)

                # test using axis labels
                _maybe_remove(store, 'p4d')
                store.append('p4d', p4d.ix[:, :, :10, :], axes=[
                    'items', 'major_axis', 'minor_axis'])
                store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
                    'items', 'major_axis', 'minor_axis'])
                assert_panel4d_equal(store['p4d'], p4d)

                # test using a different number of items on each axis
                p4d2 = p4d.copy()
                p4d2['l4'] = p4d['l1']
                p4d2['l5'] = p4d['l1']
                _maybe_remove(store, 'p4d2')
                store.append(
                    'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis'])
                assert_panel4d_equal(store['p4d2'], p4d2)

            # test using a different order of items on the non-index axes
            _maybe_remove(store, 'wp1')
            wp_append1 = wp.ix[:, :10, :]
            store.append('wp1', wp_append1)
            wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1])
            store.append('wp1', wp_append2)
            assert_panel_equal(store['wp1'], wp)

            # dtype issues - mixed type in a single object column
            df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
            df['mixed_column'] = 'testing'
            df.ix[2, 'mixed_column'] = np.nan
            _maybe_remove(store, 'df')
            store.append('df', df)
            tm.assert_frame_equal(store['df'], df)

            # uints - test storage of uints
            uint_data = DataFrame({
                'u08': Series(np.random.randint(0, high=255, size=5),
                              dtype=np.uint8),
                'u16': Series(np.random.randint(0, high=65535, size=5),
                              dtype=np.uint16),
                'u32': Series(np.random.randint(0, high=2**30, size=5),
                              dtype=np.uint32),
                'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62],
                              dtype=np.uint64)}, index=np.arange(5))
            _maybe_remove(store, 'uints')
            store.append('uints', uint_data)
            tm.assert_frame_equal(store['uints'], uint_data)

            # uints - test storage of uints in indexable columns
            _maybe_remove(store, 'uints')
            # 64-bit indices not yet supported
            store.append('uints', uint_data, data_columns=[
                'u08', 'u16', 'u32'])
            tm.assert_frame_equal(store['uints'], uint_data)
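    # NOTE: `Term`/where strings refer to a table's indexable columns (the
    # frame index, or below the series name 'foo') and to any data_columns;
    # the conditions are evaluated on disk by PyTables rather than in pandas.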
    def test_append_series(self):

        with ensure_clean_store(self.path) as store:

            # basic
            ss = tm.makeStringSeries()
            ts = tm.makeTimeSeries()
            ns = Series(np.arange(100))

            store.append('ss', ss)
            result = store['ss']
            tm.assert_series_equal(result, ss)
            self.assertIsNone(result.name)

            store.append('ts', ts)
            result = store['ts']
            tm.assert_series_equal(result, ts)
            self.assertIsNone(result.name)

            ns.name = 'foo'
            store.append('ns', ns)
            result = store['ns']
            tm.assert_series_equal(result, ns)
            self.assertEqual(result.name, ns.name)

            # select on the values
            expected = ns[ns > 60]
            result = store.select('ns', Term('foo>60'))
            tm.assert_series_equal(result, expected)

            # select on the index and values
            expected = ns[(ns > 70) & (ns.index < 90)]
            result = store.select('ns', [Term('foo>70'), Term('index<90')])
            tm.assert_series_equal(result, expected)

            # multi-index
            mi = DataFrame(np.random.randn(5, 1), columns=['A'])
            mi['B'] = np.arange(len(mi))
            mi['C'] = 'foo'
            mi.loc[3:5, 'C'] = 'bar'
            mi.set_index(['C', 'B'], inplace=True)
            s = mi.stack()
            s.index = s.index.droplevel(2)
            store.append('mi', s)
            tm.assert_series_equal(store['mi'], s)
    def test_store_index_types(self):
        # GH5386
        # test storing various index types

        with ensure_clean_store(self.path) as store:

            def check(format, index):
                df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
                df.index = index(len(df))
                _maybe_remove(store, 'df')
                store.put('df', df, format=format)
                assert_frame_equal(df, store['df'])

            for index in [tm.makeFloatIndex, tm.makeStringIndex,
                          tm.makeIntIndex, tm.makeDateIndex]:

                check('table', index)
                check('fixed', index)

            # period index currently broken for table
            # see GH7796 FIXME
            check('fixed', tm.makePeriodIndex)
            # check('table', tm.makePeriodIndex)

            # unicode
            index = tm.makeUnicodeIndex
            if compat.PY3:
                check('table', index)
                check('fixed', index)
            else:

                # only support for fixed types (and they have a perf warning)
                self.assertRaises(TypeError, check, 'table', index)

                with tm.assert_produces_warning(
                        expected_warning=PerformanceWarning):
                    check('fixed', index)
    def test_encoding(self):

        if sys.byteorder != 'little':
            raise nose.SkipTest('system byteorder is not little')

        with ensure_clean_store(self.path) as store:
            df = DataFrame(dict(A='foo', B='bar'), index=range(5))
            df.loc[2, 'A'] = np.nan
            df.loc[3, 'B'] = np.nan
            _maybe_remove(store, 'df')
            store.append('df', df, encoding='ascii')
            tm.assert_frame_equal(store['df'], df)

            expected = df.reindex(columns=['A'])
            result = store.select('df', Term('columns=A', encoding='ascii'))
            tm.assert_frame_equal(result, expected)
    def test_latin_encoding(self):

        if compat.PY2:
            self.assertRaisesRegexp(
                TypeError, r'\[unicode\] is not implemented as a table column')
            return

        values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'a', b'b', b'c'],
                  [b'EE, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'', b'a', b'b', b'c'],
                  [b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
                  [np.nan, b'', b'b', b'c'],
                  [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]

        def _try_decode(x, encoding='latin-1'):
            try:
                return x.decode(encoding)
            except AttributeError:
                return x

        # not sure how to remove latin-1 from code in python 2 and 3
        values = [[_try_decode(x) for x in y] for y in values]

        examples = []
        for dtype in ['category', object]:
            for val in values:
                examples.append(pandas.Series(val, dtype=dtype))

        def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
            with ensure_clean_path(self.path) as store:
                s.to_hdf(store, key, format='table', encoding=encoding,
                         nan_rep=nan_rep)
                retr = read_hdf(store, key)
                s_nan = s.replace(nan_rep, np.nan)
                assert_series_equal(s_nan, retr, check_categorical=False)

        for s in examples:
            roundtrip(s)

        # fails:
        # for x in examples:
        #     roundtrip(s, nan_rep=b'\xf8\xfc')
    def test_append_some_nans(self):

        with ensure_clean_store(self.path) as store:
            df = DataFrame({'A': Series(np.random.randn(20)).astype('int32'),
                            'A1': np.random.randn(20),
                            'A2': np.random.randn(20),
                            'B': 'foo', 'C': 'bar',
                            'D': Timestamp("20010101"),
                            'E': datetime.datetime(2001, 1, 2, 0, 0)},
                           index=np.arange(20))
            # some nans
            _maybe_remove(store, 'df1')
            df.ix[0:15, ['A1', 'B', 'D', 'E']] = np.nan
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            # first column
            df1 = df.copy()
            df1.ix[:, 'A1'] = np.nan
            _maybe_remove(store, 'df1')
            store.append('df1', df1[:10])
            store.append('df1', df1[10:])
            tm.assert_frame_equal(store['df1'], df1)

            # 2nd column
            df2 = df.copy()
            df2.ix[:, 'A2'] = np.nan
            _maybe_remove(store, 'df2')
            store.append('df2', df2[:10])
            store.append('df2', df2[10:])
            tm.assert_frame_equal(store['df2'], df2)

            # datetimes
            df3 = df.copy()
            df3.ix[:, 'E'] = np.nan
            _maybe_remove(store, 'df3')
            store.append('df3', df3[:10])
            store.append('df3', df3[10:])
            tm.assert_frame_equal(store['df3'], df3)
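    # NOTE: with dropna=True, rows that are NaN across *all* columns are not
    # written to the table; any non-NaN column (e.g. a constant string or a
    # date) keeps every row, so the round trips below are only lossy in the
    # all-float cases.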
    def test_append_all_nans(self):

        with ensure_clean_store(self.path) as store:

            df = DataFrame({'A1': np.random.randn(20),
                            'A2': np.random.randn(20)},
                           index=np.arange(20))
            df.ix[0:15, :] = np.nan

            # nan some entire rows (dropna=True)
            _maybe_remove(store, 'df')
            store.append('df', df[:10], dropna=True)
            store.append('df', df[10:], dropna=True)
            tm.assert_frame_equal(store['df'], df[-4:])

            # nan some entire rows (dropna=False)
            _maybe_remove(store, 'df2')
            store.append('df2', df[:10], dropna=False)
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)

            # tests the option io.hdf.dropna_table
            pandas.set_option('io.hdf.dropna_table', False)
            _maybe_remove(store, 'df3')
            store.append('df3', df[:10])
            store.append('df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)

            pandas.set_option('io.hdf.dropna_table', True)
            _maybe_remove(store, 'df4')
            store.append('df4', df[:10])
            store.append('df4', df[10:])
            tm.assert_frame_equal(store['df4'], df[-4:])

            # nan some entire rows (strings are still written!)
            df = DataFrame({'A1': np.random.randn(20),
                            'A2': np.random.randn(20),
                            'B': 'foo', 'C': 'bar'},
                           index=np.arange(20))

            df.ix[0:15, :] = np.nan

            _maybe_remove(store, 'df')
            store.append('df', df[:10], dropna=True)
            store.append('df', df[10:], dropna=True)
            tm.assert_frame_equal(store['df'], df)

            _maybe_remove(store, 'df2')
            store.append('df2', df[:10], dropna=False)
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)

            # nan some entire rows (but since we have dates they are still
            # written!)
            df = DataFrame({'A1': np.random.randn(20),
                            'A2': np.random.randn(20),
                            'B': 'foo', 'C': 'bar',
                            'D': Timestamp("20010101"),
                            'E': datetime.datetime(2001, 1, 2, 0, 0)},
                           index=np.arange(20))

            df.ix[0:15, :] = np.nan

            _maybe_remove(store, 'df')
            store.append('df', df[:10], dropna=True)
            store.append('df', df[10:], dropna=True)
            tm.assert_frame_equal(store['df'], df)

            _maybe_remove(store, 'df2')
            store.append('df2', df[:10], dropna=False)
            store.append('df2', df[10:], dropna=False)
            tm.assert_frame_equal(store['df2'], df)

        # Test to make sure defaults are to not drop.
        # Corresponding to Issue 9382
        df_with_missing = DataFrame(
            {'col1': [0, np.nan, 2], 'col2': [1, np.nan, np.nan]})

        with ensure_clean_path(self.path) as path:
            df_with_missing.to_hdf(path, 'df_with_missing', format='table')
            reloaded = read_hdf(path, 'df_with_missing')
            tm.assert_frame_equal(df_with_missing, reloaded)

        matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]],
                  [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
                  [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]

        panel_with_missing = Panel(matrix,
                                   items=['Item1', 'Item2', 'Item3'],
                                   major_axis=[1, 2],
                                   minor_axis=['A', 'B', 'C'])

        with ensure_clean_path(self.path) as path:
            panel_with_missing.to_hdf(
                path, 'panel_with_missing', format='table')
            reloaded_panel = read_hdf(path, 'panel_with_missing')
            tm.assert_panel_equal(panel_with_missing, reloaded_panel)
    def test_append_frame_column_oriented(self):

        with ensure_clean_store(self.path) as store:

            # column oriented
            df = tm.makeTimeDataFrame()
            _maybe_remove(store, 'df1')
            store.append('df1', df.ix[:, :2], axes=['columns'])
            store.append('df1', df.ix[:, 2:])
            tm.assert_frame_equal(store['df1'], df)

            result = store.select('df1', 'columns=A')
            expected = df.reindex(columns=['A'])
            tm.assert_frame_equal(expected, result)

            # selection on the non-indexable
            result = store.select(
                'df1', ('columns=A', Term('index=df.index[0:4]')))
            expected = df.reindex(columns=['A'], index=df.index[0:4])
            tm.assert_frame_equal(expected, result)

            # this isn't supported
            self.assertRaises(TypeError, store.select, 'df1', (
                'columns=A', Term('index>df.index[4]')))
    def test_append_with_different_block_ordering(self):

        # GH 4096; using same frames, but different block orderings
        with ensure_clean_store(self.path) as store:

            for i in range(10):

                df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
                df['index'] = range(10)
                df['index'] += i * 10
                df['int64'] = Series([1] * len(df), dtype='int64')
                df['int16'] = Series([1] * len(df), dtype='int16')

                if i % 2 == 0:
                    del df['int64']
                    df['int64'] = Series([1] * len(df), dtype='int64')
                if i % 3 == 0:
                    a = df.pop('A')
                    df['A'] = a

                df.set_index('index', inplace=True)

                store.append('df', df)

        # test a different ordering but with more fields (like an invalid
        # combination)
        with ensure_clean_store(self.path) as store:

            df = DataFrame(np.random.randn(10, 2),
                           columns=list('AB'), dtype='float64')
            df['int64'] = Series([1] * len(df), dtype='int64')
            df['int16'] = Series([1] * len(df), dtype='int16')
            store.append('df', df)

            # store additional fields in different blocks
            df['int16_2'] = Series([1] * len(df), dtype='int16')
            self.assertRaises(ValueError, store.append, 'df', df)

            # store multiple additional fields in different blocks
            df['float_3'] = Series([1.] * len(df), dtype='float64')
            self.assertRaises(ValueError, store.append, 'df', df)
    def test_ndim_indexables(self):
        # test using ndim tables in new ways

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            with ensure_clean_store(self.path) as store:

                p4d = tm.makePanel4D()

                def check_indexers(key, indexers):
                    for i, idx in enumerate(indexers):
                        descr = getattr(store.root, key).table.description
                        self.assertTrue(getattr(descr, idx)._v_pos == i)

                # append then change (will take existing schema)
                indexers = ['items', 'major_axis', 'minor_axis']

                _maybe_remove(store, 'p4d')
                store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
                store.append('p4d', p4d.ix[:, :, 10:, :])
                assert_panel4d_equal(store.select('p4d'), p4d)
                check_indexers('p4d', indexers)

                # same as above, but try to append with different axes
                _maybe_remove(store, 'p4d')
                store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
                store.append('p4d', p4d.ix[:, :, 10:, :], axes=[
                    'labels', 'items', 'major_axis'])
                assert_panel4d_equal(store.select('p4d'), p4d)
                check_indexers('p4d', indexers)

                # pass incorrect number of axes
                _maybe_remove(store, 'p4d')
                self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[
                    :, :, :10, :], axes=['major_axis', 'minor_axis'])

                # different than default indexables #1
                indexers = ['labels', 'major_axis', 'minor_axis']
                _maybe_remove(store, 'p4d')
                store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
                store.append('p4d', p4d.ix[:, :, 10:, :])
                assert_panel4d_equal(store['p4d'], p4d)
                check_indexers('p4d', indexers)

                # different than default indexables #2
                indexers = ['major_axis', 'labels', 'minor_axis']
                _maybe_remove(store, 'p4d')
                store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
                store.append('p4d', p4d.ix[:, :, 10:, :])
                assert_panel4d_equal(store['p4d'], p4d)
                check_indexers('p4d', indexers)

                # partial selection
                result = store.select('p4d', ['labels=l1'])
                expected = p4d.reindex(labels=['l1'])
                assert_panel4d_equal(result, expected)

                # partial selection2
                result = store.select('p4d', [Term(
                    'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')])
                expected = p4d.reindex(
                    labels=['l1'], items=['ItemA'], minor_axis=['B'])
                assert_panel4d_equal(result, expected)

                # non-existent partial selection
                result = store.select('p4d', [Term(
                    'labels=l1'), Term('items=Item1'), Term('minor_axis=B')])
                expected = p4d.reindex(labels=['l1'], items=[],
                                       minor_axis=['B'])
                assert_panel4d_equal(result, expected)
    def test_append_with_strings(self):

        with ensure_clean_store(self.path) as store:
            wp = tm.makePanel()
            wp2 = wp.rename_axis(
                dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2)

            def check_col(key, name, size):
                self.assertEqual(getattr(store.get_storer(
                    key).table.description, name).itemsize, size)

            store.append('s1', wp, min_itemsize=20)
            store.append('s1', wp2)
            expected = concat([wp, wp2], axis=2)
            expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
            assert_panel_equal(store['s1'], expected)
            check_col('s1', 'minor_axis', 20)

            # test dict format
            store.append('s2', wp, min_itemsize={'minor_axis': 20})
            store.append('s2', wp2)
            expected = concat([wp, wp2], axis=2)
            expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
            assert_panel_equal(store['s2'], expected)
            check_col('s2', 'minor_axis', 20)

            # apply the wrong field (similar to #1)
            store.append('s3', wp, min_itemsize={'major_axis': 20})
            self.assertRaises(ValueError, store.append, 's3', wp2)

            # test truncation of bigger strings
            store.append('s4', wp)
            self.assertRaises(ValueError, store.append, 's4', wp2)

            # avoid truncation on elements
            df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
            store.append('df_big', df)
            tm.assert_frame_equal(store.select('df_big'), df)
            check_col('df_big', 'values_block_1', 15)

            # appending smaller string ok
            df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']])
            store.append('df_big', df2)
            expected = concat([df, df2])
            tm.assert_frame_equal(store.select('df_big'), expected)
            check_col('df_big', 'values_block_1', 15)

            # avoid truncation on elements
            df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
            store.append('df_big2', df, min_itemsize={'values': 50})
            tm.assert_frame_equal(store.select('df_big2'), df)
            check_col('df_big2', 'values_block_1', 50)

            # bigger string on next append
            store.append('df_new', df)
            df_new = DataFrame(
                [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']])
            self.assertRaises(ValueError, store.append, 'df_new', df_new)

            # with nans
            _maybe_remove(store, 'df')
            df = tm.makeTimeDataFrame()
            df['string'] = 'foo'
            df.ix[1:4, 'string'] = np.nan
            df['string2'] = 'bar'
            df.ix[4:8, 'string2'] = np.nan
            df['string3'] = 'bah'
            df.ix[1:, 'string3'] = np.nan
            store.append('df', df)
            result = store.select('df')
            tm.assert_frame_equal(result, df)

        with ensure_clean_store(self.path) as store:

            def check_col(key, name, size):
                self.assertEqual(getattr(store.get_storer(
                    key).table.description, name).itemsize, size)

            df = DataFrame(dict(A='foo', B='bar'), index=range(10))

            # a min_itemsize that creates a data_column
            _maybe_remove(store, 'df')
            store.append('df', df, min_itemsize={'A': 200})
            check_col('df', 'A', 200)
            self.assertEqual(store.get_storer('df').data_columns, ['A'])

            # a min_itemsize that creates a data_column2
            _maybe_remove(store, 'df')
            store.append('df', df, data_columns=['B'],
                         min_itemsize={'A': 200})
            check_col('df', 'A', 200)
            self.assertEqual(store.get_storer('df').data_columns, ['B', 'A'])

            # a min_itemsize that creates a data_column2
            _maybe_remove(store, 'df')
            store.append('df', df, data_columns=[
                'B'], min_itemsize={'values': 200})
            check_col('df', 'B', 200)
            check_col('df', 'values_block_0', 200)
            self.assertEqual(store.get_storer('df').data_columns, ['B'])

            # infer the .typ on subsequent appends
            _maybe_remove(store, 'df')
            store.append('df', df[:5], min_itemsize=200)
            store.append('df', df[5:], min_itemsize=200)
            tm.assert_frame_equal(store['df'], df)

            # invalid min_itemsize keys
            df = DataFrame(['foo', 'foo', 'foo', 'barh',
                            'barh', 'barh'], columns=['A'])
            _maybe_remove(store, 'df')
            self.assertRaises(ValueError, store.append, 'df',
                              df, min_itemsize={'foo': 20, 'foobar': 20})
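    # NOTE: data_columns materialize frame columns as individually queryable
    # table columns, and min_itemsize reserves the on-disk string width so
    # that later appends with longer strings still fit.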
    def test_append_with_data_columns(self):

        with ensure_clean_store(self.path) as store:
            df = tm.makeTimeDataFrame()
            df.loc[:, 'B'].iloc[0] = 1.
            _maybe_remove(store, 'df')
            store.append('df', df[:2], data_columns=['B'])
            store.append('df', df[2:])
            tm.assert_frame_equal(store['df'], df)

            # check that we have indices created
            assert(store._handle.root.df.table.cols.index.is_indexed is True)
            assert(store._handle.root.df.table.cols.B.is_indexed is True)

            # data column searching
            result = store.select('df', [Term('B>0')])
            expected = df[df.B > 0]
            tm.assert_frame_equal(result, expected)

            # data column searching (with an indexable and a data_columns)
            result = store.select(
                'df', [Term('B>0'), Term('index>df.index[3]')])
            df_new = df.reindex(index=df.index[4:])
            expected = df_new[df_new.B > 0]
            tm.assert_frame_equal(result, expected)

            # data column selection with a string data_column
            df_new = df.copy()
            df_new['string'] = 'foo'
            df_new.loc[1:4, 'string'] = np.nan
            df_new.loc[5:6, 'string'] = 'bar'
            _maybe_remove(store, 'df')
            store.append('df', df_new, data_columns=['string'])
            result = store.select('df', [Term('string=foo')])
            expected = df_new[df_new.string == 'foo']
            tm.assert_frame_equal(result, expected)

            # using min_itemsize and a data column
            def check_col(key, name, size):
                self.assertEqual(getattr(store.get_storer(
                    key).table.description, name).itemsize, size)

        with ensure_clean_store(self.path) as store:
            _maybe_remove(store, 'df')
            store.append('df', df_new, data_columns=['string'],
                         min_itemsize={'string': 30})
            check_col('df', 'string', 30)
            _maybe_remove(store, 'df')
            store.append(
                'df', df_new, data_columns=['string'], min_itemsize=30)
            check_col('df', 'string', 30)
            _maybe_remove(store, 'df')
            store.append('df', df_new, data_columns=['string'],
                         min_itemsize={'values': 30})
            check_col('df', 'string', 30)

        with ensure_clean_store(self.path) as store:
            df_new['string2'] = 'foobarbah'
            df_new['string_block1'] = 'foobarbah1'
            df_new['string_block2'] = 'foobarbah2'
            _maybe_remove(store, 'df')
            store.append('df', df_new, data_columns=['string', 'string2'],
                         min_itemsize={'string': 30, 'string2': 40,
                                       'values': 50})
            check_col('df', 'string', 30)
            check_col('df', 'string2', 40)
            check_col('df', 'values_block_1', 50)

        with ensure_clean_store(self.path) as store:
            # multiple data columns
            df_new = df.copy()
            df_new.ix[0, 'A'] = 1.
            df_new.ix[0, 'B'] = -1.
            df_new['string'] = 'foo'
            df_new.loc[1:4, 'string'] = np.nan
            df_new.loc[5:6, 'string'] = 'bar'
            df_new['string2'] = 'foo'
            df_new.loc[2:5, 'string2'] = np.nan
            df_new.loc[7:8, 'string2'] = 'bar'
            _maybe_remove(store, 'df')
            store.append(
                'df', df_new, data_columns=['A', 'B', 'string', 'string2'])
            result = store.select('df', [Term('string=foo'), Term(
                'string2=foo'), Term('A>0'), Term('B<0')])
            expected = df_new[(df_new.string == 'foo') & (
                df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)]
            tm.assert_frame_equal(result, expected, check_index_type=False)

            # yield an empty frame
            result = store.select('df', [Term('string=foo'), Term(
                'string2=cool')])
            expected = df_new[(df_new.string == 'foo') & (
                df_new.string2 == 'cool')]
            tm.assert_frame_equal(result, expected, check_index_type=False)

        with ensure_clean_store(self.path) as store:
            # doc example
            df_dc = df.copy()
            df_dc['string'] = 'foo'
            df_dc.ix[4:6, 'string'] = np.nan
            df_dc.ix[7:9, 'string'] = 'bar'
            df_dc['string2'] = 'cool'
            df_dc['datetime'] = Timestamp('20010102')
            df_dc = df_dc._convert(datetime=True)
            df_dc.ix[3:5, ['A', 'B', 'datetime']] = np.nan

            _maybe_remove(store, 'df_dc')
            store.append('df_dc', df_dc,
                         data_columns=['B', 'C', 'string',
                                       'string2', 'datetime'])
            result = store.select('df_dc', [Term('B>0')])

            expected = df_dc[df_dc.B > 0]
            tm.assert_frame_equal(result, expected, check_index_type=False)

            result = store.select(
                'df_dc', ['B > 0', 'C > 0', 'string == foo'])
            expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (
                df_dc.string == 'foo')]
            tm.assert_frame_equal(result, expected, check_index_type=False)

        with ensure_clean_store(self.path) as store:
            # doc example part 2
            np.random.seed(1234)
            index = date_range('1/1/2000', periods=8)
            df_dc = DataFrame(np.random.randn(8, 3), index=index,
                              columns=['A', 'B', 'C'])
            df_dc['string'] = 'foo'
            df_dc.ix[4:6, 'string'] = np.nan
            df_dc.ix[7:9, 'string'] = 'bar'
            df_dc.ix[:, ['B', 'C']] = df_dc.ix[:, ['B', 'C']].abs()
            df_dc['string2'] = 'cool'

            # on-disk operations
            store.append('df_dc', df_dc, data_columns=[
                'B', 'C', 'string', 'string2'])

            result = store.select('df_dc', [Term('B>0')])
            expected = df_dc[df_dc.B > 0]
            tm.assert_frame_equal(result, expected)

            result = store.select(
                'df_dc', ['B > 0', 'C > 0', 'string == "foo"'])
            expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) &
                             (df_dc.string == 'foo')]
            tm.assert_frame_equal(result, expected)

        with ensure_clean_store(self.path) as store:
            # panel
            # GH5717 not handling data_columns
            np.random.seed(1234)
            p = tm.makePanel()

            store.append('p1', p)
            tm.assert_panel_equal(store.select('p1'), p)

            store.append('p2', p, data_columns=True)
            tm.assert_panel_equal(store.select('p2'), p)

            result = store.select('p2', where='ItemA>0')
            expected = p.to_frame()
            expected = expected[expected['ItemA'] > 0]
            tm.assert_frame_equal(result.to_frame(), expected)

            result = store.select(
                'p2', where='ItemA>0 & minor_axis=["A","B"]')
            expected = p.to_frame()
            expected = expected[expected['ItemA'] > 0]
            expected = expected[expected.reset_index(
                level=['major']).index.isin(['A', 'B'])]
            tm.assert_frame_equal(result.to_frame(), expected)
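    # NOTE: PyTables column indexes (exercised below) are what make `where`
    # queries fast; 'optlevel' (1-9) and 'kind' ('light'/'medium'/'full')
    # trade index build time and size against query speed.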
    def test_create_table_index(self):

        with ensure_clean_store(self.path) as store:

            def col(t, column):
                return getattr(store.get_storer(t).table.cols, column)

            # index=False
            wp = tm.makePanel()
            store.append('p5', wp, index=False)
            store.create_table_index('p5', columns=['major_axis'])
            assert(col('p5', 'major_axis').is_indexed is True)
            assert(col('p5', 'minor_axis').is_indexed is False)

            # index=True
            store.append('p5i', wp, index=True)
            assert(col('p5i', 'major_axis').is_indexed is True)
            assert(col('p5i', 'minor_axis').is_indexed is True)

            # default optlevels
            store.get_storer('p5').create_index()
            assert(col('p5', 'major_axis').index.optlevel == 6)
            assert(col('p5', 'minor_axis').index.kind == 'medium')

            # let's change the indexing scheme
            store.create_table_index('p5')
            assert(col('p5', 'major_axis').index.optlevel == 6)
            assert(col('p5', 'minor_axis').index.kind == 'medium')
            store.create_table_index('p5', optlevel=9)
            assert(col('p5', 'major_axis').index.optlevel == 9)
            assert(col('p5', 'minor_axis').index.kind == 'medium')
            store.create_table_index('p5', kind='full')
            assert(col('p5', 'major_axis').index.optlevel == 9)
            assert(col('p5', 'minor_axis').index.kind == 'full')
            store.create_table_index('p5', optlevel=1, kind='light')
            assert(col('p5', 'major_axis').index.optlevel == 1)
            assert(col('p5', 'minor_axis').index.kind == 'light')

            # data columns
            df = tm.makeTimeDataFrame()
            df['string'] = 'foo'
            df['string2'] = 'bar'
            store.append('f', df, data_columns=['string', 'string2'])
            assert(col('f', 'index').is_indexed is True)
            assert(col('f', 'string').is_indexed is True)
            assert(col('f', 'string2').is_indexed is True)

            # specify index=columns
            store.append('f2', df, index=['string'],
                         data_columns=['string', 'string2'])
            assert(col('f2', 'index').is_indexed is False)
            assert(col('f2', 'string').is_indexed is True)
            assert(col('f2', 'string2').is_indexed is False)

            # try to index a non-table
            _maybe_remove(store, 'f2')
            store.put('f2', df)
            self.assertRaises(TypeError, store.create_table_index, 'f2')

    def test_append_diff_item_order(self):

        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]

        with ensure_clean_store(self.path) as store:
            store.put('panel', wp1, format='table')
            self.assertRaises(ValueError, store.put, 'panel', wp2,
                              append=True)

    def test_append_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        df = DataFrame(np.random.randn(10, 3), index=index,
                       columns=['A', 'B', 'C'])

        with ensure_clean_store(self.path) as store:
            store.append('mi', df)
            result = store.select('mi')
            tm.assert_frame_equal(result, df)

            # GH 3748
            result = store.select('mi', columns=['A', 'B'])
            expected = df.reindex(columns=['A', 'B'])
            tm.assert_frame_equal(result, expected)

        with ensure_clean_path('test.hdf') as path:
            df.to_hdf(path, 'df', format='table')
            result = read_hdf(path, 'df', columns=['A', 'B'])
            expected = df.reindex(columns=['A', 'B'])
            tm.assert_frame_equal(result, expected)
  1342. def test_column_multiindex(self):
  1343. # GH 4710
  1344. # recreate multi-indexes properly
  1345. index = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'),
  1346. ('B', 'a'), ('B', 'b')],
  1347. names=['first', 'second'])
  1348. df = DataFrame(np.arange(12).reshape(3, 4), columns=index)
  1349. expected = df.copy()
  1350. if isinstance(expected.index, RangeIndex):
  1351. expected.index = Int64Index(expected.index)
  1352. with ensure_clean_store(self.path) as store:
  1353. store.put('df', df)
  1354. tm.assert_frame_equal(store['df'], expected,
  1355. check_index_type=True,
  1356. check_column_type=True)
  1357. store.put('df1', df, format='table')
  1358. tm.assert_frame_equal(store['df1'], expected,
  1359. check_index_type=True,
  1360. check_column_type=True)
  1361. self.assertRaises(ValueError, store.put, 'df2', df,
  1362. format='table', data_columns=['A'])
  1363. self.assertRaises(ValueError, store.put, 'df3', df,
  1364. format='table', data_columns=True)
  1365. # appending multi-column on existing table (see GH 6167)
  1366. with ensure_clean_store(self.path) as store:
  1367. store.append('df2', df)
  1368. store.append('df2', df)
  1369. tm.assert_frame_equal(store['df2'], concat((df, df)))
  1370. # non_index_axes name
  1371. df = DataFrame(np.arange(12).reshape(3, 4),
  1372. columns=Index(list('ABCD'), name='foo'))
  1373. expected = df.copy()
  1374. if isinstance(expected.index, RangeIndex):
  1375. expected.index = Int64Index(expected.index)
  1376. with ensure_clean_store(self.path) as store:
  1377. store.put('df1', df, format='table')
  1378. tm.assert_frame_equal(store['df1'], expected,
  1379. check_index_type=True,
  1380. check_column_type=True)
  1381. def test_store_multiindex(self):
  1382. # validate multi-index names
  1383. # GH 5527
  1384. with ensure_clean_store(self.path) as store:
  1385. def make_index(names=None):
  1386. return MultiIndex.from_tuples([(datetime.datetime(2013, 12, d),
  1387. s, t)
  1388. for d in range(1, 3)
  1389. for s in range(2)
  1390. for t in range(3)],
  1391. names=names)
  1392. # no names
  1393. _maybe_remove(store, 'df')
  1394. df = DataFrame(np.zeros((12, 2)), columns=[
  1395. 'a', 'b'], index=make_index())
  1396. store.append('df', df)
  1397. tm.assert_frame_equal(store.select('df'), df)
  1398. # partial names
  1399. _maybe_remove(store, 'df')
  1400. df = DataFrame(np.zeros((12, 2)), columns=[
  1401. 'a', 'b'], index=make_index(['date', None, None]))
  1402. store.append('df', df)
  1403. tm.assert_frame_equal(store.select('df'), df)
  1404. # series
  1405. _maybe_remove(store, 's')
  1406. s = Series(np.zeros(12), index=make_index(['date', None, None]))
  1407. store.append('s', s)
  1408. xp = Series(np.zeros(12), index=make_index(
  1409. ['date', 'level_1', 'level_2']))
  1410. tm.assert_series_equal(store.select('s'), xp)
  1411. # dup with column
  1412. _maybe_remove(store, 'df')
  1413. df = DataFrame(np.zeros((12, 2)), columns=[
  1414. 'a', 'b'], index=make_index(['date', 'a', 't']))
  1415. self.assertRaises(ValueError, store.append, 'df', df)
  1416. # dup within level
  1417. _maybe_remove(store, 'df')
  1418. df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
  1419. index=make_index(['date', 'date', 'date']))
  1420. self.assertRaises(ValueError, store.append, 'df', df)
1421. # fully named
  1422. _maybe_remove(store, 'df')
  1423. df = DataFrame(np.zeros((12, 2)), columns=[
  1424. 'a', 'b'], index=make_index(['date', 's', 't']))
  1425. store.append('df', df)
  1426. tm.assert_frame_equal(store.select('df'), df)
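# Naming rules exercised above, summarized: unnamed MultiIndex levels
# round-trip as 'level_1', 'level_2', ..., while a level name duplicating a
# column name or another level name raises ValueError on append.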
  1427. def test_select_columns_in_where(self):
  1428. # GH 6169
  1429. # recreate multi-indexes when columns is passed
  1430. # in the `where` argument
  1431. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
  1432. ['one', 'two', 'three']],
  1433. labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  1434. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  1435. names=['foo_name', 'bar_name'])
  1436. # With a DataFrame
  1437. df = DataFrame(np.random.randn(10, 3), index=index,
  1438. columns=['A', 'B', 'C'])
  1439. with ensure_clean_store(self.path) as store:
  1440. store.put('df', df, format='table')
  1441. expected = df[['A']]
  1442. tm.assert_frame_equal(store.select('df', columns=['A']), expected)
  1443. tm.assert_frame_equal(store.select(
  1444. 'df', where="columns=['A']"), expected)
  1445. # With a Series
  1446. s = Series(np.random.randn(10), index=index,
  1447. name='A')
  1448. with ensure_clean_store(self.path) as store:
  1449. store.put('s', s, format='table')
  1450. tm.assert_series_equal(store.select('s', where="columns=['A']"), s)
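# Equivalence sketch: for table-format data, a columns= keyword and a
# "columns=[...]" where clause should select the same frame.
#   >>> store.select('df', columns=['A'])
#   >>> store.select('df', where="columns=['A']")  # same result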
  1451. def test_pass_spec_to_storer(self):
  1452. df = tm.makeDataFrame()
  1453. with ensure_clean_store(self.path) as store:
  1454. store.put('df', df)
  1455. self.assertRaises(TypeError, store.select, 'df', columns=['A'])
  1456. self.assertRaises(TypeError, store.select,
  1457. 'df', where=[('columns=A')])
  1458. def test_append_misc(self):
  1459. with ensure_clean_store(self.path) as store:
  1460. with tm.assert_produces_warning(FutureWarning,
  1461. check_stacklevel=False):
1462. # unsupported data types for non-tables
  1463. p4d = tm.makePanel4D()
  1464. self.assertRaises(TypeError, store.put, 'p4d', p4d)
1465. # unsupported data types
  1466. self.assertRaises(TypeError, store.put, 'abc', None)
  1467. self.assertRaises(TypeError, store.put, 'abc', '123')
  1468. self.assertRaises(TypeError, store.put, 'abc', 123)
  1469. self.assertRaises(TypeError, store.put, 'abc', np.arange(5))
  1470. df = tm.makeDataFrame()
  1471. store.append('df', df, chunksize=1)
  1472. result = store.select('df')
  1473. tm.assert_frame_equal(result, df)
  1474. store.append('df1', df, expectedrows=10)
  1475. result = store.select('df1')
  1476. tm.assert_frame_equal(result, df)
  1477. # more chunksize in append tests
  1478. def check(obj, comparator):
  1479. for c in [10, 200, 1000]:
  1480. with ensure_clean_store(self.path, mode='w') as store:
  1481. store.append('obj', obj, chunksize=c)
  1482. result = store.select('obj')
  1483. comparator(result, obj)
  1484. df = tm.makeDataFrame()
  1485. df['string'] = 'foo'
  1486. df['float322'] = 1.
  1487. df['float322'] = df['float322'].astype('float32')
  1488. df['bool'] = df['float322'] > 0
  1489. df['time1'] = Timestamp('20130101')
  1490. df['time2'] = Timestamp('20130102')
  1491. check(df, tm.assert_frame_equal)
  1492. p = tm.makePanel()
  1493. check(p, assert_panel_equal)
  1494. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  1495. p4d = tm.makePanel4D()
  1496. check(p4d, assert_panel4d_equal)
  1497. # empty frame, GH4273
  1498. with ensure_clean_store(self.path) as store:
  1499. # 0 len
  1500. df_empty = DataFrame(columns=list('ABC'))
  1501. store.append('df', df_empty)
  1502. self.assertRaises(KeyError, store.select, 'df')
  1503. # repeated append of 0/non-zero frames
  1504. df = DataFrame(np.random.rand(10, 3), columns=list('ABC'))
  1505. store.append('df', df)
  1506. assert_frame_equal(store.select('df'), df)
  1507. store.append('df', df_empty)
  1508. assert_frame_equal(store.select('df'), df)
  1509. # store
  1510. df = DataFrame(columns=list('ABC'))
  1511. store.put('df2', df)
  1512. assert_frame_equal(store.select('df2'), df)
  1513. # 0 len
  1514. p_empty = Panel(items=list('ABC'))
  1515. store.append('p', p_empty)
  1516. self.assertRaises(KeyError, store.select, 'p')
  1517. # repeated append of 0/non-zero frames
  1518. p = Panel(np.random.randn(3, 4, 5), items=list('ABC'))
  1519. store.append('p', p)
  1520. assert_panel_equal(store.select('p'), p)
  1521. store.append('p', p_empty)
  1522. assert_panel_equal(store.select('p'), p)
  1523. # store
  1524. store.put('p2', p_empty)
  1525. assert_panel_equal(store.select('p2'), p_empty)
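# Chunked-write sketch (sizes illustrative): chunksize batches the write and
# expectedrows is only a PyTables sizing hint; neither changes what is read
# back.
#   >>> store.append('obj', obj, chunksize=1000)
#   >>> store.append('obj2', obj, expectedrows=10)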
  1526. def test_append_raise(self):
  1527. with ensure_clean_store(self.path) as store:
  1528. # test append with invalid input to get good error messages
  1529. # list in column
  1530. df = tm.makeDataFrame()
  1531. df['invalid'] = [['a']] * len(df)
  1532. self.assertEqual(df.dtypes['invalid'], np.object_)
  1533. self.assertRaises(TypeError, store.append, 'df', df)
  1534. # multiple invalid columns
  1535. df['invalid2'] = [['a']] * len(df)
  1536. df['invalid3'] = [['a']] * len(df)
  1537. self.assertRaises(TypeError, store.append, 'df', df)
  1538. # datetime with embedded nans as object
  1539. df = tm.makeDataFrame()
  1540. s = Series(datetime.datetime(2001, 1, 2), index=df.index)
  1541. s = s.astype(object)
  1542. s[0:5] = np.nan
  1543. df['invalid'] = s
  1544. self.assertEqual(df.dtypes['invalid'], np.object_)
  1545. self.assertRaises(TypeError, store.append, 'df', df)
1546. # ndarray directly
  1547. self.assertRaises(TypeError, store.append, 'df', np.arange(10))
  1548. # series directly
  1549. self.assertRaises(TypeError, store.append,
  1550. 'df', Series(np.arange(10)))
1551. # appending an incompatible table
  1552. df = tm.makeDataFrame()
  1553. store.append('df', df)
  1554. df['foo'] = 'foo'
  1555. self.assertRaises(ValueError, store.append, 'df', df)
  1556. def test_table_index_incompatible_dtypes(self):
  1557. df1 = DataFrame({'a': [1, 2, 3]})
  1558. df2 = DataFrame({'a': [4, 5, 6]},
  1559. index=date_range('1/1/2000', periods=3))
  1560. with ensure_clean_store(self.path) as store:
  1561. store.put('frame', df1, format='table')
  1562. self.assertRaises(TypeError, store.put, 'frame', df2,
  1563. format='table', append=True)
  1564. def test_table_values_dtypes_roundtrip(self):
  1565. with ensure_clean_store(self.path) as store:
  1566. df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
  1567. store.append('df_f8', df1)
  1568. assert_series_equal(df1.dtypes, store['df_f8'].dtypes)
  1569. df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
  1570. store.append('df_i8', df2)
  1571. assert_series_equal(df2.dtypes, store['df_i8'].dtypes)
  1572. # incompatible dtype
  1573. self.assertRaises(ValueError, store.append, 'df_i8', df1)
  1574. # check creation/storage/retrieval of float32 (a bit hacky to
1575. # actually create them, though)
  1576. df1 = DataFrame(
  1577. np.array([[1], [2], [3]], dtype='f4'), columns=['A'])
  1578. store.append('df_f4', df1)
  1579. assert_series_equal(df1.dtypes, store['df_f4'].dtypes)
  1580. assert df1.dtypes[0] == 'float32'
  1581. # check with mixed dtypes
  1582. df1 = DataFrame(dict([(c, Series(np.random.randn(5), dtype=c))
  1583. for c in ['float32', 'float64', 'int32',
  1584. 'int64', 'int16', 'int8']]))
  1585. df1['string'] = 'foo'
  1586. df1['float322'] = 1.
  1587. df1['float322'] = df1['float322'].astype('float32')
  1588. df1['bool'] = df1['float32'] > 0
  1589. df1['time1'] = Timestamp('20130101')
  1590. df1['time2'] = Timestamp('20130102')
  1591. store.append('df_mixed_dtypes1', df1)
  1592. result = store.select('df_mixed_dtypes1').get_dtype_counts()
  1593. expected = Series({'float32': 2, 'float64': 1, 'int32': 1,
  1594. 'bool': 1, 'int16': 1, 'int8': 1,
  1595. 'int64': 1, 'object': 1, 'datetime64[ns]': 2})
1596. result = result.sort_index()
1597. expected = expected.sort_index()
  1598. tm.assert_series_equal(result, expected)
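# Dtype round-trip sketch: table appends preserve dtypes, and appending a
# frame whose dtypes conflict with the existing table raises ValueError.
#   >>> store.append('df_i8', DataFrame({'a': [1, 2]}, dtype='i8'))
#   >>> store.append('df_i8', DataFrame({'a': [1.]}, dtype='f8'))  # ValueError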
  1599. def test_table_mixed_dtypes(self):
  1600. # frame
  1601. df = tm.makeDataFrame()
  1602. df['obj1'] = 'foo'
  1603. df['obj2'] = 'bar'
  1604. df['bool1'] = df['A'] > 0
  1605. df['bool2'] = df['B'] > 0
  1606. df['bool3'] = True
  1607. df['int1'] = 1
  1608. df['int2'] = 2
  1609. df['timestamp1'] = Timestamp('20010102')
  1610. df['timestamp2'] = Timestamp('20010103')
  1611. df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
  1612. df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
  1613. df.ix[3:6, ['obj1']] = np.nan
  1614. df = df.consolidate()._convert(datetime=True)
  1615. with ensure_clean_store(self.path) as store:
  1616. store.append('df1_mixed', df)
  1617. tm.assert_frame_equal(store.select('df1_mixed'), df)
  1618. # panel
  1619. wp = tm.makePanel()
  1620. wp['obj1'] = 'foo'
  1621. wp['obj2'] = 'bar'
  1622. wp['bool1'] = wp['ItemA'] > 0
  1623. wp['bool2'] = wp['ItemB'] > 0
  1624. wp['int1'] = 1
  1625. wp['int2'] = 2
  1626. wp = wp.consolidate()
  1627. with ensure_clean_store(self.path) as store:
  1628. store.append('p1_mixed', wp)
  1629. assert_panel_equal(store.select('p1_mixed'), wp)
  1630. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  1631. # ndim
  1632. wp = tm.makePanel4D()
  1633. wp['obj1'] = 'foo'
  1634. wp['obj2'] = 'bar'
  1635. wp['bool1'] = wp['l1'] > 0
  1636. wp['bool2'] = wp['l2'] > 0
  1637. wp['int1'] = 1
  1638. wp['int2'] = 2
  1639. wp = wp.consolidate()
  1640. with ensure_clean_store(self.path) as store:
  1641. store.append('p4d_mixed', wp)
  1642. assert_panel4d_equal(store.select('p4d_mixed'), wp)
  1643. def test_unimplemented_dtypes_table_columns(self):
  1644. with ensure_clean_store(self.path) as store:
  1645. l = [('date', datetime.date(2001, 1, 2))]
  1646. # py3 ok for unicode
  1647. if not compat.PY3:
  1648. l.append(('unicode', u('\\u03c3')))
1649. # currently unsupported dtypes
  1650. for n, f in l:
  1651. df = tm.makeDataFrame()
  1652. df[n] = f
  1653. self.assertRaises(
  1654. TypeError, store.append, 'df1_%s' % n, df)
  1655. # frame
  1656. df = tm.makeDataFrame()
  1657. df['obj1'] = 'foo'
  1658. df['obj2'] = 'bar'
  1659. df['datetime1'] = datetime.date(2001, 1, 2)
  1660. df = df.consolidate()._convert(datetime=True)
  1661. with ensure_clean_store(self.path) as store:
1662. # this fails because we have a date in the object block
  1663. self.assertRaises(TypeError, store.append, 'df_unimplemented', df)
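# Why the TypeError above: datetime.date values (and unicode on py2) sit in
# object blocks that the table format cannot serialize to a known column type.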
  1664. def test_calendar_roundtrip_issue(self):
  1665. # 8591
  1666. # doc example from tseries holiday section
  1667. weekmask_egypt = 'Sun Mon Tue Wed Thu'
  1668. holidays = ['2012-05-01',
  1669. datetime.datetime(2013, 5, 1), np.datetime64('2014-05-01')]
  1670. bday_egypt = pandas.offsets.CustomBusinessDay(
  1671. holidays=holidays, weekmask=weekmask_egypt)
  1672. dt = datetime.datetime(2013, 4, 30)
  1673. dts = date_range(dt, periods=5, freq=bday_egypt)
  1674. s = (Series(dts.weekday, dts).map(
  1675. Series('Mon Tue Wed Thu Fri Sat Sun'.split())))
  1676. with ensure_clean_store(self.path) as store:
  1677. store.put('fixed', s)
  1678. result = store.select('fixed')
  1679. assert_series_equal(result, s)
  1680. store.append('table', s)
  1681. result = store.select('table')
  1682. assert_series_equal(result, s)
  1683. def test_append_with_timedelta(self):
  1684. # GH 3577
  1685. # append timedelta
  1686. from datetime import timedelta
  1687. df = DataFrame(dict(A=Timestamp('20130101'), B=[Timestamp(
  1688. '20130101') + timedelta(days=i, seconds=10) for i in range(10)]))
  1689. df['C'] = df['A'] - df['B']
  1690. df.ix[3:5, 'C'] = np.nan
  1691. with ensure_clean_store(self.path) as store:
  1692. # table
  1693. _maybe_remove(store, 'df')
  1694. store.append('df', df, data_columns=True)
  1695. result = store.select('df')
  1696. assert_frame_equal(result, df)
  1697. result = store.select('df', Term("C<100000"))
  1698. assert_frame_equal(result, df)
  1699. result = store.select('df', Term("C", "<", -3 * 86400))
  1700. assert_frame_equal(result, df.iloc[3:])
  1701. result = store.select('df', "C<'-3D'")
  1702. assert_frame_equal(result, df.iloc[3:])
  1703. # a bit hacky here as we don't really deal with the NaT properly
  1704. result = store.select('df', "C<'-500000s'")
  1705. result = result.dropna(subset=['C'])
  1706. assert_frame_equal(result, df.iloc[6:])
  1707. result = store.select('df', "C<'-3.5D'")
  1708. result = result.iloc[1:]
  1709. assert_frame_equal(result, df.iloc[4:])
  1710. # fixed
  1711. _maybe_remove(store, 'df2')
  1712. store.put('df2', df)
  1713. result = store.select('df2')
  1714. assert_frame_equal(result, df)
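# Timedelta query sketch: a timedelta64 data column can be compared against
# an offset string or (legacy Term form) a number of seconds.
#   >>> store.select('df', "C<'-3D'")
#   >>> store.select('df', Term("C", "<", -3 * 86400))  # seconds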
  1715. def test_remove(self):
  1716. with ensure_clean_store(self.path) as store:
  1717. ts = tm.makeTimeSeries()
  1718. df = tm.makeDataFrame()
  1719. store['a'] = ts
  1720. store['b'] = df
  1721. _maybe_remove(store, 'a')
  1722. self.assertEqual(len(store), 1)
  1723. tm.assert_frame_equal(df, store['b'])
  1724. _maybe_remove(store, 'b')
  1725. self.assertEqual(len(store), 0)
  1726. # nonexistence
  1727. self.assertRaises(KeyError, store.remove, 'a_nonexistent_store')
  1728. # pathing
  1729. store['a'] = ts
  1730. store['b/foo'] = df
  1731. _maybe_remove(store, 'foo')
  1732. _maybe_remove(store, 'b/foo')
  1733. self.assertEqual(len(store), 1)
  1734. store['a'] = ts
  1735. store['b/foo'] = df
  1736. _maybe_remove(store, 'b')
  1737. self.assertEqual(len(store), 1)
  1738. # __delitem__
  1739. store['a'] = ts
  1740. store['b'] = df
  1741. del store['a']
  1742. del store['b']
  1743. self.assertEqual(len(store), 0)
  1744. def test_remove_where(self):
  1745. with ensure_clean_store(self.path) as store:
1746. # non-existence
  1747. crit1 = Term('index>foo')
  1748. self.assertRaises(KeyError, store.remove, 'a', [crit1])
  1749. # try to remove non-table (with crit)
  1750. # non-table ok (where = None)
  1751. wp = tm.makePanel(30)
  1752. store.put('wp', wp, format='table')
  1753. store.remove('wp', ["minor_axis=['A', 'D']"])
  1754. rs = store.select('wp')
  1755. expected = wp.reindex(minor_axis=['B', 'C'])
  1756. assert_panel_equal(rs, expected)
  1757. # empty where
  1758. _maybe_remove(store, 'wp')
  1759. store.put('wp', wp, format='table')
1760. # number of rows deleted (entire table)
  1761. n = store.remove('wp', [])
  1762. self.assertTrue(n == 120)
1763. # non-empty where
  1764. _maybe_remove(store, 'wp')
  1765. store.put('wp', wp, format='table')
  1766. self.assertRaises(ValueError, store.remove,
  1767. 'wp', ['foo'])
1768. # selecting a non-table with a where
  1769. # store.put('wp2', wp, format='f')
  1770. # self.assertRaises(ValueError, store.remove,
  1771. # 'wp2', [('column', ['A', 'D'])])
  1772. def test_remove_startstop(self):
  1773. # GH #4835 and #6177
  1774. with ensure_clean_store(self.path) as store:
  1775. wp = tm.makePanel(30)
  1776. # start
  1777. _maybe_remove(store, 'wp1')
  1778. store.put('wp1', wp, format='t')
  1779. n = store.remove('wp1', start=32)
  1780. self.assertTrue(n == 120 - 32)
  1781. result = store.select('wp1')
  1782. expected = wp.reindex(major_axis=wp.major_axis[:32 // 4])
  1783. assert_panel_equal(result, expected)
  1784. _maybe_remove(store, 'wp2')
  1785. store.put('wp2', wp, format='t')
  1786. n = store.remove('wp2', start=-32)
  1787. self.assertTrue(n == 32)
  1788. result = store.select('wp2')
  1789. expected = wp.reindex(major_axis=wp.major_axis[:-32 // 4])
  1790. assert_panel_equal(result, expected)
  1791. # stop
  1792. _maybe_remove(store, 'wp3')
  1793. store.put('wp3', wp, format='t')
  1794. n = store.remove('wp3', stop=32)
  1795. self.assertTrue(n == 32)
  1796. result = store.select('wp3')
  1797. expected = wp.reindex(major_axis=wp.major_axis[32 // 4:])
  1798. assert_panel_equal(result, expected)
  1799. _maybe_remove(store, 'wp4')
  1800. store.put('wp4', wp, format='t')
  1801. n = store.remove('wp4', stop=-32)
  1802. self.assertTrue(n == 120 - 32)
  1803. result = store.select('wp4')
  1804. expected = wp.reindex(major_axis=wp.major_axis[-32 // 4:])
  1805. assert_panel_equal(result, expected)
1806. # start and stop
  1807. _maybe_remove(store, 'wp5')
  1808. store.put('wp5', wp, format='t')
  1809. n = store.remove('wp5', start=16, stop=-16)
  1810. self.assertTrue(n == 120 - 32)
  1811. result = store.select('wp5')
  1812. expected = wp.reindex(major_axis=wp.major_axis[
  1813. :16 // 4].union(wp.major_axis[-16 // 4:]))
  1814. assert_panel_equal(result, expected)
  1815. _maybe_remove(store, 'wp6')
  1816. store.put('wp6', wp, format='t')
  1817. n = store.remove('wp6', start=16, stop=16)
  1818. self.assertTrue(n == 0)
  1819. result = store.select('wp6')
  1820. expected = wp.reindex(major_axis=wp.major_axis)
  1821. assert_panel_equal(result, expected)
  1822. # with where
  1823. _maybe_remove(store, 'wp7')
  1824. # TODO: unused?
  1825. date = wp.major_axis.take(np.arange(0, 30, 3)) # noqa
  1826. crit = Term('major_axis=date')
  1827. store.put('wp7', wp, format='t')
  1828. n = store.remove('wp7', where=[crit], stop=80)
  1829. self.assertTrue(n == 28)
  1830. result = store.select('wp7')
  1831. expected = wp.reindex(major_axis=wp.major_axis.difference(
  1832. wp.major_axis[np.arange(0, 20, 3)]))
  1833. assert_panel_equal(result, expected)
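# start/stop sketch: remove() coordinates count table rows (here 4 minor-axis
# values per major_axis entry), negatives index from the end, and the return
# value is the number of rows removed.
#   >>> n = store.remove('wp', start=16, stop=-16)  # drop the middle block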
  1834. def test_remove_crit(self):
  1835. with ensure_clean_store(self.path) as store:
  1836. wp = tm.makePanel(30)
  1837. # group row removal
  1838. _maybe_remove(store, 'wp3')
  1839. date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10])
  1840. crit4 = Term('major_axis=date4')
  1841. store.put('wp3', wp, format='t')
  1842. n = store.remove('wp3', where=[crit4])
  1843. self.assertTrue(n == 36)
  1844. result = store.select('wp3')
  1845. expected = wp.reindex(major_axis=wp.major_axis.difference(date4))
  1846. assert_panel_equal(result, expected)
  1847. # upper half
  1848. _maybe_remove(store, 'wp')
  1849. store.put('wp', wp, format='table')
  1850. date = wp.major_axis[len(wp.major_axis) // 2]
  1851. crit1 = Term('major_axis>date')
  1852. crit2 = Term("minor_axis=['A', 'D']")
  1853. n = store.remove('wp', where=[crit1])
  1854. self.assertTrue(n == 56)
  1855. n = store.remove('wp', where=[crit2])
  1856. self.assertTrue(n == 32)
  1857. result = store['wp']
  1858. expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
  1859. assert_panel_equal(result, expected)
  1860. # individual row elements
  1861. _maybe_remove(store, 'wp2')
  1862. store.put('wp2', wp, format='table')
  1863. date1 = wp.major_axis[1:3]
  1864. crit1 = Term('major_axis=date1')
  1865. store.remove('wp2', where=[crit1])
  1866. result = store.select('wp2')
  1867. expected = wp.reindex(major_axis=wp.major_axis.difference(date1))
  1868. assert_panel_equal(result, expected)
  1869. date2 = wp.major_axis[5]
  1870. crit2 = Term('major_axis=date2')
  1871. store.remove('wp2', where=[crit2])
  1872. result = store['wp2']
  1873. expected = wp.reindex(major_axis=wp.major_axis.difference(date1)
  1874. .difference(Index([date2])))
  1875. assert_panel_equal(result, expected)
  1876. date3 = [wp.major_axis[7], wp.major_axis[9]]
  1877. crit3 = Term('major_axis=date3')
  1878. store.remove('wp2', where=[crit3])
  1879. result = store['wp2']
  1880. expected = wp.reindex(major_axis=wp.major_axis
  1881. .difference(date1)
  1882. .difference(Index([date2]))
  1883. .difference(Index(date3)))
  1884. assert_panel_equal(result, expected)
  1885. # corners
  1886. _maybe_remove(store, 'wp4')
  1887. store.put('wp4', wp, format='table')
  1888. n = store.remove(
  1889. 'wp4', where=[Term('major_axis>wp.major_axis[-1]')])
  1890. result = store.select('wp4')
  1891. assert_panel_equal(result, wp)
  1892. def test_invalid_terms(self):
  1893. with ensure_clean_store(self.path) as store:
  1894. with compat_assert_produces_warning(FutureWarning):
  1895. df = tm.makeTimeDataFrame()
  1896. df['string'] = 'foo'
  1897. df.ix[0:4, 'string'] = 'bar'
  1898. wp = tm.makePanel()
  1899. p4d = tm.makePanel4D()
  1900. store.put('df', df, format='table')
  1901. store.put('wp', wp, format='table')
  1902. store.put('p4d', p4d, format='table')
  1903. # some invalid terms
  1904. self.assertRaises(ValueError, store.select,
  1905. 'wp', "minor=['A', 'B']")
  1906. self.assertRaises(ValueError, store.select,
  1907. 'wp', ["index=['20121114']"])
  1908. self.assertRaises(ValueError, store.select, 'wp', [
  1909. "index=['20121114', '20121114']"])
  1910. self.assertRaises(TypeError, Term)
  1911. # more invalid
  1912. self.assertRaises(
  1913. ValueError, store.select, 'df', 'df.index[3]')
  1914. self.assertRaises(SyntaxError, store.select, 'df', 'index>')
  1915. self.assertRaises(
  1916. ValueError, store.select, 'wp',
  1917. "major_axis<'20000108' & minor_axis['A', 'B']")
  1918. # from the docs
  1919. with ensure_clean_path(self.path) as path:
  1920. dfq = DataFrame(np.random.randn(10, 4), columns=list(
  1921. 'ABCD'), index=date_range('20130101', periods=10))
  1922. dfq.to_hdf(path, 'dfq', format='table', data_columns=True)
  1923. # check ok
  1924. read_hdf(path, 'dfq',
  1925. where="index>Timestamp('20130104') & columns=['A', 'B']")
  1926. read_hdf(path, 'dfq', where="A>0 or C>0")
  1927. # catch the invalid reference
  1928. with ensure_clean_path(self.path) as path:
  1929. dfq = DataFrame(np.random.randn(10, 4), columns=list(
  1930. 'ABCD'), index=date_range('20130101', periods=10))
  1931. dfq.to_hdf(path, 'dfq', format='table')
  1932. self.assertRaises(ValueError, read_hdf, path,
  1933. 'dfq', where="A>0 or C>0")
  1934. def test_terms(self):
  1935. with ensure_clean_store(self.path) as store:
  1936. wp = tm.makePanel()
  1937. wpneg = Panel.fromDict({-1: tm.makeDataFrame(),
  1938. 0: tm.makeDataFrame(),
  1939. 1: tm.makeDataFrame()})
  1940. with compat_assert_produces_warning(FutureWarning):
  1941. p4d = tm.makePanel4D()
  1942. store.put('p4d', p4d, format='table')
  1943. store.put('wp', wp, format='table')
  1944. store.put('wpneg', wpneg, format='table')
  1945. # panel
  1946. result = store.select('wp', [Term(
  1947. 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")])
  1948. expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
  1949. assert_panel_equal(result, expected)
  1950. # with deprecation
  1951. result = store.select('wp', [Term(
  1952. 'major_axis', '<', "20000108"), Term("minor_axis=['A', 'B']")])
  1953. expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
  1954. tm.assert_panel_equal(result, expected)
  1955. # p4d
  1956. with compat_assert_produces_warning(FutureWarning):
  1957. result = store.select('p4d',
  1958. [Term('major_axis<"20000108"'),
  1959. Term("minor_axis=['A', 'B']"),
  1960. Term("items=['ItemA', 'ItemB']")])
  1961. expected = p4d.truncate(after='20000108').reindex(
  1962. minor=['A', 'B'], items=['ItemA', 'ItemB'])
  1963. assert_panel4d_equal(result, expected)
  1964. # back compat invalid terms
  1965. terms = [dict(field='major_axis', op='>', value='20121114'),
  1966. [dict(field='major_axis', op='>', value='20121114')],
  1967. ["minor_axis=['A','B']",
  1968. dict(field='major_axis', op='>', value='20121114')]]
  1969. for t in terms:
  1970. with tm.assert_produces_warning(expected_warning=FutureWarning,
  1971. check_stacklevel=False):
  1972. Term(t)
  1973. with compat_assert_produces_warning(FutureWarning):
  1974. # valid terms
  1975. terms = [('major_axis=20121114'),
  1976. ('major_axis>20121114'),
  1977. (("major_axis=['20121114', '20121114']"),),
  1978. ('major_axis=datetime.datetime(2012, 11, 14)'),
  1979. 'major_axis> 20121114',
  1980. 'major_axis >20121114',
  1981. 'major_axis > 20121114',
  1982. (("minor_axis=['A', 'B']"),),
  1983. (("minor_axis=['A', 'B']"),),
  1984. ((("minor_axis==['A', 'B']"),),),
  1985. (("items=['ItemA', 'ItemB']"),),
  1986. ('items=ItemA'),
  1987. ]
  1988. for t in terms:
  1989. store.select('wp', t)
  1990. store.select('p4d', t)
  1991. # valid for p4d only
  1992. terms = [(("labels=['l1', 'l2']"),),
  1993. Term("labels=['l1', 'l2']"),
  1994. ]
  1995. for t in terms:
  1996. store.select('p4d', t)
  1997. with tm.assertRaisesRegexp(TypeError,
  1998. 'Only named functions are supported'):
  1999. store.select('wp', Term(
  2000. 'major_axis == (lambda x: x)("20130101")'))
  2001. # check USub node parsing
  2002. res = store.select('wpneg', Term('items == -1'))
  2003. expected = Panel({-1: wpneg[-1]})
  2004. tm.assert_panel_equal(res, expected)
  2005. with tm.assertRaisesRegexp(NotImplementedError,
  2006. 'Unary addition not supported'):
  2007. store.select('wpneg', Term('items == +1'))
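# Query-syntax sketch: plain expression strings are the current spelling;
# Term objects and (field, op, value) tuples are the deprecated forms
# exercised above.
#   >>> store.select('wp', "major_axis<'20000108' & minor_axis=['A', 'B']")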
  2008. def test_term_compat(self):
  2009. with ensure_clean_store(self.path) as store:
  2010. wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
  2011. major_axis=date_range('1/1/2000', periods=5),
  2012. minor_axis=['A', 'B', 'C', 'D'])
  2013. store.append('wp', wp)
  2014. result = store.select('wp', [Term('major_axis>20000102'),
  2015. Term('minor_axis', '=', ['A', 'B'])])
  2016. expected = wp.loc[:, wp.major_axis >
  2017. Timestamp('20000102'), ['A', 'B']]
  2018. assert_panel_equal(result, expected)
  2019. store.remove('wp', Term('major_axis>20000103'))
  2020. result = store.select('wp')
  2021. expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :]
  2022. assert_panel_equal(result, expected)
  2023. with ensure_clean_store(self.path) as store:
  2024. wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
  2025. major_axis=date_range('1/1/2000', periods=5),
  2026. minor_axis=['A', 'B', 'C', 'D'])
  2027. store.append('wp', wp)
  2028. # stringified datetimes
  2029. result = store.select(
  2030. 'wp', [Term('major_axis', '>', datetime.datetime(2000, 1, 2))])
  2031. expected = wp.loc[:, wp.major_axis > Timestamp('20000102')]
  2032. assert_panel_equal(result, expected)
  2033. result = store.select(
  2034. 'wp', [Term('major_axis', '>',
  2035. datetime.datetime(2000, 1, 2, 0, 0))])
  2036. expected = wp.loc[:, wp.major_axis > Timestamp('20000102')]
  2037. assert_panel_equal(result, expected)
  2038. result = store.select(
  2039. 'wp', [Term('major_axis', '=',
  2040. [datetime.datetime(2000, 1, 2, 0, 0),
  2041. datetime.datetime(2000, 1, 3, 0, 0)])])
  2042. expected = wp.loc[:, [Timestamp('20000102'),
  2043. Timestamp('20000103')]]
  2044. assert_panel_equal(result, expected)
  2045. result = store.select('wp', [Term('minor_axis', '=', ['A', 'B'])])
  2046. expected = wp.loc[:, :, ['A', 'B']]
  2047. assert_panel_equal(result, expected)
  2048. def test_backwards_compat_without_term_object(self):
  2049. with ensure_clean_store(self.path) as store:
  2050. wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
  2051. major_axis=date_range('1/1/2000', periods=5),
  2052. minor_axis=['A', 'B', 'C', 'D'])
  2053. store.append('wp', wp)
  2054. with assert_produces_warning(expected_warning=FutureWarning,
  2055. check_stacklevel=False):
  2056. result = store.select('wp', [('major_axis>20000102'),
  2057. ('minor_axis', '=', ['A', 'B'])])
  2058. expected = wp.loc[:,
  2059. wp.major_axis > Timestamp('20000102'),
  2060. ['A', 'B']]
  2061. assert_panel_equal(result, expected)
  2062. store.remove('wp', ('major_axis>20000103'))
  2063. result = store.select('wp')
  2064. expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :]
  2065. assert_panel_equal(result, expected)
  2066. with ensure_clean_store(self.path) as store:
  2067. wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
  2068. major_axis=date_range('1/1/2000', periods=5),
  2069. minor_axis=['A', 'B', 'C', 'D'])
  2070. store.append('wp', wp)
  2071. # stringified datetimes
  2072. with assert_produces_warning(expected_warning=FutureWarning,
  2073. check_stacklevel=False):
  2074. result = store.select('wp',
  2075. [('major_axis',
  2076. '>',
  2077. datetime.datetime(2000, 1, 2))])
  2078. expected = wp.loc[:, wp.major_axis > Timestamp('20000102')]
  2079. assert_panel_equal(result, expected)
  2080. with assert_produces_warning(expected_warning=FutureWarning,
  2081. check_stacklevel=False):
  2082. result = store.select('wp',
  2083. [('major_axis',
  2084. '>',
  2085. datetime.datetime(2000, 1, 2, 0, 0))])
  2086. expected = wp.loc[:, wp.major_axis > Timestamp('20000102')]
  2087. assert_panel_equal(result, expected)
  2088. with assert_produces_warning(expected_warning=FutureWarning,
  2089. check_stacklevel=False):
  2090. result = store.select('wp',
  2091. [('major_axis',
  2092. '=',
  2093. [datetime.datetime(2000, 1, 2, 0, 0),
  2094. datetime.datetime(2000, 1, 3, 0, 0)])]
  2095. )
  2096. expected = wp.loc[:, [Timestamp('20000102'),
  2097. Timestamp('20000103')]]
  2098. assert_panel_equal(result, expected)
  2099. def test_same_name_scoping(self):
  2100. with ensure_clean_store(self.path) as store:
  2101. import pandas as pd
  2102. df = DataFrame(np.random.randn(20, 2),
  2103. index=pd.date_range('20130101', periods=20))
  2104. store.put('df', df, format='table')
  2105. expected = df[df.index > pd.Timestamp('20130105')]
  2106. import datetime # noqa
  2107. result = store.select('df', 'index>datetime.datetime(2013,1,5)')
  2108. assert_frame_equal(result, expected)
  2109. from datetime import datetime # noqa
  2110. # technically an error, but allow it
  2111. result = store.select('df', 'index>datetime.datetime(2013,1,5)')
  2112. assert_frame_equal(result, expected)
  2113. result = store.select('df', 'index>datetime(2013,1,5)')
  2114. assert_frame_equal(result, expected)
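# Scoping sketch: bare names in a where string resolve against the caller's
# namespace, so either import style below can be queried directly.
#   >>> store.select('df', 'index>datetime.datetime(2013,1,5)')
#   >>> store.select('df', 'index>datetime(2013,1,5)')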
  2115. def test_series(self):
  2116. s = tm.makeStringSeries()
  2117. self._check_roundtrip(s, tm.assert_series_equal)
  2118. ts = tm.makeTimeSeries()
  2119. self._check_roundtrip(ts, tm.assert_series_equal)
  2120. ts2 = Series(ts.index, Index(ts.index, dtype=object))
  2121. self._check_roundtrip(ts2, tm.assert_series_equal)
  2122. ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object),
  2123. dtype=object))
  2124. self._check_roundtrip(ts3, tm.assert_series_equal,
  2125. check_index_type=False)
  2126. def test_sparse_series(self):
  2127. s = tm.makeStringSeries()
  2128. s[3:5] = np.nan
  2129. ss = s.to_sparse()
  2130. self._check_roundtrip(ss, tm.assert_series_equal,
  2131. check_series_type=True)
  2132. ss2 = s.to_sparse(kind='integer')
  2133. self._check_roundtrip(ss2, tm.assert_series_equal,
  2134. check_series_type=True)
  2135. ss3 = s.to_sparse(fill_value=0)
  2136. self._check_roundtrip(ss3, tm.assert_series_equal,
  2137. check_series_type=True)
  2138. def test_sparse_frame(self):
  2139. s = tm.makeDataFrame()
  2140. s.ix[3:5, 1:3] = np.nan
  2141. s.ix[8:10, -2] = np.nan
  2142. ss = s.to_sparse()
  2143. self._check_double_roundtrip(ss, tm.assert_frame_equal,
  2144. check_frame_type=True)
  2145. ss2 = s.to_sparse(kind='integer')
  2146. self._check_double_roundtrip(ss2, tm.assert_frame_equal,
  2147. check_frame_type=True)
  2148. ss3 = s.to_sparse(fill_value=0)
  2149. self._check_double_roundtrip(ss3, tm.assert_frame_equal,
  2150. check_frame_type=True)
  2151. def test_float_index(self):
  2152. # GH #454
  2153. index = np.random.randn(10)
  2154. s = Series(np.random.randn(10), index=index)
  2155. self._check_roundtrip(s, tm.assert_series_equal)
  2156. def test_tuple_index(self):
  2157. # GH #492
  2158. col = np.arange(10)
  2159. idx = [(0., 1.), (2., 3.), (4., 5.)]
  2160. data = np.random.randn(30).reshape((3, 10))
  2161. DF = DataFrame(data, index=idx, columns=col)
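# nose emits a DeprecationWarning on Python 3.5, so accept any Warning there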
  2162. expected_warning = Warning if PY35 else PerformanceWarning
  2163. with tm.assert_produces_warning(expected_warning=expected_warning,
  2164. check_stacklevel=False):
  2165. self._check_roundtrip(DF, tm.assert_frame_equal)
  2166. def test_index_types(self):
  2167. values = np.random.randn(2)
  2168. func = lambda l, r: tm.assert_series_equal(l, r,
  2169. check_dtype=True,
  2170. check_index_type=True,
  2171. check_series_type=True)
  2172. # nose has a deprecation warning in 3.5
  2173. expected_warning = Warning if PY35 else PerformanceWarning
  2174. with tm.assert_produces_warning(expected_warning=expected_warning,
  2175. check_stacklevel=False):
  2176. ser = Series(values, [0, 'y'])
  2177. self._check_roundtrip(ser, func)
  2178. with tm.assert_produces_warning(expected_warning=expected_warning,
  2179. check_stacklevel=False):
  2180. ser = Series(values, [datetime.datetime.today(), 0])
  2181. self._check_roundtrip(ser, func)
  2182. with tm.assert_produces_warning(expected_warning=expected_warning,
  2183. check_stacklevel=False):
  2184. ser = Series(values, ['y', 0])
  2185. self._check_roundtrip(ser, func)
  2186. with tm.assert_produces_warning(expected_warning=expected_warning,
  2187. check_stacklevel=False):
  2188. ser = Series(values, [datetime.date.today(), 'a'])
  2189. self._check_roundtrip(ser, func)
  2190. with tm.assert_produces_warning(expected_warning=expected_warning,
  2191. check_stacklevel=False):
  2192. ser = Series(values, [1.23, 'b'])
  2193. self._check_roundtrip(ser, func)
  2194. ser = Series(values, [1, 1.53])
  2195. self._check_roundtrip(ser, func)
  2196. ser = Series(values, [1, 5])
  2197. self._check_roundtrip(ser, func)
  2198. ser = Series(values, [datetime.datetime(
  2199. 2012, 1, 1), datetime.datetime(2012, 1, 2)])
  2200. self._check_roundtrip(ser, func)
  2201. def test_timeseries_preepoch(self):
  2202. if sys.version_info[0] == 2 and sys.version_info[1] < 7:
  2203. raise nose.SkipTest("won't work on Python < 2.7")
  2204. dr = bdate_range('1/1/1940', '1/1/1960')
  2205. ts = Series(np.random.randn(len(dr)), index=dr)
  2206. try:
  2207. self._check_roundtrip(ts, tm.assert_series_equal)
  2208. except OverflowError:
2209. raise nose.SkipTest('known failure on some Windows platforms')
  2210. def test_frame(self):
  2211. df = tm.makeDataFrame()
  2212. # put in some random NAs
  2213. df.values[0, 0] = np.nan
  2214. df.values[5, 3] = np.nan
  2215. self._check_roundtrip_table(df, tm.assert_frame_equal)
  2216. self._check_roundtrip(df, tm.assert_frame_equal)
  2217. if not skip_compression:
  2218. self._check_roundtrip_table(df, tm.assert_frame_equal,
  2219. compression=True)
  2220. self._check_roundtrip(df, tm.assert_frame_equal,
  2221. compression=True)
  2222. tdf = tm.makeTimeDataFrame()
  2223. self._check_roundtrip(tdf, tm.assert_frame_equal)
  2224. if not skip_compression:
  2225. self._check_roundtrip(tdf, tm.assert_frame_equal,
  2226. compression=True)
  2227. with ensure_clean_store(self.path) as store:
  2228. # not consolidated
  2229. df['foo'] = np.random.randn(len(df))
  2230. store['df'] = df
  2231. recons = store['df']
  2232. self.assertTrue(recons._data.is_consolidated())
  2233. # empty
  2234. self._check_roundtrip(df[:0], tm.assert_frame_equal)
  2235. def test_empty_series_frame(self):
  2236. s0 = Series()
  2237. s1 = Series(name='myseries')
  2238. df0 = DataFrame()
  2239. df1 = DataFrame(index=['a', 'b', 'c'])
  2240. df2 = DataFrame(columns=['d', 'e', 'f'])
  2241. self._check_roundtrip(s0, tm.assert_series_equal)
  2242. self._check_roundtrip(s1, tm.assert_series_equal)
  2243. self._check_roundtrip(df0, tm.assert_frame_equal)
  2244. self._check_roundtrip(df1, tm.assert_frame_equal)
  2245. self._check_roundtrip(df2, tm.assert_frame_equal)
  2246. def test_empty_series(self):
  2247. for dtype in [np.int64, np.float64, np.object, 'm8[ns]', 'M8[ns]']:
  2248. s = Series(dtype=dtype)
  2249. self._check_roundtrip(s, tm.assert_series_equal)
  2250. def test_can_serialize_dates(self):
  2251. rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')]
  2252. frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
  2253. self._check_roundtrip(frame, tm.assert_frame_equal)
  2254. def test_store_hierarchical(self):
  2255. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
  2256. ['one', 'two', 'three']],
  2257. labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  2258. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  2259. names=['foo', 'bar'])
  2260. frame = DataFrame(np.random.randn(10, 3), index=index,
  2261. columns=['A', 'B', 'C'])
  2262. self._check_roundtrip(frame, tm.assert_frame_equal)
  2263. self._check_roundtrip(frame.T, tm.assert_frame_equal)
  2264. self._check_roundtrip(frame['A'], tm.assert_series_equal)
  2265. # check that the names are stored
  2266. with ensure_clean_store(self.path) as store:
  2267. store['frame'] = frame
  2268. recons = store['frame']
  2269. tm.assert_frame_equal(recons, frame)
  2270. def test_store_index_name(self):
  2271. df = tm.makeDataFrame()
  2272. df.index.name = 'foo'
  2273. with ensure_clean_store(self.path) as store:
  2274. store['frame'] = df
  2275. recons = store['frame']
  2276. tm.assert_frame_equal(recons, df)
  2277. def test_store_index_name_with_tz(self):
  2278. # GH 13884
  2279. df = pd.DataFrame({'A': [1, 2]})
  2280. df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788])
  2281. df.index = df.index.tz_localize('UTC')
  2282. df.index.name = 'foo'
  2283. with ensure_clean_store(self.path) as store:
  2284. store.put('frame', df, format='table')
  2285. recons = store['frame']
  2286. tm.assert_frame_equal(recons, df)
  2287. def test_store_series_name(self):
  2288. df = tm.makeDataFrame()
  2289. series = df['A']
  2290. with ensure_clean_store(self.path) as store:
  2291. store['series'] = series
  2292. recons = store['series']
  2293. tm.assert_series_equal(recons, series)
  2294. def test_store_mixed(self):
  2295. def _make_one():
  2296. df = tm.makeDataFrame()
  2297. df['obj1'] = 'foo'
  2298. df['obj2'] = 'bar'
  2299. df['bool1'] = df['A'] > 0
  2300. df['bool2'] = df['B'] > 0
  2301. df['int1'] = 1
  2302. df['int2'] = 2
  2303. return df.consolidate()
  2304. df1 = _make_one()
  2305. df2 = _make_one()
  2306. self._check_roundtrip(df1, tm.assert_frame_equal)
  2307. self._check_roundtrip(df2, tm.assert_frame_equal)
  2308. with ensure_clean_store(self.path) as store:
  2309. store['obj'] = df1
  2310. tm.assert_frame_equal(store['obj'], df1)
  2311. store['obj'] = df2
  2312. tm.assert_frame_equal(store['obj'], df2)
2313. # check that we can store Series of all of these types
  2314. self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
  2315. self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
  2316. self._check_roundtrip(df1['int1'], tm.assert_series_equal)
  2317. if not skip_compression:
  2318. self._check_roundtrip(df1['obj1'], tm.assert_series_equal,
  2319. compression=True)
  2320. self._check_roundtrip(df1['bool1'], tm.assert_series_equal,
  2321. compression=True)
  2322. self._check_roundtrip(df1['int1'], tm.assert_series_equal,
  2323. compression=True)
  2324. self._check_roundtrip(df1, tm.assert_frame_equal,
  2325. compression=True)
  2326. def test_wide(self):
  2327. wp = tm.makePanel()
  2328. self._check_roundtrip(wp, assert_panel_equal)
  2329. def test_wide_table(self):
  2330. wp = tm.makePanel()
  2331. self._check_roundtrip_table(wp, assert_panel_equal)
  2332. def test_select_with_dups(self):
  2333. # single dtypes
  2334. df = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B'])
  2335. df.index = date_range('20130101 9:30', periods=10, freq='T')
  2336. with ensure_clean_store(self.path) as store:
  2337. store.append('df', df)
  2338. result = store.select('df')
  2339. expected = df
  2340. assert_frame_equal(result, expected, by_blocks=True)
  2341. result = store.select('df', columns=df.columns)
  2342. expected = df
  2343. assert_frame_equal(result, expected, by_blocks=True)
  2344. result = store.select('df', columns=['A'])
  2345. expected = df.loc[:, ['A']]
  2346. assert_frame_equal(result, expected)
2347. # dups across dtypes
  2348. df = concat([DataFrame(np.random.randn(10, 4),
  2349. columns=['A', 'A', 'B', 'B']),
  2350. DataFrame(np.random.randint(0, 10, size=20)
  2351. .reshape(10, 2),
  2352. columns=['A', 'C'])],
  2353. axis=1)
  2354. df.index = date_range('20130101 9:30', periods=10, freq='T')
  2355. with ensure_clean_store(self.path) as store:
  2356. store.append('df', df)
  2357. result = store.select('df')
  2358. expected = df
  2359. assert_frame_equal(result, expected, by_blocks=True)
  2360. result = store.select('df', columns=df.columns)
  2361. expected = df
  2362. assert_frame_equal(result, expected, by_blocks=True)
  2363. expected = df.loc[:, ['A']]
  2364. result = store.select('df', columns=['A'])
  2365. assert_frame_equal(result, expected, by_blocks=True)
  2366. expected = df.loc[:, ['B', 'A']]
  2367. result = store.select('df', columns=['B', 'A'])
  2368. assert_frame_equal(result, expected, by_blocks=True)
  2369. # duplicates on both index and columns
  2370. with ensure_clean_store(self.path) as store:
  2371. store.append('df', df)
  2372. store.append('df', df)
  2373. expected = df.loc[:, ['B', 'A']]
  2374. expected = concat([expected, expected])
  2375. result = store.select('df', columns=['B', 'A'])
  2376. assert_frame_equal(result, expected, by_blocks=True)
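# Duplicate-label sketch: selecting with columns= returns every occurrence of
# a duplicated column label, which is why comparisons above use by_blocks.
#   >>> store.select('df', columns=['A'])  # both 'A' columns come back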
  2377. def test_wide_table_dups(self):
  2378. wp = tm.makePanel()
  2379. with ensure_clean_store(self.path) as store:
  2380. store.put('panel', wp, format='table')
  2381. store.put('panel', wp, format='table', append=True)
  2382. with tm.assert_produces_warning(expected_warning=DuplicateWarning):
  2383. recons = store['panel']
  2384. assert_panel_equal(recons, wp)
  2385. def test_long(self):
  2386. def _check(left, right):
  2387. assert_panel_equal(left.to_panel(), right.to_panel())
  2388. wp = tm.makePanel()
  2389. self._check_roundtrip(wp.to_frame(), _check)
  2390. # empty
  2391. # self._check_roundtrip(wp.to_frame()[:0], _check)
  2392. def test_longpanel(self):
  2393. pass
  2394. def test_overwrite_node(self):
  2395. with ensure_clean_store(self.path) as store:
  2396. store['a'] = tm.makeTimeDataFrame()
  2397. ts = tm.makeTimeSeries()
  2398. store['a'] = ts
  2399. tm.assert_series_equal(store['a'], ts)
  2400. def test_sparse_with_compression(self):
  2401. # GH 2931
  2402. # make sparse dataframe
  2403. arr = np.random.binomial(n=1, p=.01, size=(1000, 10))
  2404. df = DataFrame(arr).to_sparse(fill_value=0)
  2405. # case 1: store uncompressed
  2406. self._check_double_roundtrip(df, tm.assert_frame_equal,
  2407. compression=False,
  2408. check_frame_type=True)
  2409. # case 2: store compressed (works)
  2410. self._check_double_roundtrip(df, tm.assert_frame_equal,
  2411. compression='zlib',
  2412. check_frame_type=True)
  2413. # set one series to be completely sparse
  2414. df[0] = np.zeros(1000)
  2415. # case 3: store df with completely sparse series uncompressed
  2416. self._check_double_roundtrip(df, tm.assert_frame_equal,
  2417. compression=False,
  2418. check_frame_type=True)
  2419. # case 4: try storing df with completely sparse series compressed
  2420. # (fails)
  2421. self._check_double_roundtrip(df, tm.assert_frame_equal,
  2422. compression='zlib',
  2423. check_frame_type=True)
  2424. def test_select(self):
  2425. wp = tm.makePanel()
  2426. with ensure_clean_store(self.path) as store:
  2427. # put/select ok
  2428. _maybe_remove(store, 'wp')
  2429. store.put('wp', wp, format='table')
  2430. store.select('wp')
  2431. # non-table ok (where = None)
  2432. _maybe_remove(store, 'wp')
  2433. store.put('wp2', wp)
  2434. store.select('wp2')
  2435. # selection on the non-indexable with a large number of columns
  2436. wp = Panel(np.random.randn(100, 100, 100),
  2437. items=['Item%03d' % i for i in range(100)],
  2438. major_axis=date_range('1/1/2000', periods=100),
  2439. minor_axis=['E%03d' % i for i in range(100)])
  2440. _maybe_remove(store, 'wp')
  2441. store.append('wp', wp)
  2442. items = ['Item%03d' % i for i in range(80)]
  2443. result = store.select('wp', Term('items=items'))
  2444. expected = wp.reindex(items=items)
  2445. assert_panel_equal(expected, result)
2446. # selecting a non-table with a where
  2447. # self.assertRaises(ValueError, store.select,
  2448. # 'wp2', ('column', ['A', 'D']))
  2449. # select with columns=
  2450. df = tm.makeTimeDataFrame()
  2451. _maybe_remove(store, 'df')
  2452. store.append('df', df)
  2453. result = store.select('df', columns=['A', 'B'])
  2454. expected = df.reindex(columns=['A', 'B'])
  2455. tm.assert_frame_equal(expected, result)
2456. # equivalently
  2457. result = store.select('df', [("columns=['A', 'B']")])
  2458. expected = df.reindex(columns=['A', 'B'])
  2459. tm.assert_frame_equal(expected, result)
  2460. # with a data column
  2461. _maybe_remove(store, 'df')
  2462. store.append('df', df, data_columns=['A'])
  2463. result = store.select('df', ['A > 0'], columns=['A', 'B'])
  2464. expected = df[df.A > 0].reindex(columns=['A', 'B'])
  2465. tm.assert_frame_equal(expected, result)
2466. # all data columns
  2467. _maybe_remove(store, 'df')
  2468. store.append('df', df, data_columns=True)
  2469. result = store.select('df', ['A > 0'], columns=['A', 'B'])
  2470. expected = df[df.A > 0].reindex(columns=['A', 'B'])
  2471. tm.assert_frame_equal(expected, result)
  2472. # with a data column, but different columns
  2473. _maybe_remove(store, 'df')
  2474. store.append('df', df, data_columns=['A'])
  2475. result = store.select('df', ['A > 0'], columns=['C', 'D'])
  2476. expected = df[df.A > 0].reindex(columns=['C', 'D'])
  2477. tm.assert_frame_equal(expected, result)
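# data_columns sketch: only columns written as data_columns are queryable via
# where=, while columns= just trims the result after the row selection.
#   >>> store.append('df', df, data_columns=['A'])
#   >>> store.select('df', ['A > 0'], columns=['C', 'D'])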
  2478. def test_select_dtypes(self):
  2479. with ensure_clean_store(self.path) as store:
  2480. # with a Timestamp data column (GH #2637)
  2481. df = DataFrame(dict(
  2482. ts=bdate_range('2012-01-01', periods=300),
  2483. A=np.random.randn(300)))
  2484. _maybe_remove(store, 'df')
  2485. store.append('df', df, data_columns=['ts', 'A'])
  2486. result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")])
  2487. expected = df[df.ts >= Timestamp('2012-02-01')]
  2488. tm.assert_frame_equal(expected, result)
  2489. # bool columns (GH #2849)
  2490. df = DataFrame(np.random.randn(5, 2), columns=['A', 'B'])
  2491. df['object'] = 'foo'
  2492. df.ix[4:5, 'object'] = 'bar'
  2493. df['boolv'] = df['A'] > 0
  2494. _maybe_remove(store, 'df')
  2495. store.append('df', df, data_columns=True)
  2496. expected = (df[df.boolv == True] # noqa
  2497. .reindex(columns=['A', 'boolv']))
  2498. for v in [True, 'true', 1]:
  2499. result = store.select('df', Term(
  2500. 'boolv == %s' % str(v)), columns=['A', 'boolv'])
  2501. tm.assert_frame_equal(expected, result)
  2502. expected = (df[df.boolv == False] # noqa
  2503. .reindex(columns=['A', 'boolv']))
  2504. for v in [False, 'false', 0]:
  2505. result = store.select('df', Term(
  2506. 'boolv == %s' % str(v)), columns=['A', 'boolv'])
  2507. tm.assert_frame_equal(expected, result)
  2508. # integer index
  2509. df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
  2510. _maybe_remove(store, 'df_int')
  2511. store.append('df_int', df)
  2512. result = store.select(
  2513. 'df_int', [Term("index<10"), Term("columns=['A']")])
  2514. expected = df.reindex(index=list(df.index)[0:10], columns=['A'])
  2515. tm.assert_frame_equal(expected, result)
  2516. # float index
  2517. df = DataFrame(dict(A=np.random.rand(
  2518. 20), B=np.random.rand(20), index=np.arange(20, dtype='f8')))
  2519. _maybe_remove(store, 'df_float')
  2520. store.append('df_float', df)
  2521. result = store.select(
  2522. 'df_float', [Term("index<10.0"), Term("columns=['A']")])
  2523. expected = df.reindex(index=list(df.index)[0:10], columns=['A'])
  2524. tm.assert_frame_equal(expected, result)
  2525. with ensure_clean_store(self.path) as store:
  2526. # floats w/o NaN
  2527. df = DataFrame(
  2528. dict(cols=range(11), values=range(11)), dtype='float64')
  2529. df['cols'] = (df['cols'] + 10).apply(str)
  2530. store.append('df1', df, data_columns=True)
  2531. result = store.select(
  2532. 'df1', where='values>2.0')
  2533. expected = df[df['values'] > 2.0]
  2534. tm.assert_frame_equal(expected, result)
  2535. # floats with NaN
  2536. df.iloc[0] = np.nan
  2537. expected = df[df['values'] > 2.0]
  2538. store.append('df2', df, data_columns=True, index=False)
  2539. result = store.select(
  2540. 'df2', where='values>2.0')
  2541. tm.assert_frame_equal(expected, result)
  2542. # https://github.com/PyTables/PyTables/issues/282
  2543. # bug in selection when 0th row has a np.nan and an index
  2544. # store.append('df3',df,data_columns=True)
  2545. # result = store.select(
  2546. # 'df3', where='values>2.0')
  2547. # tm.assert_frame_equal(expected, result)
2548. # a float with NaN not in the first position is ok too
  2549. df = DataFrame(
  2550. dict(cols=range(11), values=range(11)), dtype='float64')
  2551. df['cols'] = (df['cols'] + 10).apply(str)
  2552. df.iloc[1] = np.nan
  2553. expected = df[df['values'] > 2.0]
  2554. store.append('df4', df, data_columns=True)
  2555. result = store.select(
  2556. 'df4', where='values>2.0')
  2557. tm.assert_frame_equal(expected, result)
  2558. # test selection with comparison against numpy scalar
  2559. # GH 11283
  2560. with ensure_clean_store(self.path) as store:
  2561. df = tm.makeDataFrame()
  2562. expected = df[df['A'] > 0]
  2563. store.append('df', df, data_columns=True)
  2564. np_zero = np.float64(0) # noqa
  2565. result = store.select('df', where=["A>np_zero"])
  2566. tm.assert_frame_equal(expected, result)
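# Substitution sketch: a bare name in a where string is looked up in the
# enclosing scope, including numpy scalars (GH 11283).
#   >>> np_zero = np.float64(0)
#   >>> store.select('df', where=["A>np_zero"])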
  2567. def test_select_with_many_inputs(self):
  2568. with ensure_clean_store(self.path) as store:
  2569. df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300),
  2570. A=np.random.randn(300),
  2571. B=range(300),
  2572. users=['a'] * 50 + ['b'] * 50 + ['c'] * 100 +
  2573. ['a%03d' % i for i in range(100)]))
  2574. _maybe_remove(store, 'df')
  2575. store.append('df', df, data_columns=['ts', 'A', 'B', 'users'])
  2576. # regular select
  2577. result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")])
  2578. expected = df[df.ts >= Timestamp('2012-02-01')]
  2579. tm.assert_frame_equal(expected, result)
  2580. # small selector
  2581. result = store.select(
  2582. 'df', [Term("ts>=Timestamp('2012-02-01') & "
  2583. "users=['a','b','c']")])
  2584. expected = df[(df.ts >= Timestamp('2012-02-01')) &
  2585. df.users.isin(['a', 'b', 'c'])]
  2586. tm.assert_frame_equal(expected, result)
  2587. # big selector along the columns
  2588. selector = ['a', 'b', 'c'] + ['a%03d' % i for i in range(60)]
  2589. result = store.select(
  2590. 'df', [Term("ts>=Timestamp('2012-02-01')"),
  2591. Term('users=selector')])
  2592. expected = df[(df.ts >= Timestamp('2012-02-01')) &
  2593. df.users.isin(selector)]
  2594. tm.assert_frame_equal(expected, result)
  2595. selector = range(100, 200)
  2596. result = store.select('df', [Term('B=selector')])
  2597. expected = df[df.B.isin(selector)]
  2598. tm.assert_frame_equal(expected, result)
  2599. self.assertEqual(len(result), 100)
  2600. # big selector along the index
  2601. selector = Index(df.ts[0:100].values)
  2602. result = store.select('df', [Term('ts=selector')])
  2603. expected = df[df.ts.isin(selector.values)]
  2604. tm.assert_frame_equal(expected, result)
  2605. self.assertEqual(len(result), 100)
  2606. def test_select_iterator(self):
  2607. # single table
  2608. with ensure_clean_store(self.path) as store:
  2609. df = tm.makeTimeDataFrame(500)
  2610. _maybe_remove(store, 'df')
  2611. store.append('df', df)
  2612. expected = store.select('df')
  2613. results = [s for s in store.select('df', iterator=True)]
  2614. result = concat(results)
  2615. tm.assert_frame_equal(expected, result)
  2616. results = [s for s in store.select('df', chunksize=100)]
  2617. self.assertEqual(len(results), 5)
  2618. result = concat(results)
  2619. tm.assert_frame_equal(expected, result)
  2620. results = [s for s in store.select('df', chunksize=150)]
  2621. result = concat(results)
  2622. tm.assert_frame_equal(result, expected)
  2623. with ensure_clean_path(self.path) as path:
  2624. df = tm.makeTimeDataFrame(500)
  2625. df.to_hdf(path, 'df_non_table')
  2626. self.assertRaises(TypeError, read_hdf, path,
  2627. 'df_non_table', chunksize=100)
  2628. self.assertRaises(TypeError, read_hdf, path,
  2629. 'df_non_table', iterator=True)
  2630. with ensure_clean_path(self.path) as path:
  2631. df = tm.makeTimeDataFrame(500)
  2632. df.to_hdf(path, 'df', format='table')
  2633. results = [s for s in read_hdf(path, 'df', chunksize=100)]
  2634. result = concat(results)
  2635. self.assertEqual(len(results), 5)
  2636. tm.assert_frame_equal(result, df)
  2637. tm.assert_frame_equal(result, read_hdf(path, 'df'))
  2638. # multiple
  2639. with ensure_clean_store(self.path) as store:
  2640. df1 = tm.makeTimeDataFrame(500)
  2641. store.append('df1', df1, data_columns=True)
  2642. df2 = tm.makeTimeDataFrame(500).rename(
  2643. columns=lambda x: "%s_2" % x)
  2644. df2['foo'] = 'bar'
  2645. store.append('df2', df2)
  2646. df = concat([df1, df2], axis=1)
  2647. # full selection
  2648. expected = store.select_as_multiple(
  2649. ['df1', 'df2'], selector='df1')
  2650. results = [s for s in store.select_as_multiple(
  2651. ['df1', 'df2'], selector='df1', chunksize=150)]
  2652. result = concat(results)
  2653. tm.assert_frame_equal(expected, result)
  2654. # where selection
  2655. # expected = store.select_as_multiple(
  2656. # ['df1', 'df2'], where= Term('A>0'), selector='df1')
  2657. # results = []
  2658. # for s in store.select_as_multiple(
  2659. # ['df1', 'df2'], where= Term('A>0'), selector='df1',
  2660. # chunksize=25):
  2661. # results.append(s)
  2662. # result = concat(results)
  2663. # tm.assert_frame_equal(expected, result)
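# Iterator sketch: iterator=True/chunksize= yield successive frames whose
# concat equals the full selection; both require format='table' on disk.
#   >>> concat(read_hdf(path, 'df', chunksize=100))
#   >>> concat(store.select('df', iterator=True))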
  2664. def test_select_iterator_complete_8014(self):
  2665. # GH 8014
  2666. # using iterator and where clause
  2667. chunksize = 1e4
  2668. # no iterator
  2669. with ensure_clean_store(self.path) as store:
  2670. expected = tm.makeTimeDataFrame(100064, 'S')
  2671. _maybe_remove(store, 'df')
  2672. store.append('df', expected)
  2673. beg_dt = expected.index[0]
  2674. end_dt = expected.index[-1]
  2675. # select w/o iteration and no where clause works
  2676. result = store.select('df')
  2677. tm.assert_frame_equal(expected, result)
  2678. # select w/o iterator and where clause, single term, begin
  2679. # of range, works
  2680. where = "index >= '%s'" % beg_dt
  2681. result = store.select('df', where=where)
  2682. tm.assert_frame_equal(expected, result)
  2683. # select w/o iterator and where clause, single term, end
  2684. # of range, works
  2685. where = "index <= '%s'" % end_dt
  2686. result = store.select('df', where=where)
  2687. tm.assert_frame_equal(expected, result)
  2688. # select w/o iterator and where clause, inclusive range,
  2689. # works
  2690. where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
  2691. result = store.select('df', where=where)
  2692. tm.assert_frame_equal(expected, result)
  2693. # with iterator, full range
  2694. with ensure_clean_store(self.path) as store:
  2695. expected = tm.makeTimeDataFrame(100064, 'S')
  2696. _maybe_remove(store, 'df')
  2697. store.append('df', expected)
  2698. beg_dt = expected.index[0]
  2699. end_dt = expected.index[-1]
  2700. # select w/iterator and no where clause works
  2701. results = [s for s in store.select('df', chunksize=chunksize)]
  2702. result = concat(results)
  2703. tm.assert_frame_equal(expected, result)
  2704. # select w/iterator and where clause, single term, begin of range
  2705. where = "index >= '%s'" % beg_dt
  2706. results = [s for s in store.select(
  2707. 'df', where=where, chunksize=chunksize)]
  2708. result = concat(results)
  2709. tm.assert_frame_equal(expected, result)
  2710. # select w/iterator and where clause, single term, end of range
  2711. where = "index <= '%s'" % end_dt
  2712. results = [s for s in store.select(
  2713. 'df', where=where, chunksize=chunksize)]
  2714. result = concat(results)
  2715. tm.assert_frame_equal(expected, result)
  2716. # select w/iterator and where clause, inclusive range
  2717. where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
  2718. results = [s for s in store.select(
  2719. 'df', where=where, chunksize=chunksize)]
  2720. result = concat(results)
  2721. tm.assert_frame_equal(expected, result)
  2722. def test_select_iterator_non_complete_8014(self):
  2723. # GH 8014
  2724. # using iterator and where clause
  2725. chunksize = 1e4
  2726. # with iterator, non complete range
  2727. with ensure_clean_store(self.path) as store:
  2728. expected = tm.makeTimeDataFrame(100064, 'S')
  2729. _maybe_remove(store, 'df')
  2730. store.append('df', expected)
  2731. beg_dt = expected.index[1]
  2732. end_dt = expected.index[-2]
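# both endpoints lie strictly inside the stored range, so each
# selection below is a proper subset of the full frame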
  2733. # select w/iterator and where clause, single term, begin of range
  2734. where = "index >= '%s'" % beg_dt
  2735. results = [s for s in store.select(
  2736. 'df', where=where, chunksize=chunksize)]
  2737. result = concat(results)
  2738. rexpected = expected[expected.index >= beg_dt]
  2739. tm.assert_frame_equal(rexpected, result)
  2740. # select w/iterator and where clause, single term, end of range
  2741. where = "index <= '%s'" % end_dt
  2742. results = [s for s in store.select(
  2743. 'df', where=where, chunksize=chunksize)]
  2744. result = concat(results)
  2745. rexpected = expected[expected.index <= end_dt]
  2746. tm.assert_frame_equal(rexpected, result)
  2747. # select w/iterator and where clause, inclusive range
  2748. where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
  2749. results = [s for s in store.select(
  2750. 'df', where=where, chunksize=chunksize)]
  2751. result = concat(results)
  2752. rexpected = expected[(expected.index >= beg_dt) &
  2753. (expected.index <= end_dt)]
  2754. tm.assert_frame_equal(rexpected, result)
  2755. # with iterator, empty where
  2756. with ensure_clean_store(self.path) as store:
  2757. expected = tm.makeTimeDataFrame(100064, 'S')
  2758. _maybe_remove(store, 'df')
  2759. store.append('df', expected)
  2760. end_dt = expected.index[-1]
2761. # select w/iterator and a where clause past the end of the range
  2762. where = "index > '%s'" % end_dt
  2763. results = [s for s in store.select(
  2764. 'df', where=where, chunksize=chunksize)]
  2765. self.assertEqual(0, len(results))
  2766. def test_select_iterator_many_empty_frames(self):
  2767. # GH 8014
  2768. # using iterator and where clause can return many empty
  2769. # frames.
  2770. chunksize = int(1e4)
  2771. # with iterator, range limited to the first chunk
  2772. with ensure_clean_store(self.path) as store:
  2773. expected = tm.makeTimeDataFrame(100000, 'S')
  2774. _maybe_remove(store, 'df')
  2775. store.append('df', expected)
  2776. beg_dt = expected.index[0]
  2777. end_dt = expected.index[chunksize - 1]
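# end_dt closes the first chunk, so selections bounded above by end_dt
# should come back as a single chunk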
  2778. # select w/iterator and where clause, single term, begin of range
  2779. where = "index >= '%s'" % beg_dt
  2780. results = [s for s in store.select(
  2781. 'df', where=where, chunksize=chunksize)]
  2782. result = concat(results)
  2783. rexpected = expected[expected.index >= beg_dt]
  2784. tm.assert_frame_equal(rexpected, result)
  2785. # select w/iterator and where clause, single term, end of range
  2786. where = "index <= '%s'" % end_dt
  2787. results = [s for s in store.select(
  2788. 'df', where=where, chunksize=chunksize)]
  2789. tm.assert_equal(1, len(results))
  2790. result = concat(results)
  2791. rexpected = expected[expected.index <= end_dt]
  2792. tm.assert_frame_equal(rexpected, result)
  2793. # select w/iterator and where clause, inclusive range
  2794. where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
  2795. results = [s for s in store.select(
  2796. 'df', where=where, chunksize=chunksize)]
2797. # should be exactly 1 chunk (GH 8014: previously returned 10)
  2798. tm.assert_equal(1, len(results))
  2799. result = concat(results)
  2800. rexpected = expected[(expected.index >= beg_dt) &
  2801. (expected.index <= end_dt)]
  2802. tm.assert_frame_equal(rexpected, result)
  2803. # select w/iterator and where clause which selects
  2804. # *nothing*.
  2805. #
2806. # To be consistent with Python idiom, this should return []:
2807. # iterating an empty sequence yields nothing, e.g.
2808. # `for e in []: print(True)` never prints True.
  2809. where = "index <= '%s' & index >= '%s'" % (beg_dt, end_dt)
  2810. results = [s for s in store.select(
  2811. 'df', where=where, chunksize=chunksize)]
  2812. # should be []
  2813. tm.assert_equal(0, len(results))
  2814. def test_retain_index_attributes(self):
  2815. # GH 3499, losing frequency info on index recreation
  2816. df = DataFrame(dict(
  2817. A=Series(lrange(3),
  2818. index=date_range('2000-1-1', periods=3, freq='H'))))
  2819. with ensure_clean_store(self.path) as store:
  2820. _maybe_remove(store, 'data')
  2821. store.put('data', df, format='table')
  2822. result = store.get('data')
  2823. tm.assert_frame_equal(df, result)
  2824. for attr in ['freq', 'tz', 'name']:
  2825. for idx in ['index', 'columns']:
  2826. self.assertEqual(getattr(getattr(df, idx), attr, None),
  2827. getattr(getattr(result, idx), attr, None))
  2828. # try to append a table with a different frequency
  2829. with tm.assert_produces_warning(
  2830. expected_warning=AttributeConflictWarning):
  2831. df2 = DataFrame(dict(
  2832. A=Series(lrange(3),
  2833. index=date_range('2002-1-1',
  2834. periods=3, freq='D'))))
  2835. store.append('data', df2)
  2836. self.assertIsNone(store.get_storer('data').info['index']['freq'])
  2837. # this is ok
  2838. _maybe_remove(store, 'df2')
  2839. df2 = DataFrame(dict(
  2840. A=Series(lrange(3),
  2841. index=[Timestamp('20010101'), Timestamp('20010102'),
  2842. Timestamp('20020101')])))
  2843. store.append('df2', df2)
  2844. df3 = DataFrame(dict(
  2845. A=Series(lrange(3),
  2846. index=date_range('2002-1-1', periods=3,
  2847. freq='D'))))
  2848. store.append('df2', df3)
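# illustrative sketch (assumptions noted inline): a compatible table
# round-trip preserves the index freq, e.g.:
#   store.put('data', df, format='table')
#   store.get('data').index.freq   # matches df.index.freq ('H' here)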
  2849. def test_retain_index_attributes2(self):
  2850. with ensure_clean_path(self.path) as path:
  2851. expected_warning = Warning if PY35 else AttributeConflictWarning
  2852. with tm.assert_produces_warning(expected_warning=expected_warning,
  2853. check_stacklevel=False):
  2854. df = DataFrame(dict(
  2855. A=Series(lrange(3),
  2856. index=date_range('2000-1-1',
  2857. periods=3, freq='H'))))
  2858. df.to_hdf(path, 'data', mode='w', append=True)
  2859. df2 = DataFrame(dict(
  2860. A=Series(lrange(3),
  2861. index=date_range('2002-1-1', periods=3,
  2862. freq='D'))))
  2863. df2.to_hdf(path, 'data', append=True)
  2864. idx = date_range('2000-1-1', periods=3, freq='H')
  2865. idx.name = 'foo'
  2866. df = DataFrame(dict(A=Series(lrange(3), index=idx)))
  2867. df.to_hdf(path, 'data', mode='w', append=True)
  2868. self.assertEqual(read_hdf(path, 'data').index.name, 'foo')
  2869. with tm.assert_produces_warning(expected_warning=expected_warning,
  2870. check_stacklevel=False):
  2871. idx2 = date_range('2001-1-1', periods=3, freq='H')
  2872. idx2.name = 'bar'
  2873. df2 = DataFrame(dict(A=Series(lrange(3), index=idx2)))
  2874. df2.to_hdf(path, 'data', append=True)
  2875. self.assertIsNone(read_hdf(path, 'data').index.name)
  2876. def test_panel_select(self):
  2877. wp = tm.makePanel()
  2878. with ensure_clean_store(self.path) as store:
  2879. store.put('wp', wp, format='table')
  2880. date = wp.major_axis[len(wp.major_axis) // 2]
  2881. crit1 = ('major_axis>=date')
  2882. crit2 = ("minor_axis=['A', 'D']")
  2883. result = store.select('wp', [crit1, crit2])
  2884. expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
  2885. assert_panel_equal(result, expected)
  2886. result = store.select(
  2887. 'wp', ['major_axis>="20000124"', ("minor_axis=['A', 'B']")])
  2888. expected = wp.truncate(before='20000124').reindex(minor=['A', 'B'])
  2889. assert_panel_equal(result, expected)
  2890. def test_frame_select(self):
  2891. df = tm.makeTimeDataFrame()
  2892. with ensure_clean_store(self.path) as store:
  2893. store.put('frame', df, format='table')
  2894. date = df.index[len(df) // 2]
  2895. crit1 = Term('index>=date')
  2896. self.assertEqual(crit1.env.scope['date'], date)
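# Term resolves the bare name `date` from the enclosing scope, which
# the assertion above verifies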
  2897. crit2 = ("columns=['A', 'D']")
  2898. crit3 = ('columns=A')
  2899. result = store.select('frame', [crit1, crit2])
  2900. expected = df.ix[date:, ['A', 'D']]
  2901. tm.assert_frame_equal(result, expected)
  2902. result = store.select('frame', [crit3])
  2903. expected = df.ix[:, ['A']]
  2904. tm.assert_frame_equal(result, expected)
  2905. # invalid terms
  2906. df = tm.makeTimeDataFrame()
  2907. store.append('df_time', df)
  2908. self.assertRaises(
  2909. ValueError, store.select, 'df_time', [Term("index>0")])
  2910. # can't select if not written as table
  2911. # store['frame'] = df
  2912. # self.assertRaises(ValueError, store.select,
  2913. # 'frame', [crit1, crit2])
  2914. def test_frame_select_complex(self):
  2915. # select via complex criteria
  2916. df = tm.makeTimeDataFrame()
  2917. df['string'] = 'foo'
  2918. df.loc[df.index[0:4], 'string'] = 'bar'
  2919. with ensure_clean_store(self.path) as store:
  2920. store.put('df', df, format='table', data_columns=['string'])
  2921. # empty
  2922. result = store.select('df', 'index>df.index[3] & string="bar"')
  2923. expected = df.loc[(df.index > df.index[3]) & (df.string == 'bar')]
  2924. tm.assert_frame_equal(result, expected)
  2925. result = store.select('df', 'index>df.index[3] & string="foo"')
  2926. expected = df.loc[(df.index > df.index[3]) & (df.string == 'foo')]
  2927. tm.assert_frame_equal(result, expected)
  2928. # or
  2929. result = store.select('df', 'index>df.index[3] | string="bar"')
  2930. expected = df.loc[(df.index > df.index[3]) | (df.string == 'bar')]
  2931. tm.assert_frame_equal(result, expected)
  2932. result = store.select('df', '(index>df.index[3] & '
  2933. 'index<=df.index[6]) | string="bar"')
  2934. expected = df.loc[((df.index > df.index[3]) & (
  2935. df.index <= df.index[6])) | (df.string == 'bar')]
  2936. tm.assert_frame_equal(result, expected)
  2937. # invert
  2938. result = store.select('df', 'string!="bar"')
  2939. expected = df.loc[df.string != 'bar']
  2940. tm.assert_frame_equal(result, expected)
  2941. # invert not implemented in numexpr :(
  2942. self.assertRaises(NotImplementedError,
  2943. store.select, 'df', '~(string="bar")')
  2944. # invert ok for filters
  2945. result = store.select('df', "~(columns=['A','B'])")
  2946. expected = df.loc[:, df.columns.difference(['A', 'B'])]
  2947. tm.assert_frame_equal(result, expected)
  2948. # in
  2949. result = store.select(
  2950. 'df', "index>df.index[3] & columns in ['A','B']")
  2951. expected = df.loc[df.index > df.index[3]].reindex(columns=[
  2952. 'A', 'B'])
  2953. tm.assert_frame_equal(result, expected)
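# in summary, the selections above exercise &, |, comparisons and
# `in` inside where clauses; ~ is only supported as a column filter,
# not on data-column expressions (a numexpr limitation)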
  2954. def test_frame_select_complex2(self):
  2955. with ensure_clean_path(['parms.hdf', 'hist.hdf']) as paths:
  2956. pp, hh = paths
  2957. # use non-trivial selection criteria
  2958. parms = DataFrame({'A': [1, 1, 2, 2, 3]})
  2959. parms.to_hdf(pp, 'df', mode='w',
  2960. format='table', data_columns=['A'])
  2961. selection = read_hdf(pp, 'df', where='A=[2,3]')
  2962. hist = DataFrame(np.random.randn(25, 1),
  2963. columns=['data'],
  2964. index=MultiIndex.from_tuples(
  2965. [(i, j) for i in range(5)
  2966. for j in range(5)],
  2967. names=['l1', 'l2']))
  2968. hist.to_hdf(hh, 'df', mode='w', format='table')
  2969. expected = read_hdf(hh, 'df', where=Term('l1', '=', [2, 3, 4]))
  2970. # list like
  2971. result = read_hdf(hh, 'df', where=Term(
  2972. 'l1', '=', selection.index.tolist()))
  2973. assert_frame_equal(result, expected)
  2974. l = selection.index.tolist() # noqa
2975. # scope with a list-like
  2976. store = HDFStore(hh)
  2977. result = store.select('df', where='l1=l')
  2978. assert_frame_equal(result, expected)
  2979. store.close()
  2980. result = read_hdf(hh, 'df', where='l1=l')
  2981. assert_frame_equal(result, expected)
  2982. # index
  2983. index = selection.index # noqa
  2984. result = read_hdf(hh, 'df', where='l1=index')
  2985. assert_frame_equal(result, expected)
  2986. result = read_hdf(hh, 'df', where='l1=selection.index')
  2987. assert_frame_equal(result, expected)
  2988. result = read_hdf(hh, 'df', where='l1=selection.index.tolist()')
  2989. assert_frame_equal(result, expected)
  2990. result = read_hdf(hh, 'df', where='l1=list(selection.index)')
  2991. assert_frame_equal(result, expected)
2992. # scope with an index
  2993. store = HDFStore(hh)
  2994. result = store.select('df', where='l1=index')
  2995. assert_frame_equal(result, expected)
  2996. result = store.select('df', where='l1=selection.index')
  2997. assert_frame_equal(result, expected)
  2998. result = store.select('df', where='l1=selection.index.tolist()')
  2999. assert_frame_equal(result, expected)
  3000. result = store.select('df', where='l1=list(selection.index)')
  3001. assert_frame_equal(result, expected)
  3002. store.close()
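# the where clauses above resolve local names (l, index, selection)
# from the calling scope, including attribute access and method calls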
  3003. def test_invalid_filtering(self):
  3004. # can't use more than one filter (atm)
  3005. df = tm.makeTimeDataFrame()
  3006. with ensure_clean_store(self.path) as store:
  3007. store.put('df', df, format='table')
  3008. # not implemented
  3009. self.assertRaises(NotImplementedError, store.select,
  3010. 'df', "columns=['A'] | columns=['B']")
  3011. # in theory we could deal with this
  3012. self.assertRaises(NotImplementedError, store.select,
  3013. 'df', "columns=['A','B'] & columns=['C']")
  3014. def test_string_select(self):
  3015. # GH 2973
  3016. with ensure_clean_store(self.path) as store:
  3017. df = tm.makeTimeDataFrame()
  3018. # test string ==/!=
  3019. df['x'] = 'none'
  3020. df.ix[2:7, 'x'] = ''
  3021. store.append('df', df, data_columns=['x'])
  3022. result = store.select('df', Term('x=none'))
  3023. expected = df[df.x == 'none']
  3024. assert_frame_equal(result, expected)
  3025. try:
  3026. result = store.select('df', Term('x!=none'))
  3027. expected = df[df.x != 'none']
  3028. assert_frame_equal(result, expected)
  3029. except Exception as detail:
  3030. pprint_thing("[{0}]".format(detail))
  3031. pprint_thing(store)
  3032. pprint_thing(expected)
  3033. df2 = df.copy()
  3034. df2.loc[df2.x == '', 'x'] = np.nan
  3035. store.append('df2', df2, data_columns=['x'])
  3036. result = store.select('df2', Term('x!=none'))
  3037. expected = df2[isnull(df2.x)]
  3038. assert_frame_equal(result, expected)
  3039. # int ==/!=
  3040. df['int'] = 1
  3041. df.ix[2:7, 'int'] = 2
  3042. store.append('df3', df, data_columns=['int'])
  3043. result = store.select('df3', Term('int=2'))
  3044. expected = df[df.int == 2]
  3045. assert_frame_equal(result, expected)
  3046. result = store.select('df3', Term('int!=2'))
  3047. expected = df[df.int != 2]
  3048. assert_frame_equal(result, expected)
  3049. def test_read_column(self):
  3050. df = tm.makeTimeDataFrame()
  3051. with ensure_clean_store(self.path) as store:
  3052. _maybe_remove(store, 'df')
  3053. store.append('df', df)
  3054. # error
  3055. self.assertRaises(KeyError, store.select_column, 'df', 'foo')
  3056. def f():
  3057. store.select_column('df', 'index', where=['index>5'])
  3058. self.assertRaises(Exception, f)
  3059. # valid
  3060. result = store.select_column('df', 'index')
  3061. tm.assert_almost_equal(result.values, Series(df.index).values)
  3062. self.assertIsInstance(result, Series)
  3063. # not a data indexable column
  3064. self.assertRaises(
  3065. ValueError, store.select_column, 'df', 'values_block_0')
  3066. # a data column
  3067. df2 = df.copy()
  3068. df2['string'] = 'foo'
  3069. store.append('df2', df2, data_columns=['string'])
  3070. result = store.select_column('df2', 'string')
  3071. tm.assert_almost_equal(result.values, df2['string'].values)
  3072. # a data column with NaNs, result excludes the NaNs
  3073. df3 = df.copy()
  3074. df3['string'] = 'foo'
  3075. df3.ix[4:6, 'string'] = np.nan
  3076. store.append('df3', df3, data_columns=['string'])
  3077. result = store.select_column('df3', 'string')
  3078. tm.assert_almost_equal(result.values, df3['string'].values)
  3079. # start/stop
  3080. result = store.select_column('df3', 'string', start=2)
  3081. tm.assert_almost_equal(result.values, df3['string'].values[2:])
  3082. result = store.select_column('df3', 'string', start=-2)
  3083. tm.assert_almost_equal(result.values, df3['string'].values[-2:])
  3084. result = store.select_column('df3', 'string', stop=2)
  3085. tm.assert_almost_equal(result.values, df3['string'].values[:2])
  3086. result = store.select_column('df3', 'string', stop=-2)
  3087. tm.assert_almost_equal(result.values, df3['string'].values[:-2])
  3088. result = store.select_column('df3', 'string', start=2, stop=-2)
  3089. tm.assert_almost_equal(result.values, df3['string'].values[2:-2])
  3090. result = store.select_column('df3', 'string', start=-2, stop=2)
  3091. tm.assert_almost_equal(result.values, df3['string'].values[-2:2])
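# negative start/stop follow Python slice semantics: values[-2:2] is
# empty here, so an empty selection is expected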
  3092. # GH 10392 - make sure column name is preserved
  3093. df4 = DataFrame({'A': np.random.randn(10), 'B': 'foo'})
  3094. store.append('df4', df4, data_columns=True)
  3095. expected = df4['B']
  3096. result = store.select_column('df4', 'B')
  3097. tm.assert_series_equal(result, expected)
  3098. def test_coordinates(self):
  3099. df = tm.makeTimeDataFrame()
  3100. with ensure_clean_store(self.path) as store:
  3101. _maybe_remove(store, 'df')
  3102. store.append('df', df)
  3103. # all
  3104. c = store.select_as_coordinates('df')
  3105. assert((c.values == np.arange(len(df.index))).all())
  3106. # get coordinates back & test vs frame
  3107. _maybe_remove(store, 'df')
  3108. df = DataFrame(dict(A=lrange(5), B=lrange(5)))
  3109. store.append('df', df)
  3110. c = store.select_as_coordinates('df', ['index<3'])
  3111. assert((c.values == np.arange(3)).all())
  3112. result = store.select('df', where=c)
  3113. expected = df.ix[0:2, :]
  3114. tm.assert_frame_equal(result, expected)
  3115. c = store.select_as_coordinates('df', ['index>=3', 'index<=4'])
  3116. assert((c.values == np.arange(2) + 3).all())
  3117. result = store.select('df', where=c)
  3118. expected = df.ix[3:4, :]
  3119. tm.assert_frame_equal(result, expected)
  3120. self.assertIsInstance(c, Index)
  3121. # multiple tables
  3122. _maybe_remove(store, 'df1')
  3123. _maybe_remove(store, 'df2')
  3124. df1 = tm.makeTimeDataFrame()
  3125. df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
  3126. store.append('df1', df1, data_columns=['A', 'B'])
  3127. store.append('df2', df2)
  3128. c = store.select_as_coordinates('df1', ['A>0', 'B>0'])
  3129. df1_result = store.select('df1', c)
  3130. df2_result = store.select('df2', c)
  3131. result = concat([df1_result, df2_result], axis=1)
  3132. expected = concat([df1, df2], axis=1)
  3133. expected = expected[(expected.A > 0) & (expected.B > 0)]
  3134. tm.assert_frame_equal(result, expected)
  3135. # pass array/mask as the coordinates
  3136. with ensure_clean_store(self.path) as store:
  3137. df = DataFrame(np.random.randn(1000, 2),
  3138. index=date_range('20000101', periods=1000))
  3139. store.append('df', df)
  3140. c = store.select_column('df', 'index')
  3141. where = c[DatetimeIndex(c).month == 5].index
  3142. expected = df.iloc[where]
  3143. # locations
  3144. result = store.select('df', where=where)
  3145. tm.assert_frame_equal(result, expected)
  3146. # boolean
  3147. result = store.select('df', where=where)
  3148. tm.assert_frame_equal(result, expected)
  3149. # invalid
  3150. self.assertRaises(ValueError, store.select, 'df',
  3151. where=np.arange(len(df), dtype='float64'))
  3152. self.assertRaises(ValueError, store.select, 'df',
  3153. where=np.arange(len(df) + 1))
  3154. self.assertRaises(ValueError, store.select, 'df',
  3155. where=np.arange(len(df)), start=5)
  3156. self.assertRaises(ValueError, store.select, 'df',
  3157. where=np.arange(len(df)), start=5, stop=10)
  3158. # selection with filter
  3159. selection = date_range('20000101', periods=500)
  3160. result = store.select('df', where='index in selection')
  3161. expected = df[df.index.isin(selection)]
  3162. tm.assert_frame_equal(result, expected)
  3163. # list
  3164. df = DataFrame(np.random.randn(10, 2))
  3165. store.append('df2', df)
  3166. result = store.select('df2', where=[0, 3, 5])
  3167. expected = df.iloc[[0, 3, 5]]
  3168. tm.assert_frame_equal(result, expected)
  3169. # boolean
  3170. where = [True] * 10
  3171. where[-2] = False
  3172. result = store.select('df2', where=where)
  3173. expected = df.loc[where]
  3174. tm.assert_frame_equal(result, expected)
  3175. # start/stop
  3176. result = store.select('df2', start=5, stop=10)
  3177. expected = df[5:10]
  3178. tm.assert_frame_equal(result, expected)
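# taken together, the cases above cover every accepted coordinate form
# for where: a selected Index, a list of row locations, a boolean mask,
# and plain start/stop bounds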
  3179. def test_append_to_multiple(self):
  3180. df1 = tm.makeTimeDataFrame()
  3181. df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
  3182. df2['foo'] = 'bar'
  3183. df = concat([df1, df2], axis=1)
  3184. with ensure_clean_store(self.path) as store:
  3185. # exceptions
  3186. self.assertRaises(ValueError, store.append_to_multiple,
  3187. {'df1': ['A', 'B'], 'df2': None}, df,
  3188. selector='df3')
  3189. self.assertRaises(ValueError, store.append_to_multiple,
  3190. {'df1': None, 'df2': None}, df, selector='df3')
  3191. self.assertRaises(
  3192. ValueError, store.append_to_multiple, 'df1', df, 'df1')
  3193. # regular operation
  3194. store.append_to_multiple(
  3195. {'df1': ['A', 'B'], 'df2': None}, df, selector='df1')
  3196. result = store.select_as_multiple(
  3197. ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1')
  3198. expected = df[(df.A > 0) & (df.B > 0)]
  3199. tm.assert_frame_equal(result, expected)
  3200. def test_append_to_multiple_dropna(self):
  3201. df1 = tm.makeTimeDataFrame()
  3202. df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
  3203. df1.ix[1, ['A', 'B']] = np.nan
  3204. df = concat([df1, df2], axis=1)
  3205. with ensure_clean_store(self.path) as store:
  3206. # dropna=True should guarantee rows are synchronized
  3207. store.append_to_multiple(
  3208. {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
  3209. dropna=True)
  3210. result = store.select_as_multiple(['df1', 'df2'])
  3211. expected = df.dropna()
  3212. tm.assert_frame_equal(result, expected)
  3213. tm.assert_index_equal(store.select('df1').index,
  3214. store.select('df2').index)
  3215. # dropna=False shouldn't synchronize row indexes
  3216. store.append_to_multiple(
  3217. {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
  3218. dropna=False)
  3219. self.assertRaises(
  3220. ValueError, store.select_as_multiple, ['df1', 'df2'])
  3221. assert not store.select('df1').index.equals(
  3222. store.select('df2').index)
  3223. def test_select_as_multiple(self):
  3224. df1 = tm.makeTimeDataFrame()
  3225. df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
  3226. df2['foo'] = 'bar'
  3227. with ensure_clean_store(self.path) as store:
  3228. # no tables stored
  3229. self.assertRaises(Exception, store.select_as_multiple,
  3230. None, where=['A>0', 'B>0'], selector='df1')
  3231. store.append('df1', df1, data_columns=['A', 'B'])
  3232. store.append('df2', df2)
  3233. # exceptions
  3234. self.assertRaises(Exception, store.select_as_multiple,
  3235. None, where=['A>0', 'B>0'], selector='df1')
  3236. self.assertRaises(Exception, store.select_as_multiple,
  3237. [None], where=['A>0', 'B>0'], selector='df1')
  3238. self.assertRaises(KeyError, store.select_as_multiple,
  3239. ['df1', 'df3'], where=['A>0', 'B>0'],
  3240. selector='df1')
  3241. self.assertRaises(KeyError, store.select_as_multiple,
  3242. ['df3'], where=['A>0', 'B>0'], selector='df1')
  3243. self.assertRaises(KeyError, store.select_as_multiple,
  3244. ['df1', 'df2'], where=['A>0', 'B>0'],
  3245. selector='df4')
  3246. # default select
  3247. result = store.select('df1', ['A>0', 'B>0'])
  3248. expected = store.select_as_multiple(
  3249. ['df1'], where=['A>0', 'B>0'], selector='df1')
  3250. tm.assert_frame_equal(result, expected)
  3251. expected = store.select_as_multiple(
  3252. 'df1', where=['A>0', 'B>0'], selector='df1')
  3253. tm.assert_frame_equal(result, expected)
  3254. # multiple
  3255. result = store.select_as_multiple(
  3256. ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1')
  3257. expected = concat([df1, df2], axis=1)
  3258. expected = expected[(expected.A > 0) & (expected.B > 0)]
  3259. tm.assert_frame_equal(result, expected)
  3260. # multiple (diff selector)
  3261. result = store.select_as_multiple(['df1', 'df2'], where=[Term(
  3262. 'index>df2.index[4]')], selector='df2')
  3263. expected = concat([df1, df2], axis=1)
  3264. expected = expected[5:]
  3265. tm.assert_frame_equal(result, expected)
3266. # test exception for differing numbers of rows
  3267. store.append('df3', tm.makeTimeDataFrame(nper=50))
  3268. self.assertRaises(ValueError, store.select_as_multiple,
  3269. ['df1', 'df3'], where=['A>0', 'B>0'],
  3270. selector='df1')
  3271. def test_nan_selection_bug_4858(self):
  3272. # GH 4858; nan selection bug, only works for pytables >= 3.1
  3273. if LooseVersion(tables.__version__) < '3.1.0':
  3274. raise nose.SkipTest('tables version does not support fix for nan '
  3275. 'selection bug: GH 4858')
  3276. with ensure_clean_store(self.path) as store:
  3277. df = DataFrame(dict(cols=range(6), values=range(6)),
  3278. dtype='float64')
  3279. df['cols'] = (df['cols'] + 10).apply(str)
  3280. df.iloc[0] = np.nan
  3281. expected = DataFrame(dict(cols=['13.0', '14.0', '15.0'], values=[
  3282. 3., 4., 5.]), index=[3, 4, 5])
  3283. # write w/o the index on that particular column
  3284. store.append('df', df, data_columns=True, index=['cols'])
  3285. result = store.select('df', where='values>2.0')
  3286. assert_frame_equal(result, expected)
  3287. def test_start_stop_table(self):
  3288. with ensure_clean_store(self.path) as store:
  3289. # table
  3290. df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
  3291. store.append('df', df)
  3292. result = store.select(
  3293. 'df', [Term("columns=['A']")], start=0, stop=5)
  3294. expected = df.ix[0:4, ['A']]
  3295. tm.assert_frame_equal(result, expected)
  3296. # out of range
  3297. result = store.select(
  3298. 'df', [Term("columns=['A']")], start=30, stop=40)
  3299. self.assertTrue(len(result) == 0)
  3300. expected = df.ix[30:40, ['A']]
  3301. tm.assert_frame_equal(result, expected)
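# an out-of-range start/stop on a table yields an empty frame that
# still carries the requested columns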
  3302. def test_start_stop_fixed(self):
  3303. with ensure_clean_store(self.path) as store:
  3304. # fixed, GH 8287
  3305. df = DataFrame(dict(A=np.random.rand(20),
  3306. B=np.random.rand(20)),
  3307. index=pd.date_range('20130101', periods=20))
  3308. store.put('df', df)
  3309. result = store.select(
  3310. 'df', start=0, stop=5)
  3311. expected = df.iloc[0:5, :]
  3312. tm.assert_frame_equal(result, expected)
  3313. result = store.select(
  3314. 'df', start=5, stop=10)
  3315. expected = df.iloc[5:10, :]
  3316. tm.assert_frame_equal(result, expected)
  3317. # out of range
  3318. result = store.select(
  3319. 'df', start=30, stop=40)
  3320. expected = df.iloc[30:40, :]
  3321. tm.assert_frame_equal(result, expected)
  3322. # series
  3323. s = df.A
  3324. store.put('s', s)
  3325. result = store.select('s', start=0, stop=5)
  3326. expected = s.iloc[0:5]
  3327. tm.assert_series_equal(result, expected)
  3328. result = store.select('s', start=5, stop=10)
  3329. expected = s.iloc[5:10]
  3330. tm.assert_series_equal(result, expected)
  3331. # sparse; not implemented
  3332. df = tm.makeDataFrame()
  3333. df.ix[3:5, 1:3] = np.nan
  3334. df.ix[8:10, -2] = np.nan
  3335. dfs = df.to_sparse()
  3336. store.put('dfs', dfs)
  3337. with self.assertRaises(NotImplementedError):
  3338. store.select('dfs', start=0, stop=5)
  3339. def test_select_filter_corner(self):
  3340. df = DataFrame(np.random.randn(50, 100))
  3341. df.index = ['%.3d' % c for c in df.index]
  3342. df.columns = ['%.3d' % c for c in df.columns]
  3343. with ensure_clean_store(self.path) as store:
  3344. store.put('frame', df, format='table')
  3345. crit = Term('columns=df.columns[:75]')
  3346. result = store.select('frame', [crit])
  3347. tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])
  3348. crit = Term('columns=df.columns[:75:2]')
  3349. result = store.select('frame', [crit])
  3350. tm.assert_frame_equal(result, df.ix[:, df.columns[:75:2]])
  3351. def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
  3352. options = {}
  3353. if compression:
  3354. options['complib'] = _default_compressor
  3355. with ensure_clean_store(self.path, 'w', **options) as store:
  3356. store['obj'] = obj
  3357. retrieved = store['obj']
  3358. comparator(retrieved, obj, **kwargs)
  3359. def _check_double_roundtrip(self, obj, comparator, compression=False,
  3360. **kwargs):
  3361. options = {}
  3362. if compression:
  3363. options['complib'] = compression or _default_compressor
  3364. with ensure_clean_store(self.path, 'w', **options) as store:
  3365. store['obj'] = obj
  3366. retrieved = store['obj']
  3367. comparator(retrieved, obj, **kwargs)
  3368. store['obj'] = retrieved
  3369. again = store['obj']
  3370. comparator(again, obj, **kwargs)
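# the second round-trip re-stores what was just read back, guarding
# against lossy write-read cycles that a single round-trip would miss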
  3371. def _check_roundtrip_table(self, obj, comparator, compression=False):
  3372. options = {}
  3373. if compression:
  3374. options['complib'] = _default_compressor
  3375. with ensure_clean_store(self.path, 'w', **options) as store:
  3376. store.put('obj', obj, format='table')
  3377. retrieved = store['obj']
  3378. # sorted_obj = _test_sort(obj)
  3379. comparator(retrieved, obj)
  3380. def test_multiple_open_close(self):
  3381. # GH 4409, open & close multiple times
  3382. with ensure_clean_path(self.path) as path:
  3383. df = tm.makeDataFrame()
  3384. df.to_hdf(path, 'df', mode='w', format='table')
  3385. # single
  3386. store = HDFStore(path)
  3387. self.assertNotIn('CLOSED', str(store))
  3388. self.assertTrue(store.is_open)
  3389. store.close()
  3390. self.assertIn('CLOSED', str(store))
  3391. self.assertFalse(store.is_open)
  3392. with ensure_clean_path(self.path) as path:
  3393. if pytables._table_file_open_policy_is_strict:
  3394. # multiples
  3395. store1 = HDFStore(path)
  3396. def f():
  3397. HDFStore(path)
  3398. self.assertRaises(ValueError, f)
  3399. store1.close()
  3400. else:
  3401. # multiples
  3402. store1 = HDFStore(path)
  3403. store2 = HDFStore(path)
  3404. self.assertNotIn('CLOSED', str(store1))
  3405. self.assertNotIn('CLOSED', str(store2))
  3406. self.assertTrue(store1.is_open)
  3407. self.assertTrue(store2.is_open)
  3408. store1.close()
  3409. self.assertIn('CLOSED', str(store1))
  3410. self.assertFalse(store1.is_open)
  3411. self.assertNotIn('CLOSED', str(store2))
  3412. self.assertTrue(store2.is_open)
  3413. store2.close()
  3414. self.assertIn('CLOSED', str(store1))
  3415. self.assertIn('CLOSED', str(store2))
  3416. self.assertFalse(store1.is_open)
  3417. self.assertFalse(store2.is_open)
  3418. # nested close
  3419. store = HDFStore(path, mode='w')
  3420. store.append('df', df)
  3421. store2 = HDFStore(path)
  3422. store2.append('df2', df)
  3423. store2.close()
  3424. self.assertIn('CLOSED', str(store2))
  3425. self.assertFalse(store2.is_open)
  3426. store.close()
  3427. self.assertIn('CLOSED', str(store))
  3428. self.assertFalse(store.is_open)
  3429. # double closing
  3430. store = HDFStore(path, mode='w')
  3431. store.append('df', df)
  3432. store2 = HDFStore(path)
  3433. store.close()
  3434. self.assertIn('CLOSED', str(store))
  3435. self.assertFalse(store.is_open)
  3436. store2.close()
  3437. self.assertIn('CLOSED', str(store2))
  3438. self.assertFalse(store2.is_open)
  3439. # ops on a closed store
  3440. with ensure_clean_path(self.path) as path:
  3441. df = tm.makeDataFrame()
  3442. df.to_hdf(path, 'df', mode='w', format='table')
  3443. store = HDFStore(path)
  3444. store.close()
  3445. self.assertRaises(ClosedFileError, store.keys)
  3446. self.assertRaises(ClosedFileError, lambda: 'df' in store)
  3447. self.assertRaises(ClosedFileError, lambda: len(store))
  3448. self.assertRaises(ClosedFileError, lambda: store['df'])
  3449. self.assertRaises(ClosedFileError, lambda: store.df)
  3450. self.assertRaises(ClosedFileError, store.select, 'df')
  3451. self.assertRaises(ClosedFileError, store.get, 'df')
  3452. self.assertRaises(ClosedFileError, store.append, 'df2', df)
  3453. self.assertRaises(ClosedFileError, store.put, 'df3', df)
  3454. self.assertRaises(ClosedFileError, store.get_storer, 'df2')
  3455. self.assertRaises(ClosedFileError, store.remove, 'df2')
  3456. def f():
  3457. store.select('df')
  3458. tm.assertRaisesRegexp(ClosedFileError, 'file is not open', f)
  3459. def test_pytables_native_read(self):
  3460. with ensure_clean_store(
  3461. tm.get_data_path('legacy_hdf/pytables_native.h5'),
  3462. mode='r') as store:
  3463. d2 = store['detector/readout']
  3464. self.assertIsInstance(d2, DataFrame)
  3465. def test_pytables_native2_read(self):
  3466. # fails on win/3.5 oddly
  3467. if PY35 and is_platform_windows():
  3468. raise nose.SkipTest("native2 read fails oddly on windows / 3.5")
  3469. with ensure_clean_store(
  3470. tm.get_data_path('legacy_hdf/pytables_native2.h5'),
  3471. mode='r') as store:
  3472. str(store)
  3473. d1 = store['detector']
  3474. self.assertIsInstance(d1, DataFrame)
  3475. def test_legacy_read(self):
  3476. with ensure_clean_store(
  3477. tm.get_data_path('legacy_hdf/legacy.h5'),
  3478. mode='r') as store:
  3479. store['a']
  3480. store['b']
  3481. store['c']
  3482. store['d']
  3483. def test_legacy_table_read(self):
  3484. # legacy table types
  3485. with ensure_clean_store(
  3486. tm.get_data_path('legacy_hdf/legacy_table.h5'),
  3487. mode='r') as store:
  3488. store.select('df1')
  3489. store.select('df2')
  3490. store.select('wp1')
  3491. # force the frame
  3492. store.select('df2', typ='legacy_frame')
  3493. # old version warning
  3494. with tm.assert_produces_warning(
  3495. expected_warning=IncompatibilityWarning):
  3496. self.assertRaises(
  3497. Exception, store.select, 'wp1', Term('minor_axis=B'))
  3498. df2 = store.select('df2')
  3499. result = store.select('df2', Term('index>df2.index[2]'))
  3500. expected = df2[df2.index > df2.index[2]]
  3501. assert_frame_equal(expected, result)
  3502. def test_legacy_0_10_read(self):
  3503. # legacy from 0.10
  3504. with compat_assert_produces_warning(FutureWarning):
  3505. path = tm.get_data_path('legacy_hdf/legacy_0.10.h5')
  3506. with ensure_clean_store(path, mode='r') as store:
  3507. str(store)
  3508. for k in store.keys():
  3509. store.select(k)
  3510. def test_legacy_0_11_read(self):
  3511. # legacy from 0.11
  3512. path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5')
  3513. with ensure_clean_store(tm.get_data_path(path), mode='r') as store:
  3514. str(store)
  3515. assert 'df' in store
  3516. assert 'df1' in store
  3517. assert 'mi' in store
  3518. df = store.select('df')
  3519. df1 = store.select('df1')
  3520. mi = store.select('mi')
  3521. assert isinstance(df, DataFrame)
  3522. assert isinstance(df1, DataFrame)
  3523. assert isinstance(mi, DataFrame)
  3524. def test_copy(self):
  3525. with compat_assert_produces_warning(FutureWarning):
  3526. def do_copy(f=None, new_f=None, keys=None,
  3527. propindexes=True, **kwargs):
  3528. try:
  3529. if f is None:
  3530. f = tm.get_data_path(os.path.join('legacy_hdf',
  3531. 'legacy_0.10.h5'))
  3532. store = HDFStore(f, 'r')
  3533. if new_f is None:
  3534. import tempfile
  3535. fd, new_f = tempfile.mkstemp()
  3536. tstore = store.copy(
  3537. new_f, keys=keys, propindexes=propindexes, **kwargs)
  3538. # check keys
  3539. if keys is None:
  3540. keys = store.keys()
  3541. self.assertEqual(set(keys), set(tstore.keys()))
3542. # check indices & nrows
  3543. for k in tstore.keys():
  3544. if tstore.get_storer(k).is_table:
  3545. new_t = tstore.get_storer(k)
  3546. orig_t = store.get_storer(k)
  3547. self.assertEqual(orig_t.nrows, new_t.nrows)
3548. # check propindexes
  3549. if propindexes:
  3550. for a in orig_t.axes:
  3551. if a.is_indexed:
  3552. self.assertTrue(
  3553. new_t[a.name].is_indexed)
  3554. finally:
  3555. safe_close(store)
  3556. safe_close(tstore)
  3557. try:
  3558. os.close(fd)
  3559. except:
  3560. pass
  3561. safe_remove(new_f)
  3562. do_copy()
  3563. do_copy(keys=['/a', '/b', '/df1_mixed'])
  3564. do_copy(propindexes=False)
  3565. # new table
  3566. df = tm.makeDataFrame()
  3567. try:
  3568. path = create_tempfile(self.path)
  3569. st = HDFStore(path)
  3570. st.append('df', df, data_columns=['A'])
  3571. st.close()
  3572. do_copy(f=path)
  3573. do_copy(f=path, propindexes=False)
  3574. finally:
  3575. safe_remove(path)
  3576. def test_legacy_table_write(self):
  3577. raise nose.SkipTest("cannot write legacy tables")
  3578. store = HDFStore(tm.get_data_path(
  3579. 'legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a')
  3580. df = tm.makeDataFrame()
  3581. wp = tm.makePanel()
  3582. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
  3583. ['one', 'two', 'three']],
  3584. labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  3585. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  3586. names=['foo', 'bar'])
  3587. df = DataFrame(np.random.randn(10, 3), index=index,
  3588. columns=['A', 'B', 'C'])
  3589. store.append('mi', df)
  3590. df = DataFrame(dict(A='foo', B='bar'), index=lrange(10))
  3591. store.append('df', df, data_columns=['B'], min_itemsize={'A': 200})
  3592. store.append('wp', wp)
  3593. store.close()
  3594. def test_store_datetime_fractional_secs(self):
  3595. with ensure_clean_store(self.path) as store:
  3596. dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456)
  3597. series = Series([0], [dt])
  3598. store['a'] = series
  3599. self.assertEqual(store['a'].index[0], dt)
  3600. def test_tseries_indices_series(self):
  3601. with ensure_clean_store(self.path) as store:
  3602. idx = tm.makeDateIndex(10)
  3603. ser = Series(np.random.randn(len(idx)), idx)
  3604. store['a'] = ser
  3605. result = store['a']
  3606. assert_series_equal(result, ser)
  3607. self.assertEqual(type(result.index), type(ser.index))
  3608. self.assertEqual(result.index.freq, ser.index.freq)
  3609. idx = tm.makePeriodIndex(10)
  3610. ser = Series(np.random.randn(len(idx)), idx)
  3611. store['a'] = ser
  3612. result = store['a']
  3613. assert_series_equal(result, ser)
  3614. self.assertEqual(type(result.index), type(ser.index))
  3615. self.assertEqual(result.index.freq, ser.index.freq)
  3616. def test_tseries_indices_frame(self):
  3617. with ensure_clean_store(self.path) as store:
  3618. idx = tm.makeDateIndex(10)
  3619. df = DataFrame(np.random.randn(len(idx), 3), index=idx)
  3620. store['a'] = df
  3621. result = store['a']
  3622. assert_frame_equal(result, df)
  3623. self.assertEqual(type(result.index), type(df.index))
  3624. self.assertEqual(result.index.freq, df.index.freq)
  3625. idx = tm.makePeriodIndex(10)
  3626. df = DataFrame(np.random.randn(len(idx), 3), idx)
  3627. store['a'] = df
  3628. result = store['a']
  3629. assert_frame_equal(result, df)
  3630. self.assertEqual(type(result.index), type(df.index))
  3631. self.assertEqual(result.index.freq, df.index.freq)
  3632. def test_unicode_index(self):
  3633. unicode_values = [u('\u03c3'), u('\u03c3\u03c3')]
  3634. with compat_assert_produces_warning(PerformanceWarning):
  3635. s = Series(np.random.randn(len(unicode_values)), unicode_values)
  3636. self._check_roundtrip(s, tm.assert_series_equal)
  3637. def test_unicode_longer_encoded(self):
  3638. # GH 11234
  3639. char = '\u0394'
  3640. df = pd.DataFrame({'A': [char]})
  3641. with ensure_clean_store(self.path) as store:
  3642. store.put('df', df, format='table', encoding='utf-8')
  3643. result = store.get('df')
  3644. tm.assert_frame_equal(result, df)
  3645. df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']})
  3646. with ensure_clean_store(self.path) as store:
  3647. store.put('df', df, format='table', encoding='utf-8')
  3648. result = store.get('df')
  3649. tm.assert_frame_equal(result, df)
  3650. def test_store_datetime_mixed(self):
  3651. df = DataFrame(
  3652. {'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['a', 'b', 'c']})
  3653. ts = tm.makeTimeSeries()
  3654. df['d'] = ts.index[:3]
  3655. self._check_roundtrip(df, tm.assert_frame_equal)
  3656. # def test_cant_write_multiindex_table(self):
  3657. # # for now, #1848
  3658. # df = DataFrame(np.random.randn(10, 4),
  3659. # index=[np.arange(5).repeat(2),
  3660. # np.tile(np.arange(2), 5)])
  3661. # self.assertRaises(Exception, store.put, 'foo', df, format='table')
  3662. def test_append_with_diff_col_name_types_raises_value_error(self):
  3663. df = DataFrame(np.random.randn(10, 1))
  3664. df2 = DataFrame({'a': np.random.randn(10)})
  3665. df3 = DataFrame({(1, 2): np.random.randn(10)})
  3666. df4 = DataFrame({('1', 2): np.random.randn(10)})
  3667. df5 = DataFrame({('1', 2, object): np.random.randn(10)})
  3668. with ensure_clean_store(self.path) as store:
  3669. name = 'df_%s' % tm.rands(10)
  3670. store.append(name, df)
  3671. for d in (df2, df3, df4, df5):
  3672. with tm.assertRaises(ValueError):
  3673. store.append(name, d)
  3674. def test_query_with_nested_special_character(self):
  3675. df = DataFrame({'a': ['a', 'a', 'c', 'b',
  3676. 'test & test', 'c', 'b', 'e'],
  3677. 'b': [1, 2, 3, 4, 5, 6, 7, 8]})
  3678. expected = df[df.a == 'test & test']
  3679. with ensure_clean_store(self.path) as store:
  3680. store.append('test', df, format='table', data_columns=True)
  3681. result = store.select('test', 'a = "test & test"')
  3682. tm.assert_frame_equal(expected, result)
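# the '&' inside the quoted literal is matched as data, not parsed as
# the boolean operator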
  3683. def test_categorical(self):
  3684. with ensure_clean_store(self.path) as store:
  3685. # basic
  3686. _maybe_remove(store, 's')
  3687. s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[
  3688. 'a', 'b', 'c', 'd'], ordered=False))
  3689. store.append('s', s, format='table')
  3690. result = store.select('s')
  3691. tm.assert_series_equal(s, result)
  3692. _maybe_remove(store, 's_ordered')
  3693. s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[
  3694. 'a', 'b', 'c', 'd'], ordered=True))
  3695. store.append('s_ordered', s, format='table')
  3696. result = store.select('s_ordered')
  3697. tm.assert_series_equal(s, result)
  3698. _maybe_remove(store, 'df')
  3699. df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
  3700. store.append('df', df, format='table')
  3701. result = store.select('df')
  3702. tm.assert_frame_equal(result, df)
  3703. # dtypes
  3704. s = Series([1, 1, 2, 2, 3, 4, 5]).astype('category')
  3705. store.append('si', s)
  3706. result = store.select('si')
  3707. tm.assert_series_equal(result, s)
  3708. s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype('category')
  3709. store.append('si2', s)
  3710. result = store.select('si2')
  3711. tm.assert_series_equal(result, s)
  3712. # multiple
  3713. df2 = df.copy()
  3714. df2['s2'] = Series(list('abcdefg')).astype('category')
  3715. store.append('df2', df2)
  3716. result = store.select('df2')
  3717. tm.assert_frame_equal(result, df2)
  3718. # make sure the metadata is ok
  3719. self.assertTrue('/df2 ' in str(store))
  3720. self.assertTrue('/df2/meta/values_block_0/meta' in str(store))
  3721. self.assertTrue('/df2/meta/values_block_1/meta' in str(store))
  3722. # unordered
  3723. s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[
  3724. 'a', 'b', 'c', 'd'], ordered=False))
  3725. store.append('s2', s, format='table')
  3726. result = store.select('s2')
  3727. tm.assert_series_equal(result, s)
  3728. # query
  3729. store.append('df3', df, data_columns=['s'])
  3730. expected = df[df.s.isin(['b', 'c'])]
  3731. result = store.select('df3', where=['s in ["b","c"]'])
  3732. tm.assert_frame_equal(result, expected)
  3733. expected = df[df.s.isin(['b', 'c'])]
  3734. result = store.select('df3', where=['s = ["b","c"]'])
  3735. tm.assert_frame_equal(result, expected)
  3736. expected = df[df.s.isin(['d'])]
  3737. result = store.select('df3', where=['s in ["d"]'])
  3738. tm.assert_frame_equal(result, expected)
  3739. expected = df[df.s.isin(['f'])]
  3740. result = store.select('df3', where=['s in ["f"]'])
  3741. tm.assert_frame_equal(result, expected)
  3742. # appending with same categories is ok
  3743. store.append('df3', df)
  3744. df = concat([df, df])
  3745. expected = df[df.s.isin(['b', 'c'])]
  3746. result = store.select('df3', where=['s in ["b","c"]'])
  3747. tm.assert_frame_equal(result, expected)
  3748. # appending must have the same categories
  3749. df3 = df.copy()
  3750. df3['s'].cat.remove_unused_categories(inplace=True)
  3751. self.assertRaises(ValueError, lambda: store.append('df3', df3))
  3752. # remove
3753. # make sure the metadata is removed (it's a recursive removal,
3754. # so it should be)
  3755. result = store.select('df3/meta/s/meta')
  3756. self.assertIsNotNone(result)
  3757. store.remove('df3')
  3758. self.assertRaises(
  3759. KeyError, lambda: store.select('df3/meta/s/meta'))
  3760. def test_categorical_conversion(self):
  3761. # GH13322
  3762. # Check that read_hdf with categorical columns doesn't return rows if
  3763. # where criteria isn't met.
  3764. obsids = ['ESP_012345_6789', 'ESP_987654_3210']
  3765. imgids = ['APF00006np', 'APF0001imm']
  3766. data = [4.3, 9.8]
  3767. # Test without categories
  3768. df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data))
  3769. # We are expecting an empty DataFrame matching types of df
  3770. expected = df.iloc[[], :]
  3771. with ensure_clean_path(self.path) as path:
  3772. df.to_hdf(path, 'df', format='table', data_columns=True)
  3773. result = read_hdf(path, 'df', where='obsids=B')
  3774. tm.assert_frame_equal(result, expected)
  3775. # Test with categories
  3776. df.obsids = df.obsids.astype('category')
  3777. df.imgids = df.imgids.astype('category')
  3778. # We are expecting an empty DataFrame matching types of df
  3779. expected = df.iloc[[], :]
  3780. with ensure_clean_path(self.path) as path:
  3781. df.to_hdf(path, 'df', format='table', data_columns=True)
  3782. result = read_hdf(path, 'df', where='obsids=B')
  3783. tm.assert_frame_equal(result, expected)
  3784. def test_duplicate_column_name(self):
  3785. df = DataFrame(columns=["a", "a"], data=[[0, 0]])
  3786. with ensure_clean_path(self.path) as path:
  3787. self.assertRaises(ValueError, df.to_hdf,
  3788. path, 'df', format='fixed')
  3789. df.to_hdf(path, 'df', format='table')
  3790. other = read_hdf(path, 'df')
  3791. tm.assert_frame_equal(df, other)
  3792. self.assertTrue(df.equals(other))
  3793. self.assertTrue(other.equals(df))
  3794. def test_round_trip_equals(self):
  3795. # GH 9330
  3796. df = DataFrame({"B": [1, 2], "A": ["x", "y"]})
  3797. with ensure_clean_path(self.path) as path:
  3798. df.to_hdf(path, 'df', format='table')
  3799. other = read_hdf(path, 'df')
  3800. tm.assert_frame_equal(df, other)
  3801. self.assertTrue(df.equals(other))
  3802. self.assertTrue(other.equals(df))
  3803. def test_preserve_timedeltaindex_type(self):
  3804. # GH9635
  3805. # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve
  3806. # the type of the index.
  3807. df = DataFrame(np.random.normal(size=(10, 5)))
  3808. df.index = timedelta_range(
  3809. start='0s', periods=10, freq='1s', name='example')
  3810. with ensure_clean_store(self.path) as store:
  3811. store['df'] = df
  3812. assert_frame_equal(store['df'], df)
3813. def test_columns_multiindex_modified(self):
  3814. # BUG: 7212
  3815. # read_hdf store.select modified the passed columns parameters
  3816. # when multi-indexed.
  3817. df = DataFrame(np.random.rand(4, 5),
  3818. index=list('abcd'),
  3819. columns=list('ABCDE'))
  3820. df.index.name = 'letters'
  3821. df = df.set_index(keys='E', append=True)
  3822. data_columns = df.index.names + df.columns.tolist()
  3823. with ensure_clean_path(self.path) as path:
  3824. df.to_hdf(path, 'df',
  3825. mode='a',
  3826. append=True,
  3827. data_columns=data_columns,
  3828. index=False)
  3829. cols2load = list('BCD')
  3830. cols2load_original = list(cols2load)
  3831. df_loaded = read_hdf(path, 'df', columns=cols2load) # noqa
  3832. self.assertTrue(cols2load_original == cols2load)
  3833. def test_to_hdf_with_object_column_names(self):
  3834. # GH9057
  3835. # Writing HDF5 table format should only work for string-like
  3836. # column types
  3837. types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex,
  3838. tm.makeDateIndex, tm.makeTimedeltaIndex,
  3839. tm.makePeriodIndex]
  3840. types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex]
  3841. if compat.PY3:
  3842. types_should_run.append(tm.makeUnicodeIndex)
  3843. else:
  3844. types_should_fail.append(tm.makeUnicodeIndex)
  3845. for index in types_should_fail:
  3846. df = DataFrame(np.random.randn(10, 2), columns=index(2))
  3847. with ensure_clean_path(self.path) as path:
  3848. with self.assertRaises(
  3849. ValueError, msg=("cannot have non-object label "
  3850. "DataIndexableCol")):
  3851. df.to_hdf(path, 'df', format='table', data_columns=True)
  3852. for index in types_should_run:
  3853. df = DataFrame(np.random.randn(10, 2), columns=index(2))
  3854. with ensure_clean_path(self.path) as path:
  3855. df.to_hdf(path, 'df', format='table', data_columns=True)
  3856. result = pd.read_hdf(
  3857. path, 'df', where="index = [{0}]".format(df.index[0]))
  3858. assert(len(result))
  3859. def test_read_hdf_open_store(self):
  3860. # GH10330
3861. # No check for non-string path_or_buf, and no test of open store
  3862. df = DataFrame(np.random.rand(4, 5),
  3863. index=list('abcd'),
  3864. columns=list('ABCDE'))
  3865. df.index.name = 'letters'
  3866. df = df.set_index(keys='E', append=True)
  3867. with ensure_clean_path(self.path) as path:
  3868. df.to_hdf(path, 'df', mode='w')
  3869. direct = read_hdf(path, 'df')
  3870. store = HDFStore(path, mode='r')
  3871. indirect = read_hdf(store, 'df')
  3872. tm.assert_frame_equal(direct, indirect)
  3873. self.assertTrue(store.is_open)
  3874. store.close()
  3875. def test_read_hdf_iterator(self):
  3876. df = DataFrame(np.random.rand(4, 5),
  3877. index=list('abcd'),
  3878. columns=list('ABCDE'))
  3879. df.index.name = 'letters'
  3880. df = df.set_index(keys='E', append=True)
  3881. with ensure_clean_path(self.path) as path:
  3882. df.to_hdf(path, 'df', mode='w', format='t')
  3883. direct = read_hdf(path, 'df')
  3884. iterator = read_hdf(path, 'df', iterator=True)
  3885. self.assertTrue(isinstance(iterator, TableIterator))
  3886. indirect = next(iterator.__iter__())
  3887. tm.assert_frame_equal(direct, indirect)
  3888. iterator.store.close()
  3889. def test_read_hdf_errors(self):
  3890. df = DataFrame(np.random.rand(4, 5),
  3891. index=list('abcd'),
  3892. columns=list('ABCDE'))
  3893. with ensure_clean_path(self.path) as path:
  3894. self.assertRaises(IOError, read_hdf, path, 'key')
  3895. df.to_hdf(path, 'df')
  3896. store = HDFStore(path, mode='r')
  3897. store.close()
  3898. self.assertRaises(IOError, read_hdf, store, 'df')
  3899. with open(path, mode='r') as store:
  3900. self.assertRaises(NotImplementedError, read_hdf, store, 'df')
  3901. def test_invalid_complib(self):
  3902. df = DataFrame(np.random.rand(4, 5),
  3903. index=list('abcd'),
  3904. columns=list('ABCDE'))
  3905. with ensure_clean_path(self.path) as path:
  3906. self.assertRaises(ValueError, df.to_hdf, path,
  3907. 'df', complib='blosc:zlib')
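# nested complib specs such as 'blosc:zlib' are rejected by this
# version of the writer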
  3908. # GH10443
  3909. def test_read_nokey(self):
  3910. df = DataFrame(np.random.rand(4, 5),
  3911. index=list('abcd'),
  3912. columns=list('ABCDE'))
  3913. # Categorical dtype not supported for "fixed" format. So no need
  3914. # to test with that dtype in the dataframe here.
  3915. with ensure_clean_path(self.path) as path:
  3916. df.to_hdf(path, 'df', mode='a')
  3917. reread = read_hdf(path)
  3918. assert_frame_equal(df, reread)
  3919. df.to_hdf(path, 'df2', mode='a')
  3920. self.assertRaises(ValueError, read_hdf, path)
  3921. def test_read_nokey_table(self):
  3922. # GH13231
  3923. df = DataFrame({'i': range(5),
  3924. 'c': Series(list('abacd'), dtype='category')})
  3925. with ensure_clean_path(self.path) as path:
  3926. df.to_hdf(path, 'df', mode='a', format='table')
  3927. reread = read_hdf(path)
  3928. assert_frame_equal(df, reread)
  3929. df.to_hdf(path, 'df2', mode='a', format='table')
  3930. self.assertRaises(ValueError, read_hdf, path)
  3931. def test_read_nokey_empty(self):
  3932. with ensure_clean_path(self.path) as path:
  3933. store = HDFStore(path)
  3934. store.close()
  3935. self.assertRaises(ValueError, read_hdf, path)
  3936. def test_read_from_pathlib_path(self):
  3937. # GH11773
  3938. tm._skip_if_no_pathlib()
  3939. from pathlib import Path
  3940. expected = DataFrame(np.random.rand(4, 5),
  3941. index=list('abcd'),
  3942. columns=list('ABCDE'))
  3943. with ensure_clean_path(self.path) as filename:
  3944. path_obj = Path(filename)
  3945. expected.to_hdf(path_obj, 'df', mode='a')
  3946. actual = read_hdf(path_obj, 'df')
  3947. tm.assert_frame_equal(expected, actual)
  3948. def test_read_from_py_localpath(self):
  3949. # GH11773
  3950. tm._skip_if_no_localpath()
  3951. from py.path import local as LocalPath
  3952. expected = DataFrame(np.random.rand(4, 5),
  3953. index=list('abcd'),
  3954. columns=list('ABCDE'))
  3955. with ensure_clean_path(self.path) as filename:
  3956. path_obj = LocalPath(filename)
  3957. expected.to_hdf(path_obj, 'df', mode='a')
  3958. actual = read_hdf(path_obj, 'df')
  3959. tm.assert_frame_equal(expected, actual)
  3960. class TestHDFComplexValues(Base):
  3961. # GH10447
  3962. def test_complex_fixed(self):
  3963. df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
  3964. index=list('abcd'),
  3965. columns=list('ABCDE'))
  3966. with ensure_clean_path(self.path) as path:
  3967. df.to_hdf(path, 'df')
  3968. reread = read_hdf(path, 'df')
  3969. assert_frame_equal(df, reread)
  3970. df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
  3971. index=list('abcd'),
  3972. columns=list('ABCDE'))
  3973. with ensure_clean_path(self.path) as path:
  3974. df.to_hdf(path, 'df')
  3975. reread = read_hdf(path, 'df')
  3976. assert_frame_equal(df, reread)

    def test_complex_table(self):
        df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
                       index=list('abcd'),
                       columns=list('ABCDE'))

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)

        df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
                       index=list('abcd'),
                       columns=list('ABCDE'))

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table', mode='w')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)

    def test_complex_mixed_fixed(self):
        complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j,
                              1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64)
        complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j],
                              dtype=np.complex128)
        df = DataFrame({'A': [1, 2, 3, 4],
                        'B': ['a', 'b', 'c', 'd'],
                        'C': complex64,
                        'D': complex128,
                        'E': [1.0, 2.0, 3.0, 4.0]},
                       index=list('abcd'))
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)

    def test_complex_mixed_table(self):
        complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j,
                              1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64)
        complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j],
                              dtype=np.complex128)
        df = DataFrame({'A': [1, 2, 3, 4],
                        'B': ['a', 'b', 'c', 'd'],
                        'C': complex64,
                        'D': complex128,
                        'E': [1.0, 2.0, 3.0, 4.0]},
                       index=list('abcd'))

        with ensure_clean_store(self.path) as store:
            store.append('df', df, data_columns=['A', 'B'])
            result = store.select('df', where=Term('A>2'))
            assert_frame_equal(df.loc[df.A > 2], result)

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)
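
    # the round-trips should work the same across container dimensions
    # (Series, DataFrame, Panel, Panel4D)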
    def test_complex_across_dimensions_fixed(self):
        complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
        s = Series(complex128, index=list('abcd'))
        df = DataFrame({'A': s, 'B': s})
        p = Panel({'One': df, 'Two': df})

        objs = [s, df, p]
        comps = [tm.assert_series_equal, tm.assert_frame_equal,
                 tm.assert_panel_equal]
        for obj, comp in zip(objs, comps):
            with ensure_clean_path(self.path) as path:
                obj.to_hdf(path, 'obj', format='fixed')
                reread = read_hdf(path, 'obj')
                comp(obj, reread)

    def test_complex_across_dimensions(self):
        complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
        s = Series(complex128, index=list('abcd'))
        df = DataFrame({'A': s, 'B': s})
        p = Panel({'One': df, 'Two': df})

        with compat_assert_produces_warning(FutureWarning):
            p4d = pd.Panel4D({'i': p, 'ii': p})

        objs = [df, p, p4d]
        comps = [tm.assert_frame_equal, tm.assert_panel_equal,
                 tm.assert_panel4d_equal]
        for obj, comp in zip(objs, comps):
            with ensure_clean_path(self.path) as path:
                obj.to_hdf(path, 'obj', format='table')
                reread = read_hdf(path, 'obj')
                comp(obj, reread)
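
    # complex values cannot be serialized as indexable (queryable)
    # columns, so requesting a complex data column or table index
    # should raise TypeError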
    def test_complex_indexing_error(self):
        complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j],
                              dtype=np.complex128)
        df = DataFrame({'A': [1, 2, 3, 4],
                        'B': ['a', 'b', 'c', 'd'],
                        'C': complex128},
                       index=list('abcd'))
        with ensure_clean_store(self.path) as store:
            self.assertRaises(TypeError, store.append,
                              'df', df, data_columns=['C'])

    def test_complex_series_error(self):
        complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
        s = Series(complex128, index=list('abcd'))

        with ensure_clean_path(self.path) as path:
            self.assertRaises(TypeError, s.to_hdf, path, 'obj', format='t')

        with ensure_clean_path(self.path) as path:
            s.to_hdf(path, 'obj', format='t', index=False)
            reread = read_hdf(path, 'obj')
            tm.assert_series_equal(s, reread)

    def test_complex_append(self):
        df = DataFrame({'a': np.random.randn(100).astype(np.complex128),
                        'b': np.random.randn(100)})

        with ensure_clean_store(self.path) as store:
            store.append('df', df, data_columns=['b'])
            store.append('df', df)
            result = store.select('df')
            assert_frame_equal(pd.concat([df, df], axis=0), result)
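

# timezone-aware values must keep their tz through a store round-trip;
# _compare_with_tz checks the tz on each element, since frame equality
# alone would not catch a dropped or coerced timezone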
class TestTimezones(Base, tm.TestCase):

    def _compare_with_tz(self, a, b):
        tm.assert_frame_equal(a, b)

        # compare the zones on each element
        for c in a.columns:
            for i in a.index:
                a_e = a.loc[i, c]
                b_e = b.loc[i, c]
                if not (a_e == b_e and a_e.tz == b_e.tz):
                    raise AssertionError(
                        "invalid tz comparison [%s] [%s]" % (a_e, b_e))

    def test_append_with_timezones_dateutil(self):
        from datetime import timedelta
        tm._skip_if_no_dateutil()

        # use maybe_get_tz instead of dateutil.tz.gettz to handle the
        # windows filename issues.
        from pandas.tslib import maybe_get_tz
        gettz = lambda x: maybe_get_tz('dateutil/' + x)

        # as columns
        with ensure_clean_store(self.path) as store:

            _maybe_remove(store, 'df_tz')
            df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', tz=gettz(
                'US/Eastern')) + timedelta(hours=1) * i for i in range(5)]))

            store.append('df_tz', df, data_columns=['A'])
            result = store['df_tz']
            self._compare_with_tz(result, df)
            assert_frame_equal(result, df)

            # select with tz aware
            expected = df[df.A >= df.A[3]]
            result = store.select('df_tz', where=Term('A>=df.A[3]'))
            self._compare_with_tz(result, expected)

            # ensure we include dates in DST and STD time here.
            _maybe_remove(store, 'df_tz')
            df = DataFrame(dict(A=Timestamp('20130102',
                                            tz=gettz('US/Eastern')),
                                B=Timestamp('20130603',
                                            tz=gettz('US/Eastern'))),
                           index=range(5))
            store.append('df_tz', df)
            result = store['df_tz']
            self._compare_with_tz(result, df)
            assert_frame_equal(result, df)

            df = DataFrame(dict(A=Timestamp('20130102',
                                            tz=gettz('US/Eastern')),
                                B=Timestamp('20130102', tz=gettz('EET'))),
                           index=range(5))
            self.assertRaises(ValueError, store.append, 'df_tz', df)

            # this is ok
            _maybe_remove(store, 'df_tz')
            store.append('df_tz', df, data_columns=['A', 'B'])
            result = store['df_tz']
            self._compare_with_tz(result, df)
            assert_frame_equal(result, df)

            # can't append with diff timezone
            df = DataFrame(dict(A=Timestamp('20130102',
                                            tz=gettz('US/Eastern')),
                                B=Timestamp('20130102', tz=gettz('CET'))),
                           index=range(5))
            self.assertRaises(ValueError, store.append, 'df_tz', df)

        # as index
        with ensure_clean_store(self.path) as store:

            # GH 4098 example
            df = DataFrame(dict(A=Series(lrange(3), index=date_range(
                '2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern')))))

            _maybe_remove(store, 'df')
            store.put('df', df)
            result = store.select('df')
            assert_frame_equal(result, df)

            _maybe_remove(store, 'df')
            store.append('df', df)
            result = store.select('df')
            assert_frame_equal(result, df)
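
    # same scenarios as above, using pytz timezone strings instead of
    # dateutil timezone objects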
    def test_append_with_timezones_pytz(self):
        from datetime import timedelta

        # as columns
        with ensure_clean_store(self.path) as store:

            _maybe_remove(store, 'df_tz')
            df = DataFrame(dict(A=[Timestamp('20130102 2:00:00',
                                             tz='US/Eastern') +
                                   timedelta(hours=1) * i
                                   for i in range(5)]))
            store.append('df_tz', df, data_columns=['A'])
            result = store['df_tz']
            self._compare_with_tz(result, df)
            assert_frame_equal(result, df)

            # select with tz aware
            self._compare_with_tz(store.select(
                'df_tz', where=Term('A>=df.A[3]')), df[df.A >= df.A[3]])

            _maybe_remove(store, 'df_tz')
            # ensure we include dates in DST and STD time here.
            df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
                                B=Timestamp('20130603', tz='US/Eastern')),
                           index=range(5))
            store.append('df_tz', df)
            result = store['df_tz']
            self._compare_with_tz(result, df)
            assert_frame_equal(result, df)

            df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
                                B=Timestamp('20130102', tz='EET')),
                           index=range(5))
            self.assertRaises(ValueError, store.append, 'df_tz', df)

            # this is ok
            _maybe_remove(store, 'df_tz')
            store.append('df_tz', df, data_columns=['A', 'B'])
            result = store['df_tz']
            self._compare_with_tz(result, df)
            assert_frame_equal(result, df)

            # can't append with diff timezone
            df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
                                B=Timestamp('20130102', tz='CET')),
                           index=range(5))
            self.assertRaises(ValueError, store.append, 'df_tz', df)

        # as index
        with ensure_clean_store(self.path) as store:

            # GH 4098 example
            df = DataFrame(dict(A=Series(lrange(3), index=date_range(
                '2000-1-1', periods=3, freq='H', tz='US/Eastern'))))

            _maybe_remove(store, 'df')
            store.put('df', df)
            result = store.select('df')
            assert_frame_equal(result, df)

            _maybe_remove(store, 'df')
            store.append('df', df)
            result = store.select('df')
            assert_frame_equal(result, df)
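
    # select_column must hand the index column back with its original tz
    # (naive, UTC and non-UTC cases)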
    def test_tseries_select_index_column(self):
        # GH7777
        # selecting a UTC datetimeindex column did
        # not preserve UTC tzinfo set before storing

        # check that no tz still works
        rng = date_range('1/1/2000', '1/30/2000')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

        with ensure_clean_store(self.path) as store:
            store.append('frame', frame)
            result = store.select_column('frame', 'index')
            self.assertEqual(rng.tz, DatetimeIndex(result.values).tz)

        # check utc
        rng = date_range('1/1/2000', '1/30/2000', tz='UTC')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

        with ensure_clean_store(self.path) as store:
            store.append('frame', frame)
            result = store.select_column('frame', 'index')
            self.assertEqual(rng.tz, result.dt.tz)

        # double check non-utc
        rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

        with ensure_clean_store(self.path) as store:
            store.append('frame', frame)
            result = store.select_column('frame', 'index')
            self.assertEqual(rng.tz, result.dt.tz)
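
    # the "fixed" format should also preserve tz-aware indexes and
    # datetime columns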
    def test_timezones_fixed(self):
        with ensure_clean_store(self.path) as store:

            # index
            rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
            df = DataFrame(np.random.randn(len(rng), 4), index=rng)
            store['df'] = df
            result = store['df']
            assert_frame_equal(result, df)

            # as data
            # GH11411
            _maybe_remove(store, 'df')
            df = DataFrame({'A': rng,
                            'B': rng.tz_convert('UTC').tz_localize(None),
                            'C': rng.tz_convert('CET'),
                            'D': range(len(rng))}, index=rng)
            store['df'] = df
            result = store['df']
            assert_frame_equal(result, df)
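
    # fixed-offset timezones (a plain -07:00 offset rather than a named
    # zone) should round-trip as well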
    def test_fixed_offset_tz(self):
        rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

        with ensure_clean_store(self.path) as store:
            store['frame'] = frame
            recons = store['frame']
            self.assert_index_equal(recons.index, rng)
            self.assertEqual(rng.tz, recons.index.tz)

    def test_store_timezone(self):
        # GH2852
        # storing datetime.date values with a timezone set used to shift
        # the dates when they were read back in a different timezone

        # original method
        with ensure_clean_store(self.path) as store:

            today = datetime.date(2013, 9, 10)
            df = DataFrame([1, 2, 3], index=[today, today, today])
            store['obj1'] = df
            result = store['obj1']
            assert_frame_equal(result, df)

        # with tz setting
        with ensure_clean_store(self.path) as store:

            with set_timezone('EST5EDT'):
                today = datetime.date(2013, 9, 10)
                df = DataFrame([1, 2, 3], index=[today, today, today])
                store['obj1'] = df

            with set_timezone('CST6CDT'):
                result = store['obj1']

            assert_frame_equal(result, df)

    def test_legacy_datetimetz_object(self):
        # legacy from < 0.17.0
        # GH8260
        expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
                                  B=Timestamp('20130603', tz='CET')),
                             index=range(5))
        with ensure_clean_store(
                tm.get_data_path('legacy_hdf/datetimetz_object.h5'),
                mode='r') as store:
            result = store['df']
            assert_frame_equal(result, expected)

    def test_dst_transitions(self):
        # make sure we are not failing on transitions
        with ensure_clean_store(self.path) as store:
            times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00",
                                  tz="Europe/London",
                                  freq="H",
                                  ambiguous='infer')

            for i in [times, times + pd.Timedelta('10min')]:
                _maybe_remove(store, 'df')
                df = DataFrame({'A': range(len(i)), 'B': i}, index=i)
                store.append('df', df)
                result = store.select('df')
                assert_frame_equal(result, df)
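

# helper: return obj reindexed in sorted order (the index for a
# DataFrame, the major_axis for a Panel)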
def _test_sort(obj):
    if isinstance(obj, DataFrame):
        return obj.reindex(sorted(obj.index))
    elif isinstance(obj, Panel):
        return obj.reindex(major=sorted(obj.major_axis))
    else:
        raise ValueError('type not supported here')


if __name__ == '__main__':
    import nose
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)