PageRenderTime 51ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/io/tests/sas/test_xport.py

http://github.com/wesm/pandas
Python | 126 lines | 70 code | 33 blank | 23 comment | 3 complexity | d2bba4a28c54aac37f88e10a88e03379 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. import pandas as pd
  2. import pandas.util.testing as tm
  3. from pandas.io.sas.sasreader import read_sas
  4. import numpy as np
  5. import os
  6. # CSV versions of test xpt files were obtained using the R foreign library
  7. # Numbers in a SAS xport file are always float64, so need to convert
  8. # before making comparisons.
  9. def numeric_as_float(data):
  10. for v in data.columns:
  11. if data[v].dtype is np.dtype('int64'):
  12. data[v] = data[v].astype(np.float64)
  13. class TestXport(tm.TestCase):
  14. def setUp(self):
  15. self.dirpath = tm.get_data_path()
  16. self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
  17. self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
  18. self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
  19. self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
  20. def test1_basic(self):
  21. # Tests with DEMO_G.xpt (all numeric file)
  22. # Compare to this
  23. data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
  24. numeric_as_float(data_csv)
  25. # Read full file
  26. data = read_sas(self.file01, format="xport")
  27. tm.assert_frame_equal(data, data_csv)
  28. # Test incremental read with `read` method.
  29. reader = read_sas(self.file01, format="xport", iterator=True)
  30. data = reader.read(10)
  31. reader.close()
  32. tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
  33. # Test incremental read with `get_chunk` method.
  34. reader = read_sas(self.file01, format="xport", chunksize=10)
  35. data = reader.get_chunk()
  36. reader.close()
  37. tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
  38. # Read full file with `read_sas` method
  39. data = read_sas(self.file01)
  40. tm.assert_frame_equal(data, data_csv)
  41. def test1_index(self):
  42. # Tests with DEMO_G.xpt using index (all numeric file)
  43. # Compare to this
  44. data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
  45. data_csv = data_csv.set_index("SEQN")
  46. numeric_as_float(data_csv)
  47. # Read full file
  48. data = read_sas(self.file01, index="SEQN", format="xport")
  49. tm.assert_frame_equal(data, data_csv, check_index_type=False)
  50. # Test incremental read with `read` method.
  51. reader = read_sas(self.file01, index="SEQN", format="xport",
  52. iterator=True)
  53. data = reader.read(10)
  54. reader.close()
  55. tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
  56. check_index_type=False)
  57. # Test incremental read with `get_chunk` method.
  58. reader = read_sas(self.file01, index="SEQN", format="xport",
  59. chunksize=10)
  60. data = reader.get_chunk()
  61. reader.close()
  62. tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
  63. check_index_type=False)
  64. def test1_incremental(self):
  65. # Test with DEMO_G.xpt, reading full file incrementally
  66. data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
  67. data_csv = data_csv.set_index("SEQN")
  68. numeric_as_float(data_csv)
  69. reader = read_sas(self.file01, index="SEQN", chunksize=1000)
  70. all_data = [x for x in reader]
  71. data = pd.concat(all_data, axis=0)
  72. tm.assert_frame_equal(data, data_csv, check_index_type=False)
  73. def test2(self):
  74. # Test with SSHSV1_A.xpt
  75. # Compare to this
  76. data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
  77. numeric_as_float(data_csv)
  78. data = read_sas(self.file02)
  79. tm.assert_frame_equal(data, data_csv)
  80. def test_multiple_types(self):
  81. # Test with DRXFCD_G.xpt (contains text and numeric variables)
  82. # Compare to this
  83. data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv"))
  84. data = read_sas(self.file03, encoding="utf-8")
  85. tm.assert_frame_equal(data, data_csv)
  86. def test_truncated_float_support(self):
  87. # Test with paxraw_d_short.xpt, a shortened version of:
  88. # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
  89. # This file has truncated floats (5 bytes in this case).
  90. # GH 11713
  91. data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))
  92. data = read_sas(self.file04, format="xport")
  93. tm.assert_frame_equal(data.astype('int64'), data_csv)