PageRenderTime 346ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/io/tests/test_parsers.py

http://github.com/pydata/pandas
Python | 3569 lines | 3512 code | 35 blank | 22 comment | 15 complexity | d63a8c14b3ed6a7fbb6579f61b69ceca MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. # -*- coding: utf-8 -*-
  2. # pylint: disable=E1101
  3. from datetime import datetime
  4. import csv
  5. import os
  6. import sys
  7. import re
  8. import nose
  9. import platform
  10. from numpy import nan
  11. import numpy as np
  12. from pandas.io.common import DtypeWarning
  13. from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
  14. from pandas.compat import(
  15. StringIO, BytesIO, PY3, range, long, lrange, lmap, u
  16. )
  17. from pandas.io.common import URLError
  18. import pandas.io.parsers as parsers
  19. from pandas.io.parsers import (read_csv, read_table, read_fwf,
  20. TextFileReader, TextParser)
  21. import pandas.util.testing as tm
  22. import pandas as pd
  23. from pandas.compat import parse_date
  24. import pandas.lib as lib
  25. from pandas import compat
  26. from pandas.lib import Timestamp
  27. from pandas.tseries.index import date_range
  28. import pandas.tseries.tools as tools
  29. from numpy.testing.decorators import slow
  30. from numpy.testing import assert_array_equal
  31. from pandas.parser import OverflowError, CParserError
class ParserTests(object):
    """
    Want to be able to test either C+Cython or Python+Cython parsers
    """
    # Shared fixture: a small CSV with a string index column and four
    # integer data columns, used by several tests below via self.data1.
    data1 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    def read_csv(self, *args, **kwargs):
        # Abstract hook: concrete subclasses bind this to the parser engine
        # under test (C or Python).
        raise NotImplementedError
    def read_table(self, *args, **kwargs):
        # Abstract hook: concrete subclasses bind this to the parser engine
        # under test (C or Python).
        raise NotImplementedError
    def setUp(self):
        import warnings
        # Silence FutureWarning noise from deprecated options exercised below.
        warnings.filterwarnings(action='ignore', category=FutureWarning)
        # Paths to the on-disk fixtures shipped with the test suite.
        self.dirpath = tm.get_data_path()
        self.csv1 = os.path.join(self.dirpath, 'test1.csv')
        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
        self.xls1 = os.path.join(self.dirpath, 'test.xls')
  55. def test_converters_type_must_be_dict(self):
  56. with tm.assertRaisesRegexp(TypeError, 'Type converters.+'):
  57. self.read_csv(StringIO(self.data1), converters=0)
  58. def test_multi_character_decimal_marker(self):
  59. data = """A|B|C
  60. 1|2,334|5
  61. 10|13|10.
  62. """
  63. self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,')
  64. def test_empty_decimal_marker(self):
  65. data = """A|B|C
  66. 1|2,334|5
  67. 10|13|10.
  68. """
  69. self.assertRaises(ValueError, read_csv, StringIO(data), decimal='')
  70. def test_empty_thousands_marker(self):
  71. data = """A|B|C
  72. 1|2,334|5
  73. 10|13|10.
  74. """
  75. self.assertRaises(ValueError, read_csv, StringIO(data), thousands='')
  76. def test_multi_character_decimal_marker(self):
  77. data = """A|B|C
  78. 1|2,334|5
  79. 10|13|10.
  80. """
  81. self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,')
    def test_empty_string(self):
        """Interplay of empty fields, 'nan' strings, na_values and
        keep_default_na."""
        data = """\
One,Two,Three
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        # Defaults: empty fields and the string 'nan' both become NaN.
        df = self.read_csv(StringIO(data))
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # keep_default_na=False with empty per-column lists: nothing is NaN,
        # empty fields stay '' and 'nan' stays a literal string.
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []},
                           keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five',
                                  '', 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # keep_default_na=False with a global na_values list: only 'a' is NaN.
        df = self.read_csv(
            StringIO(data), na_values=['a'], keep_default_na=False)
        xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # Empty per-column lists with defaults kept: same as plain defaults.
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []})
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # GH4318, passing na_values=None and keep_default_na=False yields 'None' as a na_value
        data = """\
One,Two,Three
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        df = self.read_csv(
            StringIO(data), keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['None', 'two', 'None', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
    def test_read_csv(self):
        # On Python 2, exercise reading through a file:// URL built from a
        # local path (three slashes on Windows, two elsewhere).  This body is
        # intentionally a no-op on Python 3.
        if not compat.PY3:
            if 'win' in sys.platform:
                prefix = u("file:///")
            else:
                prefix = u("file://")
            fname = prefix + compat.text_type(self.csv1)
            # it works!
            df1 = read_csv(fname, index_col=0, parse_dates=True)
    def test_dialect(self):
        """A csv.Dialect with QUOTE_NONE leaves quote characters in the data."""
        data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""
        dia = csv.excel()
        dia.quoting = csv.QUOTE_NONE
        df = self.read_csv(StringIO(data), dialect=dia)
        # Expected frame: same data without the stray quote, then re-add the
        # literal '"' to the affected cell since QUOTE_NONE preserves it.
        data = '''\
label1,label2,label3
index1,a,c,e
index2,b,d,f
'''
        exp = self.read_csv(StringIO(data))
        exp.replace('a', '"a', inplace=True)
        tm.assert_frame_equal(df, exp)
  163. def test_1000_sep(self):
  164. data = """A|B|C
  165. 1|2,334|5
  166. 10|13|10.
  167. """
  168. expected = DataFrame({
  169. 'A': [1, 10],
  170. 'B': [2334, 13],
  171. 'C': [5, 10.]
  172. })
  173. df = self.read_csv(StringIO(data), sep='|', thousands=',')
  174. tm.assert_frame_equal(df, expected)
  175. df = self.read_table(StringIO(data), sep='|', thousands=',')
  176. tm.assert_frame_equal(df, expected)
    def test_1000_sep_with_decimal(self):
        """thousands and decimal markers can be combined, including the
        European-style swap ('.' grouping, ',' decimal)."""
        data = """A|B|C
1|2,334.01|5
10|13|10.
"""
        expected = DataFrame({
            'A': [1, 10],
            'B': [2334.01, 13],
            'C': [5, 10.]
        })
        # Sanity-check the dtypes the parser is expected to produce.
        tm.assert_equal(expected.A.dtype, 'int64')
        tm.assert_equal(expected.B.dtype, 'float')
        tm.assert_equal(expected.C.dtype, 'float')
        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
        tm.assert_frame_equal(df, expected)
        df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.')
        tm.assert_frame_equal(df, expected)
        # Same values written with '.' as the grouping mark and ',' as the
        # decimal mark must parse to the identical frame.
        data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
        df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
        tm.assert_frame_equal(df, expected)
        df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',')
        tm.assert_frame_equal(df, expected)
    def test_separator_date_conflict(self):
        # Regression test for issue #4678: make sure thousands separator and
        # date parsing do not conflict.
        data = '06-02-2013;13:00;1-000.215'
        expected = DataFrame(
            [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
            columns=['Date', 2]
        )
        # '-' is both the thousands mark and a date separator; the merged
        # date columns must still parse while '1-000.215' becomes 1000.215.
        df = self.read_csv(StringIO(data), sep=';', thousands='-',
                           parse_dates={'Date': [0, 1]}, header=None)
        tm.assert_frame_equal(df, expected)
  212. def test_squeeze(self):
  213. data = """\
  214. a,1
  215. b,2
  216. c,3
  217. """
  218. expected = Series([1, 2, 3], ['a', 'b', 'c'])
  219. result = self.read_table(StringIO(data), sep=',', index_col=0,
  220. header=None, squeeze=True)
  221. tm.assert_isinstance(result, Series)
  222. tm.assert_series_equal(result, expected)
  223. def test_inf_parsing(self):
  224. data = """\
  225. ,A
  226. a,inf
  227. b,-inf
  228. c,Inf
  229. d,-Inf
  230. e,INF
  231. f,-INF
  232. g,INf
  233. h,-INf
  234. i,inF
  235. j,-inF"""
  236. inf = float('inf')
  237. expected = Series([inf, -inf] * 5)
  238. df = read_csv(StringIO(data), index_col=0)
  239. tm.assert_almost_equal(df['A'].values, expected.values)
  240. df = read_csv(StringIO(data), index_col=0, na_filter=False)
  241. tm.assert_almost_equal(df['A'].values, expected.values)
    def test_multiple_date_col(self):
        # Can use multiple date parsers
        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

        def func(*date_cols):
            # Custom parser: concatenate the raw column strings, then parse.
            return lib.try_parse_dates(parsers._concat_date_cols(date_cols))

        # dict-form parse_dates with a custom parser: source columns are
        # consumed (X1/X2/X3 gone), combined columns appear by name.
        df = self.read_csv(StringIO(data), header=None,
                           date_parser=func,
                           prefix='X',
                           parse_dates={'nominal': [1, 2],
                                        'actual': [1, 3]})
        self.assertIn('nominal', df)
        self.assertIn('actual', df)
        self.assertNotIn('X1', df)
        self.assertNotIn('X2', df)
        self.assertNotIn('X3', df)
        d = datetime(1999, 1, 27, 19, 0)
        self.assertEqual(df.ix[0, 'nominal'], d)

        # keep_date_col=True retains the original integer-named columns.
        df = self.read_csv(StringIO(data), header=None,
                           date_parser=func,
                           parse_dates={'nominal': [1, 2],
                                        'actual': [1, 3]},
                           keep_date_col=True)
        self.assertIn('nominal', df)
        self.assertIn('actual', df)
        self.assertIn(1, df)
        self.assertIn(2, df)
        self.assertIn(3, df)

        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
        # list-of-lists form: combined columns are named by joining the
        # source column labels with '_'.
        df = read_csv(StringIO(data), header=None,
                      prefix='X',
                      parse_dates=[[1, 2], [1, 3]])
        self.assertIn('X1_X2', df)
        self.assertIn('X1_X3', df)
        self.assertNotIn('X1', df)
        self.assertNotIn('X2', df)
        self.assertNotIn('X3', df)
        d = datetime(1999, 1, 27, 19, 0)
        self.assertEqual(df.ix[0, 'X1_X2'], d)

        df = read_csv(StringIO(data), header=None,
                      parse_dates=[[1, 2], [1, 3]], keep_date_col=True)
        self.assertIn('1_2', df)
        self.assertIn('1_3', df)
        self.assertIn(1, df)
        self.assertIn(2, df)
        self.assertIn(3, df)

        # Single parsed column used as the index.
        data = '''\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
'''
        df = self.read_csv(StringIO(data), sep=',', header=None,
                           parse_dates=[1], index_col=1)
        d = datetime(1999, 1, 27, 19, 0)
        self.assertEqual(df.index[0], d)
    def test_multiple_date_cols_int_cast(self):
        """Integer-typed date columns survive a custom date_parser
        (conv.parse_date_time) without a cast error."""
        data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
                "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
        date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
        import pandas.io.date_converters as conv
        # it works!
        df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
                           date_parser=conv.parse_date_time)
        self.assertIn('nominal', df)
  325. def test_multiple_date_col_timestamp_parse(self):
  326. data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
  327. 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
  328. result = self.read_csv(StringIO(data), sep=',', header=None,
  329. parse_dates=[[0,1]], date_parser=Timestamp)
  330. ex_val = Timestamp('05/31/2012 15:30:00.029')
  331. self.assertEqual(result['0_1'][0], ex_val)
    def test_single_line(self):
        # GH 6607
        # Test currently only valid with python engine because sep=None and
        # delim_whitespace=False. Temporarily copied to TestPythonParser.
        # Test for ValueError with other engines:
        with tm.assertRaisesRegexp(ValueError,
                                   'sep=None with delim_whitespace=False'):
            # sniff separator
            # Redirect stdout: the C engine prints a warning message for
            # sep=None at the time of writing; restore it in finally.
            buf = StringIO()
            sys.stdout = buf
            # printing warning message when engine == 'c' for now
            try:
                # it works!
                df = self.read_csv(StringIO('1,2'), names=['a', 'b'],
                                   header=None, sep=None)
                tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df)
            finally:
                sys.stdout = sys.__stdout__
    def test_multiple_date_cols_with_header(self):
        """Named header columns merged by parse_dates produce real
        datetimes, not strings."""
        data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
        df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
        self.assertNotIsInstance(df.nominal[0], compat.string_types)
    # Shared fixture for the date-column collision tests below: hourly KORD
    # observations with separate date and time columns.
    ts_data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    def test_multiple_date_col_name_collision(self):
        """A merged date column whose name collides with an existing column
        raises ValueError (dict and list-of-lists forms)."""
        # dict form: 'ID' already exists as a column name
        self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data),
                          parse_dates={'ID': [1, 2]})
        # list form: auto-generated name 'date_NominalTime' already present
        data = """\
date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
        self.assertRaises(ValueError, self.read_csv, StringIO(data),
                          parse_dates=[[1, 2]])
  383. def test_index_col_named(self):
  384. no_header = """\
  385. KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  386. KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  387. KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  388. KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  389. KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  390. KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
  391. h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
  392. data = h + no_header
  393. # import pdb; pdb.set_trace()
  394. rs = self.read_csv(StringIO(data), index_col='ID')
  395. xp = self.read_csv(StringIO(data), header=0).set_index('ID')
  396. tm.assert_frame_equal(rs, xp)
  397. self.assertRaises(ValueError, self.read_csv, StringIO(no_header),
  398. index_col='ID')
  399. data = """\
  400. 1,2,3,4,hello
  401. 5,6,7,8,world
  402. 9,10,11,12,foo
  403. """
  404. names = ['a', 'b', 'c', 'd', 'message']
  405. xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11],
  406. 'd': [4, 8, 12]},
  407. index=Index(['hello', 'world', 'foo'], name='message'))
  408. rs = self.read_csv(StringIO(data), names=names, index_col=['message'])
  409. tm.assert_frame_equal(xp, rs)
  410. self.assertEqual(xp.index.name, rs.index.name)
  411. rs = self.read_csv(StringIO(data), names=names, index_col='message')
  412. tm.assert_frame_equal(xp, rs)
  413. self.assertEqual(xp.index.name, rs.index.name)
  414. def test_converter_index_col_bug(self):
  415. # 1835
  416. data = "A;B\n1;2\n3;4"
  417. rs = self.read_csv(StringIO(data), sep=';', index_col='A',
  418. converters={'A': lambda x: x})
  419. xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A'))
  420. tm.assert_frame_equal(rs, xp)
  421. self.assertEqual(rs.index.name, xp.index.name)
    def test_date_parser_int_bug(self):
        # #3071: an integer-valued first column fed through a custom
        # date_parser must not blow up.
        log_file = StringIO(
            'posix_timestamp,elapsed,sys,user,queries,query_time,rows,'
            'accountid,userid,contactid,level,silo,method\n'
            '1343103150,0.062353,0,4,6,0.01690,3,'
            '12345,1,-1,3,invoice_InvoiceResource,search\n'
        )

        def f(posix_string):
            # Convert a POSIX-epoch string to a naive UTC datetime.
            return datetime.utcfromtimestamp(int(posix_string))

        # it works!
        read_csv(log_file, index_col=0, parse_dates=0, date_parser=f)
    def test_multiple_skts_example(self):
        # NOTE(review): placeholder that was never implemented — the sample
        # data is defined but the body is a no-op.
        data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11."
        pass
    def test_malformed(self):
        """Rows with too many fields produce 'Expected N fields' errors in
        one-shot, skip_footer, and every chunked-iterator read mode."""
        # all
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
        try:
            df = self.read_table(
                StringIO(data), sep=',', header=1, comment='#')
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 4, saw 5', str(inst))
        # skip_footer
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
        # GH 6607
        # Test currently only valid with python engine because
        # skip_footer != 0. Temporarily copied to TestPythonParser.
        # Test for ValueError with other engines:
        try:
            with tm.assertRaisesRegexp(ValueError, 'skip_footer'):  #XXX
                df = self.read_table(
                    StringIO(data), sep=',', header=1, comment='#',
                    skip_footer=1)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 4, saw 5', str(inst))
        # first chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',',
                                 header=1, comment='#', iterator=True,
                                 chunksize=1, skiprows=[2])
            df = it.read(5)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))
        # middle chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',', header=1,
                                 comment='#', iterator=True, chunksize=1,
                                 skiprows=[2])
            df = it.read(1)
            it.read(2)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))
        # last chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',',
                                 header=1, comment='#', iterator=True,
                                 chunksize=1, skiprows=[2])
            df = it.read(1)
            it.read()
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))
    def test_passing_dtype(self):
        # GH 6607
        # Passing dtype is currently only supported by the C engine.
        # Temporarily copied to TestCParser*.
        # Test for ValueError with other engines:
        # NOTE: the whole body runs inside the assertRaisesRegexp context so
        # that non-C engines bail out early with the expected ValueError.
        with tm.assertRaisesRegexp(ValueError,
                                   "The 'dtype' option is not supported"):
            df = DataFrame(np.random.rand(5, 2), columns=list('AB'),
                           index=['1A', '1B', '1C', '1D', '1E'])
            with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
                df.to_csv(path)
                # GH 3795
                # passing 'str' as the dtype
                result = self.read_csv(path, dtype=str, index_col=0)
                tm.assert_series_equal(result.dtypes,
                                       Series({'A': 'object', 'B': 'object'}))
                # we expect all object columns, so need to convert to test for equivalence
                result = result.astype(float)
                tm.assert_frame_equal(result, df)
                # invalid dtype
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'foo', 'B': 'float64'},
                                  index_col=0)
                # valid but we don't support it (date)
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'datetime64', 'B': 'float64'},
                                  index_col=0)
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'datetime64', 'B': 'float64'},
                                  index_col=0, parse_dates=['B'])
                # valid but we don't support it
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'timedelta64', 'B': 'float64'},
                                  index_col=0)
    def test_quoting(self):
        """An unterminated quote raises; closing it yields a clean parse."""
        # Note: the third data row opens a quote ("Furststiftische...) that
        # is never closed, which must raise.
        bad_line_small = """printer\tresult\tvariant_name
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jacob
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jakob
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\t"Furststiftische Hofdruckerei, <Kempten""
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tGaller, Alois
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tHochfurstliche Buchhandlung <Kempten>"""
        self.assertRaises(Exception, self.read_table, StringIO(bad_line_small),
                          sep='\t')
        # Appending the closing quote makes the input parseable (the quoted
        # field swallows the two following physical lines -> 3 rows total).
        good_line_small = bad_line_small + '"'
        df = self.read_table(StringIO(good_line_small), sep='\t')
        self.assertEqual(len(df), 3)
    def test_non_string_na_values(self):
        # GH3611, na_values that are not a string are an issue
        with tm.ensure_clean('__non_string_na_values__.csv') as path:
            df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
            df.to_csv(path, sep=' ', index=False)
            # String and numeric na_values spellings must all match -999.
            result1 = read_csv(path, sep=' ', header=0,
                               na_values=['-999.0', '-999'])
            result2 = read_csv(path, sep=' ', header=0,
                               na_values=[-999, -999.0])
            result3 = read_csv(path, sep=' ', header=0,
                               na_values=[-999.0, -999])
            tm.assert_frame_equal(result1, result2)
            tm.assert_frame_equal(result2, result3)
            # Single-entry variants behave the same as the paired lists.
            result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0'])
            result5 = read_csv(path, sep=' ', header=0, na_values=['-999'])
            result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0])
            result7 = read_csv(path, sep=' ', header=0, na_values=[-999])
            tm.assert_frame_equal(result4, result3)
            tm.assert_frame_equal(result5, result3)
            tm.assert_frame_equal(result6, result3)
            tm.assert_frame_equal(result7, result3)
            good_compare = result3
            # with an odd float format, so we can't match the string 999.0 exactly,
            # but need float matching
            df.to_csv(path, sep=' ', index=False, float_format='%.3f')
            result1 = read_csv(path, sep=' ', header=0,
                               na_values=['-999.0', '-999'])
            result2 = read_csv(path, sep=' ', header=0,
                               na_values=[-999, -999.0])
            result3 = read_csv(path, sep=' ', header=0,
                               na_values=[-999.0, -999])
            tm.assert_frame_equal(result1, good_compare)
            tm.assert_frame_equal(result2, good_compare)
            tm.assert_frame_equal(result3, good_compare)
            result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0'])
            result5 = read_csv(path, sep=' ', header=0, na_values=['-999'])
            result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0])
            result7 = read_csv(path, sep=' ', header=0, na_values=[-999])
            tm.assert_frame_equal(result4, good_compare)
            tm.assert_frame_equal(result5, good_compare)
            tm.assert_frame_equal(result6, good_compare)
            tm.assert_frame_equal(result7, good_compare)
    def test_default_na_values(self):
        """Every token in parsers._NA_VALUES is recognised as NaN."""
        _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                          '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'NaN',
                          'nan', '-NaN', '-nan', '#N/A N/A', ''])
        assert_array_equal(_NA_VALUES, parsers._NA_VALUES)
        nv = len(_NA_VALUES)

        def f(i, v):
            # Build CSV row i with token v in column i and empty fields
            # elsewhere, so each NA token appears once on the diagonal.
            if i == 0:
                buf = ''
            elif i > 0:
                buf = ''.join([','] * i)
            buf = "{0}{1}".format(buf, v)
            if i < nv - 1:
                buf = "{0}{1}".format(buf, ''.join([','] * (nv - i - 1)))
            return buf
        data = StringIO('\n'.join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
        # An nv x nv frame of pure NaN is expected back.
        expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
        df = self.read_csv(data, header=None)
        tm.assert_frame_equal(df, expected)
  619. def test_custom_na_values(self):
  620. data = """A,B,C
  621. ignore,this,row
  622. 1,NA,3
  623. -1.#IND,5,baz
  624. 7,8,NaN
  625. """
  626. expected = [[1., nan, 3],
  627. [nan, 5, nan],
  628. [7, 8, nan]]
  629. df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
  630. tm.assert_almost_equal(df.values, expected)
  631. df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
  632. skiprows=[1])
  633. tm.assert_almost_equal(df2.values, expected)
  634. df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
  635. skiprows=[1])
  636. tm.assert_almost_equal(df3.values, expected)
    def test_nat_parse(self):
        # GH 3062: NaN/NaT values must round-trip through to_csv/read_csv
        # with parse_dates, keeping dtypes intact.
        df = DataFrame(dict({
            'A': np.asarray(lrange(10), dtype='float64'),
            'B': pd.Timestamp('20010101')}))
        df.iloc[3:6, :] = np.nan
        with tm.ensure_clean('__nat_parse_.csv') as path:
            df.to_csv(path)
            result = read_csv(path, index_col=0, parse_dates=['B'])
            tm.assert_frame_equal(result, df)
            expected = Series(dict(A='float64', B='datetime64[ns]'))
            tm.assert_series_equal(expected, result.dtypes)
            # test with NaT for the nan_rep
            # we don't have a method to specify the Datetime na_rep (it defaults to '')
            df.to_csv(path)
            result = read_csv(path, index_col=0, parse_dates=['B'])
            tm.assert_frame_equal(result, df)
    def test_skiprows_bug(self):
        # GH #505: skiprows as a list and as an int must behave identically.
        text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
        data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None,
                             index_col=0, parse_dates=True)
        data2 = self.read_csv(StringIO(text), skiprows=6, header=None,
                              index_col=0, parse_dates=True)
        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
                             columns=[1, 2, 3],
                             index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
                                    datetime(2000, 1, 3)])
        # index takes the positional column label (0) as its name
        expected.index.name = 0
        tm.assert_frame_equal(data, expected)
        tm.assert_frame_equal(data, data2)
  677. def test_deep_skiprows(self):
  678. # GH #4382
  679. text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in range(10)])
  680. condensed_text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]])
  681. data = self.read_csv(StringIO(text), skiprows=[6, 8])
  682. condensed_data = self.read_csv(StringIO(condensed_text))
  683. tm.assert_frame_equal(data, condensed_data)
  684. def test_detect_string_na(self):
  685. data = """A,B
  686. foo,bar
  687. NA,baz
  688. NaN,nan
  689. """
  690. expected = [['foo', 'bar'],
  691. [nan, 'baz'],
  692. [nan, nan]]
  693. df = self.read_csv(StringIO(data))
  694. tm.assert_almost_equal(df.values, expected)
  695. def test_unnamed_columns(self):
  696. data = """A,B,C,,
  697. 1,2,3,4,5
  698. 6,7,8,9,10
  699. 11,12,13,14,15
  700. """
  701. expected = [[1, 2, 3, 4, 5.],
  702. [6, 7, 8, 9, 10],
  703. [11, 12, 13, 14, 15]]
  704. df = self.read_table(StringIO(data), sep=',')
  705. tm.assert_almost_equal(df.values, expected)
  706. self.assert_numpy_array_equal(df.columns,
  707. ['A', 'B', 'C', 'Unnamed: 3',
  708. 'Unnamed: 4'])
  709. def test_string_nas(self):
  710. data = """A,B,C
  711. a,b,c
  712. d,,f
  713. ,g,h
  714. """
  715. result = self.read_csv(StringIO(data))
  716. expected = DataFrame([['a', 'b', 'c'],
  717. ['d', np.nan, 'f'],
  718. [np.nan, 'g', 'h']],
  719. columns=['A', 'B', 'C'])
  720. tm.assert_frame_equal(result, expected)
  721. def test_duplicate_columns(self):
  722. for engine in ['python', 'c']:
  723. data = """A,A,B,B,B
  724. 1,2,3,4,5
  725. 6,7,8,9,10
  726. 11,12,13,14,15
  727. """
  728. # check default beahviour
  729. df = self.read_table(StringIO(data), sep=',',engine=engine)
  730. self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
  731. df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=False)
  732. self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])
  733. df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=True)
  734. self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
  735. def test_csv_mixed_type(self):
  736. data = """A,B,C
  737. a,1,2
  738. b,3,4
  739. c,4,5
  740. """
  741. df = self.read_csv(StringIO(data))
  742. # TODO
  743. def test_csv_custom_parser(self):
  744. data = """A,B,C
  745. 20090101,a,1,2
  746. 20090102,b,3,4
  747. 20090103,c,4,5
  748. """
  749. f = lambda x: datetime.strptime(x, '%Y%m%d')
  750. df = self.read_csv(StringIO(data), date_parser=f)
  751. expected = self.read_csv(StringIO(data), parse_dates=True)
  752. tm.assert_frame_equal(df, expected)
  753. def test_parse_dates_implicit_first_col(self):
  754. data = """A,B,C
  755. 20090101,a,1,2
  756. 20090102,b,3,4
  757. 20090103,c,4,5
  758. """
  759. df = self.read_csv(StringIO(data), parse_dates=True)
  760. expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True)
  761. self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp))
  762. tm.assert_frame_equal(df, expected)
  763. def test_parse_dates_string(self):
  764. data = """date,A,B,C
  765. 20090101,a,1,2
  766. 20090102,b,3,4
  767. 20090103,c,4,5
  768. """
  769. rs = self.read_csv(
  770. StringIO(data), index_col='date', parse_dates='date')
  771. idx = date_range('1/1/2009', periods=3)
  772. idx.name = 'date'
  773. xp = DataFrame({'A': ['a', 'b', 'c'],
  774. 'B': [1, 3, 4],
  775. 'C': [2, 4, 5]}, idx)
  776. tm.assert_frame_equal(rs, xp)
def test_yy_format(self):
    """Two-digit-year date and time columns combined via parse_dates
    (by label or by position) yield a combined 'date_time' index."""
    data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""
    # combine by column label
    rs = self.read_csv(StringIO(data), index_col=0,
                       parse_dates=[['date', 'time']])
    idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                         datetime(2009, 2, 28, 10, 20, 0),
                         datetime(2009, 3, 31, 8, 30, 0)]).asobject
    idx.name = 'date_time'
    xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
    tm.assert_frame_equal(rs, xp)

    # same thing, combining by column position
    rs = self.read_csv(StringIO(data), index_col=0,
                       parse_dates=[[0, 1]])
    idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                         datetime(2009, 2, 28, 10, 20, 0),
                         datetime(2009, 3, 31, 8, 30, 0)]).asobject
    idx.name = 'date_time'
    xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
    tm.assert_frame_equal(rs, xp)
def test_parse_dates_column_list(self):
    """parse_dates accepts a list of column positions or names, here in
    combination with a 4-level index and dayfirst parsing."""
    from pandas.core.datetools import to_datetime

    data = '''date;destination;ventilationcode;unitcode;units;aux_date
01/01/2010;P;P;50;1;12/1/2011
01/01/2010;P;R;50;1;13/1/2011
15/01/2010;P;P;50;1;14/1/2011
01/05/2010;P;P;50;1;15/1/2011'''

    expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4))

    # build the expected frame by converting level 0 and aux_date by hand
    lev = expected.index.levels[0]
    levels = list(expected.index.levels)
    levels[0] = lev.to_datetime(dayfirst=True)
    # hack to get this to work - remove for final test
    levels[0].name = lev.name
    expected.index.set_levels(levels, inplace=True)
    expected['aux_date'] = to_datetime(expected['aux_date'],
                                       dayfirst=True)
    expected['aux_date'] = lmap(Timestamp, expected['aux_date'])
    tm.assert_isinstance(expected['aux_date'][0], datetime)

    # positional column list
    df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4),
                       parse_dates=[0, 5], dayfirst=True)
    tm.assert_frame_equal(df, expected)

    # named column list
    df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4),
                       parse_dates=['date', 'aux_date'], dayfirst=True)
    tm.assert_frame_equal(df, expected)
def test_no_header(self):
    """header=None produces integer column labels; names= and prefix=
    override them without changing the parsed values."""
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    df = self.read_table(StringIO(data), sep=',', header=None)
    df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                              header=None)
    names = ['foo', 'bar', 'baz', 'quux', 'panda']
    df2 = self.read_table(StringIO(data), sep=',', names=names)
    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    tm.assert_almost_equal(df.values, expected)
    tm.assert_almost_equal(df.values, df2.values)

    # prefix yields X0..X4, default yields 0..4, names yields the names
    self.assert_numpy_array_equal(df_pref.columns,
                                  ['X0', 'X1', 'X2', 'X3', 'X4'])
    self.assert_numpy_array_equal(df.columns, lrange(5))
    self.assert_numpy_array_equal(df2.columns, names)
def test_no_header_prefix(self):
    """A custom prefix= is applied to every auto-generated column name."""
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    df_pref = self.read_table(StringIO(data), sep=',', prefix='Field',
                              header=None)
    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    tm.assert_almost_equal(df_pref.values, expected)
    self.assert_numpy_array_equal(df_pref.columns,
                                  ['Field0', 'Field1', 'Field2', 'Field3', 'Field4'])
def test_header_with_index_col(self):
    """names= shorter than the row width implies a leading index column,
    and the names list passed in must not be mutated."""
    data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    names = ['A', 'B', 'C']
    df = self.read_csv(StringIO(data), names=names)
    self.assertEqual(names, ['A', 'B', 'C'])

    values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    expected = DataFrame(values, index=['foo', 'bar', 'baz'],
                         columns=['A', 'B', 'C'])
    tm.assert_frame_equal(df, expected)
def test_read_csv_dataframe(self):
    """read_csv and read_table(sep=',') agree on the csv1 fixture file.

    NOTE(review): self.csv1 is a path fixture set up outside this view
    (in setUp) -- assumed to point at a dated numeric CSV.
    """
    df = self.read_csv(self.csv1, index_col=0, parse_dates=True)
    df2 = self.read_table(self.csv1, sep=',', index_col=0,
                          parse_dates=True)
    self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D'])
    self.assertEqual(df.index.name, 'index')
    self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp))
    self.assertEqual(df.values.dtype, np.float64)
    tm.assert_frame_equal(df, df2)
def test_read_csv_no_index_name(self):
    """The csv2 fixture has no index header; parsing still produces a
    datetime index and float data columns."""
    df = self.read_csv(self.csv2, index_col=0, parse_dates=True)
    df2 = self.read_table(self.csv2, sep=',', index_col=0,
                          parse_dates=True)
    self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E'])
    self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp))
    self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64)
    tm.assert_frame_equal(df, df2)
def test_read_table_unicode(self):
    """utf-8 encoded bytes input decodes to text values."""
    fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
    df1 = read_table(fin, sep=";", encoding="utf-8", header=None)
    tm.assert_isinstance(df1[0].values[0], compat.text_type)
def test_read_table_wrong_num_columns(self):
    """A row with more fields than the header must raise."""
    # too few!
    data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
    self.assertRaises(Exception, self.read_csv, StringIO(data))
def test_read_table_duplicate_index(self):
    """Duplicate index values are allowed; index_col=0 is equivalent to
    a post-hoc set_index without integrity checking."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    result = self.read_csv(StringIO(data), index_col=0)
    expected = self.read_csv(StringIO(data)).set_index('index',
                                                       verify_integrity=False)
    tm.assert_frame_equal(result, expected)
  909. def test_read_table_duplicate_index_implicit(self):
  910. data = """A,B,C,D
  911. foo,2,3,4,5
  912. bar,7,8,9,10
  913. baz,12,13,14,15
  914. qux,12,13,14,15
  915. foo,12,13,14,15
  916. bar,12,13,14,15
  917. """
  918. # it works!
  919. result = self.read_csv(StringIO(data))
def test_parse_bools(self):
    """Default True/False/TRUE/FALSE spellings and user-supplied
    true_values/false_values all produce bool columns."""
    data = """A,B
True,1
False,2
True,3
"""
    data = self.read_csv(StringIO(data))
    self.assertEqual(data['A'].dtype, np.bool_)

    # custom truth spellings
    data = """A,B
YES,1
no,2
yes,3
No,3
Yes,3
"""
    data = self.read_csv(StringIO(data),
                         true_values=['yes', 'Yes', 'YES'],
                         false_values=['no', 'NO', 'No'])
    self.assertEqual(data['A'].dtype, np.bool_)

    # upper-case default spellings
    data = """A,B
TRUE,1
FALSE,2
TRUE,3
"""
    data = self.read_csv(StringIO(data))
    self.assertEqual(data['A'].dtype, np.bool_)

    # arbitrary strings mapped to booleans
    data = """A,B
foo,bar
bar,foo"""
    result = self.read_csv(StringIO(data), true_values=['foo'],
                           false_values=['bar'])
    expected = DataFrame({'A': [True, False], 'B': [False, True]})
    tm.assert_frame_equal(result, expected)
  953. def test_int_conversion(self):
  954. data = """A,B
  955. 1.0,1
  956. 2.0,2
  957. 3.0,3
  958. """
  959. data = self.read_csv(StringIO(data))
  960. self.assertEqual(data['A'].dtype, np.float64)
  961. self.assertEqual(data['B'].dtype, np.int64)
  962. def test_infer_index_col(self):
  963. data = """A,B,C
  964. foo,1,2,3
  965. bar,4,5,6
  966. baz,7,8,9
  967. """
  968. data = self.read_csv(StringIO(data))
  969. self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz'])))
  970. def test_read_nrows(self):
  971. df = self.read_csv(StringIO(self.data1), nrows=3)
  972. expected = self.read_csv(StringIO(self.data1))[:3]
  973. tm.assert_frame_equal(df, expected)
def test_read_chunksize(self):
    """chunksize=2 yields an iterator whose chunks concatenate to the
    full frame."""
    reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
    df = self.read_csv(StringIO(self.data1), index_col=0)

    chunks = list(reader)

    tm.assert_frame_equal(chunks[0], df[:2])
    tm.assert_frame_equal(chunks[1], df[2:4])
    tm.assert_frame_equal(chunks[2], df[4:])
def test_read_chunksize_named(self):
    """chunksize works identically when index_col is given by label."""
    reader = self.read_csv(
        StringIO(self.data1), index_col='index', chunksize=2)
    df = self.read_csv(StringIO(self.data1), index_col='index')

    chunks = list(reader)

    tm.assert_frame_equal(chunks[0], df[:2])
    tm.assert_frame_equal(chunks[1], df[2:4])
    tm.assert_frame_equal(chunks[2], df[4:])
def test_get_chunk_passed_chunksize(self):
    """get_chunk() with no argument uses the chunksize passed at
    construction."""
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""
    result = self.read_csv(StringIO(data), chunksize=2)

    piece = result.get_chunk()
    self.assertEqual(len(piece), 2)
def test_read_text_list(self):
    """TextParser over a pre-split list of rows matches read_csv on the
    equivalent text."""
    data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
    as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',
               '4', '5', '6']]
    df = self.read_csv(StringIO(data), index_col=0)

    parser = TextParser(as_list, index_col=0, chunksize=2)
    chunk = parser.read(None)

    tm.assert_frame_equal(chunk, df)
def test_iterator(self):
    """iterator=True returns a TextFileReader supporting read(n); also
    exercises TextParser chunking, skiprows, and the skip_footer
    incompatibility with iteration."""
    # GH 6607
    # Test currently only valid with python engine because
    # skip_footer != 0. Temporarily copied to TestPythonParser.
    # Test for ValueError with other engines:
    with tm.assertRaisesRegexp(ValueError, 'skip_footer'):
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True)
        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.read(3)
        tm.assert_frame_equal(chunk, df[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[1:3])

        # test bad parameter (skip_footer) -- this is what raises the
        # ValueError the surrounding context manager expects
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True, skip_footer=True)
        self.assertRaises(ValueError, reader.read, 3)

        treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                                  iterator=True)
        tm.assert_isinstance(treader, TextFileReader)

        # stopping iteration when on chunksize is specified, GH 3967
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        reader = self.read_csv(StringIO(data), iterator=True)
        result = list(reader)
        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
        tm.assert_frame_equal(result[0], expected)

        # chunksize = 1
        reader = self.read_csv(StringIO(data), chunksize=1)
        result = list(reader)
        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
        self.assertEqual(len(result), 3)
        tm.assert_frame_equal(pd.concat(result), expected)
def test_header_not_first_line(self):
    """header=2 skips leading junk lines and equals parsing the clean
    file with header=0."""
    data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    df = self.read_csv(StringIO(data), header=2, index_col=0)
    expected = self.read_csv(StringIO(data2), header=0, index_col=0)
    tm.assert_frame_equal(df, expected)
def test_header_multi_index(self):
    """A list of header rows builds a 4-level column MultiIndex;
    incompatible options (as_recarray, names, usecols, label-based
    index_col) must raise ValueError."""
    expected = tm.makeCustomDataframe(5,3,r_idx_nlevels=2,c_idx_nlevels=4)

    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""

    df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
    tm.assert_frame_equal(df, expected)

    # skipping lines in the header
    df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
    tm.assert_frame_equal(df, expected)

    #### invalid options ####

    # no as_recarray
    self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                      index_col=[0,1], as_recarray=True, tupleize_cols=False)

    # names
    self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                      index_col=[0,1], names=['foo','bar'], tupleize_cols=False)

    # usecols
    self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                      index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False)

    # non-numeric index_col
    self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                      index_col=['foo','bar'], tupleize_cols=False)
def test_header_multiindex_common_format(self):
    """Two header rows in the common to_csv layout round-trip to a
    2-level column MultiIndex, including several malformed variants."""
    df = DataFrame([[1,2,3,4,5,6],[7,8,9,10,11,12]],
                   index=['one','two'],
                   columns=MultiIndex.from_tuples([('a','q'),('a','r'),('a','s'),
                                                   ('b','t'),('c','u'),('c','v')]))

    # to_csv layout: blank separator row after the header rows
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
    tm.assert_frame_equal(df,result)

    # common
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
    tm.assert_frame_equal(df,result)

    # common, no index_col
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data),header=[0,1],index_col=None)
    tm.assert_frame_equal(df.reset_index(drop=True),result)

    # malformed case 1: first data column consumed as the index
    expected = DataFrame(np.array([[2, 3, 4, 5, 6],
                                   [8, 9, 10, 11, 12]], dtype='int64'),
                         index=Index([1, 7]),
                         columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]],
                                            labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                                            names=[u('a'), u('q')]))

    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
    tm.assert_frame_equal(expected,result)

    # malformed case 2: leading blank in the first header row
    expected = DataFrame(np.array([[2, 3, 4, 5, 6],
                                   [8, 9, 10, 11, 12]], dtype='int64'),
                         index=Index([1, 7]),
                         columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]],
                                            labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                                            names=[None, u('q')]))

    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
    tm.assert_frame_equal(expected,result)

    # mi on columns and index (malformed)
    expected = DataFrame(np.array([[ 3, 4, 5, 6],
                                   [ 9, 10, 11, 12]], dtype='int64'),
                         index=MultiIndex(levels=[[1, 7], [2, 8]],
                                          labels=[[0, 1], [0, 1]]),
                         columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('s'), u('t'), u('u'), u('v')]],
                                            labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
                                            names=[None, u('q')]))

    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = self.read_csv(StringIO(data),header=[0,1],index_col=[0, 1])
    tm.assert_frame_equal(expected,result)
def test_pass_names_with_index(self):
    """names= on headerless data reproduces the header-ful parse, for
    both a single and a two-level index."""
    lines = self.data1.split('\n')
    no_header = '\n'.join(lines[1:])

    # regular index
    names = ['index', 'A', 'B', 'C', 'D']
    df = self.read_csv(StringIO(no_header), index_col=0, names=names)
    expected = self.read_csv(StringIO(self.data1), index_col=0)
    tm.assert_frame_equal(df, expected)

    # multi index
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    lines = data.split('\n')
    no_header = '\n'.join(lines[1:])
    names = ['index1', 'index2', 'A', 'B', 'C', 'D']
    df = self.read_csv(StringIO(no_header), index_col=[0, 1],
                       names=names)
    expected = self.read_csv(StringIO(data), index_col=[0, 1])
    tm.assert_frame_equal(df, expected)

    # index columns can also be selected by label
    df = self.read_csv(StringIO(data), index_col=['index1', 'index2'])
    tm.assert_frame_equal(df, expected)
def test_multi_index_no_level_names(self):
    """A two-level index parsed from headerless data (or inferred from
    extra leading fields) matches the named parse, ignoring level names."""
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    data2 = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    lines = data.split('\n')
    no_header = '\n'.join(lines[1:])
    names = ['A', 'B', 'C', 'D']

    df = self.read_csv(StringIO(no_header), index_col=[0, 1],
                       header=None, names=names)
    expected = self.read_csv(StringIO(data), index_col=[0, 1])
    tm.assert_frame_equal(df, expected, check_names=False)

    # 2 implicit first cols
    df2 = self.read_csv(StringIO(data2))
    tm.assert_frame_equal(df2, df)

    # reverse order of index
    df = self.read_csv(StringIO(no_header), index_col=[1, 0], names=names,
                       header=None)
    expected = self.read_csv(StringIO(data), index_col=[1, 0])
    tm.assert_frame_equal(df, expected, check_names=False)
def test_multi_index_parse_dates(self):
    """parse_dates=True converts the date level of a MultiIndex, in
    whichever position it appears."""
    data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True)
    self.assertIsInstance(df.index.levels[0][0],
                          (datetime, np.datetime64, Timestamp))

    # specify columns out of order!
    df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True)
    self.assertIsInstance(df2.index.levels[1][0],
                          (datetime, np.datetime64, Timestamp))
def test_skip_footer(self):
    """skip_footer (and its skipfooter alias) trims trailing junk lines;
    engines other than python raise ValueError for it."""
    # GH 6607
    # Test currently only valid with python engine because
    # skip_footer != 0. Temporarily copied to TestPythonParser.
    # Test for ValueError with other engines:
    with tm.assertRaisesRegexp(ValueError, 'skip_footer'):
        data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
        result = self.read_csv(StringIO(data), skip_footer=2)
        no_footer = '\n'.join(data.split('\n')[:-3])
        expected = self.read_csv(StringIO(no_footer))

        tm.assert_frame_equal(result, expected)

        result = self.read_csv(StringIO(data), nrows=3)
        tm.assert_frame_equal(result, expected)

        # skipfooter alias
        result = read_csv(StringIO(data), skipfooter=2)
        no_footer = '\n'.join(data.split('\n')[:-3])
        expected = read_csv(StringIO(no_footer))

        tm.assert_frame_equal(result, expected)
  1267. def test_no_unnamed_index(self):
  1268. data = """ id c0 c1 c2
  1269. 0 1 0 a b
  1270. 1 2 0 c d
  1271. 2 2 2 e f
  1272. """
  1273. df = self.read_table(StringIO(data), sep=' ')
  1274. self.assertIsNone(df.index.name)
def test_converters(self):
    """converters keyed by column label or position transform values
    identically to mapping the same function afterwards."""
    data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
    from pandas.compat import parse_date

    result = self.read_csv(StringIO(data), converters={'D': parse_date})
    result2 = self.read_csv(StringIO(data), converters={3: parse_date})

    expected = self.read_csv(StringIO(data))
    expected['D'] = expected['D'].map(parse_date)

    tm.assert_isinstance(result['D'][0], (datetime, Timestamp))
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(result2, expected)

    # produce integer
    converter = lambda x: int(x.split('/')[2])
    result = self.read_csv(StringIO(data), converters={'D': converter})
    expected = self.read_csv(StringIO(data))
    expected['D'] = expected['D'].map(converter)
    tm.assert_frame_equal(result, expected)
  1295. def test_converters_no_implicit_conv(self):
  1296. # GH2184
  1297. data = """000102,1.2,A\n001245,2,B"""
  1298. f = lambda x: x.strip()
  1299. converter = {0: f}
  1300. df = self.read_csv(StringIO(data), header=None, converters=converter)
  1301. self.assertEqual(df[0].dtype, object)
def test_converters_euro_decimal_format(self):
    """Comma-decimal columns become float via a replace-and-cast
    converter."""
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    f = lambda x: float(x.replace(",", "."))
    converter = {'Number1': f, 'Number2': f, 'Number3': f}
    df2 = self.read_csv(StringIO(data), sep=';', converters=converter)
    self.assertEqual(df2['Number1'].dtype, float)
    self.assertEqual(df2['Number2'].dtype, float)
    self.assertEqual(df2['Number3'].dtype, float)
def test_converter_return_string_bug(self):
    """GH #583: converter output must not be re-stringified."""
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    f = lambda x: float(x.replace(",", "."))
    converter = {'Number1': f, 'Number2': f, 'Number3': f}
    df2 = self.read_csv(StringIO(data), sep=';', converters=converter)
    self.assertEqual(df2['Number1'].dtype, float)
def test_read_table_buglet_4x_multiindex(self):
    """Whitespace-delimited data with a 4-level row index parses with
    the python engine; the C parser currently raises CParserError."""
    # GH 6607
    # Parsing multi-level index currently causes an error in the C parser.
    # Temporarily copied to TestPythonParser.
    # Here test that CParserError is raised:
    with tm.assertRaises(CParserError):
        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

        # it works!
        df = self.read_table(StringIO(text), sep='\s+')
        self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
def test_line_comment(self):
    """comment='#' strips whole comment lines and trailing in-line
    comments."""
    data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
    expected = [[1., 2., 4.],
                [5., np.nan, 10.]]
    df = self.read_csv(StringIO(data), comment='#')
    tm.assert_almost_equal(df.values, expected)
def test_comment_skiprows(self):
    """skiprows counts comment lines too when combined with comment=."""
    data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    expected = [[1., 2., 4.],
                [5., np.nan, 10.]]
    # this should ignore the first four lines (including comments)
    df = self.read_csv(StringIO(data), comment='#', skiprows=4)
    tm.assert_almost_equal(df.values, expected)
def test_comment_header(self):
    """header= counts only non-comment lines when comment= is given."""
    data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    expected = [[1., 2., 4.],
                [5., np.nan, 10.]]
    # header should begin at the second non-comment line
    df = self.read_csv(StringIO(data), comment='#', header=1)
    tm.assert_almost_equal(df.values, expected)
def test_comment_skiprows_header(self):
    """comment=, skiprows= and header= compose: skiprows is applied
    first (comments included), then header counts non-comment lines."""
    data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    expected = [[1., 2., 4.],
                [5., np.nan, 10.]]
    # skiprows should skip the first 4 lines (including comments), while
    # header should start from the second non-commented line starting
    # with line 5
    df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
    tm.assert_almost_equal(df.values, expected)
  1392. def test_read_csv_parse_simple_list(self):
  1393. text = """foo
  1394. bar baz
  1395. qux foo
  1396. foo
  1397. bar"""
  1398. df = read_csv(StringIO(text), header=None)
  1399. expected = DataFrame({0: ['foo', 'bar baz', 'qux foo',
  1400. 'foo', 'bar']})
  1401. tm.assert_frame_equal(df, expected)
def test_parse_dates_custom_euroformat(self):
    """A dayfirst date_parser parses dd/mm dates; a parser raising on a
    bad keyword propagates as an error."""
    text = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    parser = lambda d: parse_date(d, dayfirst=True)
    df = self.read_csv(StringIO(text),
                       names=['time', 'Q', 'NTU'], header=0,
                       index_col=0, parse_dates=True,
                       date_parser=parser, na_values=['NA'])

    exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
                       datetime(2010, 2, 2)], name='time')
    expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]},
                         index=exp_index, columns=['Q', 'NTU'])
    tm.assert_frame_equal(df, expected)

    # 'day_first' is an invalid keyword, so the parser itself raises
    parser = lambda d: parse_date(d, day_first=True)
    self.assertRaises(Exception, self.read_csv,
                      StringIO(text), skiprows=[0],
                      names=['time', 'Q', 'NTU'], index_col=0,
                      parse_dates=True, date_parser=parser,
                      na_values=['NA'])
def test_na_value_dict(self):
    """na_values given as a per-column dict applies only to those
    columns; an empty dict disables the defaults for no column."""
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""

    df = self.read_csv(StringIO(data),
                       na_values={'A': ['foo'], 'B': ['bar']})
    expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
                          'B': [np.nan, 'foo', np.nan, 'foo'],
                          'C': [np.nan, 'foo', np.nan, 'foo']})
    tm.assert_frame_equal(df, expected)

    data = """\
a,b,c,d
0,NA,1,5
"""
    xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
    xp.index.name = 'a'
    df = self.read_csv(StringIO(data), na_values={}, index_col=0)
    tm.assert_frame_equal(df, xp)

    xp = DataFrame({'b': [np.nan], 'd': [5]},
                   MultiIndex.from_tuples([(0, 1)]))
    xp.index.names = ['a', 'c']
    df = self.read_csv(StringIO(data), na_values={}, index_col=[0, 2])
    tm.assert_frame_equal(df, xp)

    xp = DataFrame({'b': [np.nan], 'd': [5]},
                   MultiIndex.from_tuples([(0, 1)]))
    xp.index.names = ['a', 'c']
    df = self.read_csv(StringIO(data), na_values={}, index_col=['a', 'c'])
    tm.assert_frame_equal(df, xp)
@tm.network
def test_url(self):
    """read_table over HTTP(S) matches reading the same local file."""
    # HTTP(S)
    url = ('https://raw.github.com/pydata/pandas/master/'
           'pandas/io/tests/data/salary.table')
    url_table = self.read_table(url)
    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, 'salary.table')
    local_table = self.read_table(localtable)
    tm.assert_frame_equal(url_table, local_table)
    # TODO: ftp testing
@slow
def test_file(self):
    """read_table accepts file:// URLs and matches a direct local read;
    skipped where the URL scheme is unsupported."""
    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")
    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, 'salary.table')
    local_table = self.read_table(localtable)

    try:
        url_table = self.read_table('file://localhost/' + localtable)
    except URLError:
        # fails on some systems
        raise nose.SkipTest("failing on %s" %
                            ' '.join(platform.uname()).strip())

    tm.assert_frame_equal(url_table, local_table)
def test_parse_tz_aware(self):
    """ISO8601 timestamps with a 'Z' suffix parse to a UTC-aware index
    (#1693); tolerate implementations returning naive datetimes."""
    import pytz
    # #1693
    data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5")

    # it works
    result = read_csv(data, index_col=0, parse_dates=True)
    stamp = result.index[0]
    self.assertEqual(stamp.minute, 39)
    try:
        self.assertIs(result.index.tz, pytz.utc)
    except AssertionError:  # hello Yaroslav
        arr = result.index.to_pydatetime()
        result = tools.to_datetime(arr, utc=True)[0]
        self.assertEqual(stamp.minute, result.minute)
        self.assertEqual(stamp.hour, result.hour)
        self.assertEqual(stamp.day, result.day)
def test_multiple_date_cols_index(self):
    """A combined date column built via parse_dates={name: [cols]} can
    serve as the index, addressed by name or by position."""
    data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

    xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
    df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
                       index_col='nominal')
    tm.assert_frame_equal(xp.set_index('nominal'), df)

    df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
                        index_col=0)
    tm.assert_frame_equal(df2, df)

    df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0)
    tm.assert_frame_equal(df3, df, check_names=False)
def test_multiple_date_cols_chunked(self):
    """Combined date columns survive chunked reading; the source columns
    are consumed, not kept."""
    df = self.read_csv(StringIO(self.ts_data), parse_dates={
        'nominal': [1, 2]}, index_col='nominal')
    reader = self.read_csv(StringIO(self.ts_data), parse_dates={'nominal':
                           [1, 2]}, index_col='nominal', chunksize=2)

    chunks = list(reader)

    self.assertNotIn('nominalTime', df)

    tm.assert_frame_equal(chunks[0], df[:2])
    tm.assert_frame_equal(chunks[1], df[2:4])
    tm.assert_frame_equal(chunks[2], df[4:])
def test_multiple_date_col_named_components(self):
    """Date components may be named by label rather than position in
    the parse_dates dict."""
    xp = self.read_csv(StringIO(self.ts_data),
                       parse_dates={'nominal': [1, 2]},
                       index_col='nominal')
    colspec = {'nominal': ['date', 'nominalTime']}
    df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec,
                       index_col='nominal')
    tm.assert_frame_equal(df, xp)
def test_multiple_date_col_multiple_index(self):
    """A combined date column can participate in a MultiIndex."""
    df = self.read_csv(StringIO(self.ts_data),
                       parse_dates={'nominal': [1, 2]},
                       index_col=['nominal', 'ID'])
    xp = self.read_csv(StringIO(self.ts_data),
                       parse_dates={'nominal': [1, 2]})
    tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df)
def test_comment(self):
    """Trailing in-line comments are stripped by both read_csv and
    read_table."""
    data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    expected = [[1., 2., 4.],
                [5., np.nan, 10.]]
    df = self.read_csv(StringIO(data), comment='#')
    tm.assert_almost_equal(df.values, expected)

    df = self.read_table(StringIO(data), sep=',', comment='#',
                         na_values=['NaN'])
    tm.assert_almost_equal(df.values, expected)
def test_bool_na_values(self):
    """Boolean columns containing NA fall back to object dtype; a
    complete boolean column keeps bool dtype."""
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""

    result = self.read_csv(StringIO(data))
    expected = DataFrame({'A': np.array([True, nan, False], dtype=object),
                          'B': np.array([False, True, nan], dtype=object),
                          'C': [True, False, True]})

    tm.assert_frame_equal(result, expected)
  1561. def test_nonexistent_path(self):
  1562. # don't segfault pls #2428
  1563. path = '%s.csv' % tm.rands(10)
  1564. self.assertRaises(Exception, self.read_csv, path)
  1565. def test_missing_trailing_delimiters(self):
  1566. data = """A,B,C,D
  1567. 1,2,3,4
  1568. 1,3,3,
  1569. 1,4,5"""
  1570. result = self.read_csv(StringIO(data))
  1571. self.assertTrue(result['D'].isnull()[1:].all())
    def test_skipinitialspace(self):
        # With skipinitialspace=True the blank after each delimiter is
        # stripped, so ' -9999.0' still matches the na_values entry.
        s = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
             '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, '
             '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, '
             '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, '
             '0.212036, 14.7674, 41.605, -9999.0, -9999.0, '
             '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128')

        sfile = StringIO(s)
        # it's 33 columns
        result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'],
                               header=None, skipinitialspace=True)
        self.assertTrue(pd.isnull(result.ix[0, 29]))
    def test_utf16_bom_skiprows(self):
        # #2298: skiprows must be applied after BOM/encoding handling for
        # the utf-16 variants, matching a utf-8 read of the same data.
        data = u("""skip this
skip this too
A\tB\tC
1\t2\t3
4\t5\t6""")

        data2 = u("""skip this
skip this too
A,B,C
1,2,3
4,5,6""")

        path = '__%s__.csv' % tm.rands(10)

        with tm.ensure_clean(path) as path:
            for sep, dat in [('\t', data), (',', data2)]:
                for enc in ['utf-16', 'utf-16le', 'utf-16be']:
                    bytes = dat.encode(enc)
                    with open(path, 'wb') as f:
                        f.write(bytes)

                    s = BytesIO(dat.encode('utf-8'))
                    if compat.PY3:
                        # somewhat False since the code never sees bytes
                        from io import TextIOWrapper
                        s = TextIOWrapper(s, encoding='utf-8')

                    result = self.read_csv(path, encoding=enc, skiprows=2,
                                           sep=sep)
                    expected = self.read_csv(s, encoding='utf-8', skiprows=2,
                                             sep=sep)

                    tm.assert_frame_equal(result, expected)
  1613. def test_utf16_example(self):
  1614. path = tm.get_data_path('utf16_ex.txt')
  1615. # it works! and is the right length
  1616. result = self.read_table(path, encoding='utf-16')
  1617. self.assertEqual(len(result), 50)
  1618. if not compat.PY3:
  1619. buf = BytesIO(open(path, 'rb').read())
  1620. result = self.read_table(buf, encoding='utf-16')
  1621. self.assertEqual(len(result), 50)
    def test_converters_corner_with_nas(self):
        # Converters that return NaN must interact correctly with na_values.
        # skip aberration observed on Win64 Python 3.2.2
        if hash(np.int64(-1)) != -2:
            raise nose.SkipTest("skipping because of windows hash on Python"
                                " 3.2.2")
        csv = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

        def convert_days(x):
            # '' -> NaN; a trailing '+' means "at least", so bump by one
            x = x.strip()
            if not x:
                return np.nan

            is_plus = x.endswith('+')
            if is_plus:
                x = int(x[:-1]) + 1
            else:
                x = int(x)
            return x

        def convert_days_sentinel(x):
            # NOTE(review): identical logic to convert_days; presumably kept
            # as a distinct function object so the two parses below can be
            # compared -- confirm before deduplicating.
            x = x.strip()
            if not x:
                return np.nan

            is_plus = x.endswith('+')
            if is_plus:
                x = int(x[:-1]) + 1
            else:
                x = int(x)
            return x

        def convert_score(x):
            # '' -> NaN; 'lo-hi' ranges collapse to their midpoint
            x = x.strip()
            if not x:
                return np.nan
            if x.find('-') > 0:
                valmin, valmax = lmap(int, x.split('-'))
                val = 0.5 * (valmin + valmax)
            else:
                val = float(x)
            return val

        fh = StringIO(csv)
        result = self.read_csv(fh, converters={'score': convert_score,
                                               'days': convert_days},
                               na_values=['', None])
        self.assertTrue(pd.isnull(result['days'][1]))

        fh = StringIO(csv)
        result2 = self.read_csv(fh, converters={'score': convert_score,
                                                'days': convert_days_sentinel},
                                na_values=['', None])
        tm.assert_frame_equal(result, result2)
  1672. def test_unicode_encoding(self):
  1673. pth = tm.get_data_path('unicode_series.csv')
  1674. result = self.read_csv(pth, header=None, encoding='latin-1')
  1675. result = result.set_index(0)
  1676. got = result[1][1632]
  1677. expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)')
  1678. self.assertEqual(got, expected)
  1679. def test_trailing_delimiters(self):
  1680. # #2442. grumble grumble
  1681. data = """A,B,C
  1682. 1,2,3,
  1683. 4,5,6,
  1684. 7,8,9,"""
  1685. result = self.read_csv(StringIO(data), index_col=False)
  1686. expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8],
  1687. 'C': [3, 6, 9]})
  1688. tm.assert_frame_equal(result, expected)
    def test_escapechar(self):
        # Escaped quotes inside a quoted field must be unescaped in the
        # parsed value and must not break column detection.
        # http://stackoverflow.com/questions/13824840/feature-request-for-
        # pandas-read-csv
        data = '''SEARCH_TERM,ACTUAL_URL
"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''

        result = self.read_csv(StringIO(data), escapechar='\\',
                               quotechar='"', encoding='utf-8')
        self.assertEqual(result['SEARCH_TERM'][2],
                         'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie')
        self.assertTrue(np.array_equal(result.columns,
                                       ['SEARCH_TERM', 'ACTUAL_URL']))
  1702. def test_header_names_backward_compat(self):
  1703. # #2539
  1704. data = '1,2,3\n4,5,6'
  1705. result = self.read_csv(StringIO(data), names=['a', 'b', 'c'])
  1706. expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
  1707. header=None)
  1708. tm.assert_frame_equal(result, expected)
  1709. data2 = 'foo,bar,baz\n' + data
  1710. result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'],
  1711. header=0)
  1712. tm.assert_frame_equal(result, expected)
  1713. def test_int64_min_issues(self):
  1714. # #2599
  1715. data = 'A,B\n0,0\n0,'
  1716. result = self.read_csv(StringIO(data))
  1717. expected = DataFrame({'A': [0, 0], 'B': [0, np.nan]})
  1718. tm.assert_frame_equal(result, expected)
  1719. def test_parse_integers_above_fp_precision(self):
  1720. data = """Numbers
  1721. 17007000002000191
  1722. 17007000002000191
  1723. 17007000002000191
  1724. 17007000002000191
  1725. 17007000002000192
  1726. 17007000002000192
  1727. 17007000002000192
  1728. 17007000002000192
  1729. 17007000002000192
  1730. 17007000002000194"""
  1731. result = self.read_csv(StringIO(data))
  1732. expected = DataFrame({'Numbers': [17007000002000191,
  1733. 17007000002000191,
  1734. 17007000002000191,
  1735. 17007000002000191,
  1736. 17007000002000192,
  1737. 17007000002000192,
  1738. 17007000002000192,
  1739. 17007000002000192,
  1740. 17007000002000192,
  1741. 17007000002000194]})
  1742. self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers']))
    def test_usecols_index_col_conflict(self):
        # Issue 4201  Test that index_col as integer reflects usecols
        data = """SecId,Time,Price,P2,P3
10000,2013-5-11,100,10,1
500,2013-5-12,101,11,1
"""
        expected = DataFrame({'Price': [100, 101]}, index=[datetime(2013, 5, 11), datetime(2013, 5, 12)])
        expected.index.name = 'Time'

        # index_col may name the column or index it relative to usecols
        df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0)
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0)
        tm.assert_frame_equal(expected, df)

        # a multi-column index drawn from the usecols subset
        expected = DataFrame({'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)})
        expected = expected.set_index(['Price', 'P2'])
        df = self.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2'])
        tm.assert_frame_equal(expected, df)
    def test_chunks_have_consistent_numerical_type(self):
        # A float appearing deep inside the file must coerce the whole
        # column to float64 without emitting a DtypeWarning.
        integers = [str(i) for i in range(499999)]
        data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

        with tm.assert_produces_warning(False):
            df = self.read_csv(StringIO(data))
        self.assertTrue(type(df.a[0]) is np.float64)  # Assert that types were coerced.
        self.assertEqual(df.a.dtype, np.float)
    def test_warn_if_chunks_have_mismatched_type(self):
        # Mixed ints and strings fall back to object dtype; the python
        # engine does so silently. See test in TestCParserLowMemory.
        integers = [str(i) for i in range(499999)]
        data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)

        with tm.assert_produces_warning(False):
            df = self.read_csv(StringIO(data))
        self.assertEqual(df.a.dtype, np.object)
    def test_usecols(self):
        # usecols accepts positions or names and must compose with
        # header/names in every combination.
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        result = self.read_csv(StringIO(data), usecols=(1, 2))
        result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
        exp = self.read_csv(StringIO(data))

        self.assertEqual(len(result.columns), 2)
        self.assertTrue((result['b'] == exp['b']).all())
        self.assertTrue((result['c'] == exp['c']).all())

        tm.assert_frame_equal(result, result2)

        result = self.read_csv(StringIO(data), usecols=[1, 2], header=0,
                               names=['foo', 'bar'])
        expected = self.read_csv(StringIO(data), usecols=[1, 2])
        expected.columns = ['foo', 'bar']
        tm.assert_frame_equal(result, expected)

        data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
        result = self.read_csv(StringIO(data), names=['b', 'c'],
                               header=None, usecols=[1, 2])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['b', 'c']]
        tm.assert_frame_equal(result, expected)

        result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                header=None, usecols=['b', 'c'])
        tm.assert_frame_equal(result2, result)

        # 5766
        result = self.read_csv(StringIO(data), names=['a', 'b'],
                               header=None, usecols=[0, 1])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['a', 'b']]
        tm.assert_frame_equal(result, expected)

        # length conflict, passed names and usecols disagree
        self.assertRaises(ValueError, self.read_csv, StringIO(data),
                          names=['a', 'b'], usecols=[1], header=None)
  1820. def test_integer_overflow_bug(self):
  1821. # #2601
  1822. data = "65248E10 11\n55555E55 22\n"
  1823. result = self.read_csv(StringIO(data), header=None, sep=' ')
  1824. self.assertTrue(result[0].dtype == np.float64)
  1825. result = self.read_csv(StringIO(data), header=None, sep='\s+')
  1826. self.assertTrue(result[0].dtype == np.float64)
  1827. def test_catch_too_many_names(self):
  1828. # Issue 5156
  1829. data = """\
  1830. 1,2,3
  1831. 4,,6
  1832. 7,8,9
  1833. 10,11,12\n"""
  1834. tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd'])
  1835. def test_ignore_leading_whitespace(self):
  1836. # GH 6607, GH 3374
  1837. data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9'
  1838. result = self.read_table(StringIO(data), sep='\s+')
  1839. expected = DataFrame({'a':[1,4,7], 'b':[2,5,8], 'c': [3,6,9]})
  1840. tm.assert_frame_equal(result, expected)
  1841. def test_nrows_and_chunksize_raises_notimplemented(self):
  1842. data = 'a b c'
  1843. self.assertRaises(NotImplementedError, self.read_csv, StringIO(data),
  1844. nrows=10, chunksize=5)
class TestPythonParser(ParserTests, tm.TestCase):
    # Runs the shared ParserTests suite with engine='python', plus tests for
    # python-engine-only behavior (sep sniffing, regex seps, fwf, ...).

    def test_negative_skipfooter_raises(self):
        # a negative skipfooter must be rejected up front
        text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""

        with tm.assertRaisesRegexp(ValueError,
                                   'skip footer cannot be negative'):
            df = self.read_csv(StringIO(text), skipfooter=-1)
  1860. def read_csv(self, *args, **kwds):
  1861. kwds = kwds.copy()
  1862. kwds['engine'] = 'python'
  1863. return read_csv(*args, **kwds)
  1864. def read_table(self, *args, **kwds):
  1865. kwds = kwds.copy()
  1866. kwds['engine'] = 'python'
  1867. return read_table(*args, **kwds)
    def test_sniff_delimiter(self):
        # sep=None asks csv.Sniffer to detect the delimiter, which must
        # also work after skiprows and on decoded byte streams.
        text = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
        data = self.read_csv(StringIO(text), index_col=0, sep=None)
        self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz'])))

        data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|')
        tm.assert_frame_equal(data, data2)

        text = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
        data3 = self.read_csv(StringIO(text), index_col=0,
                              sep=None, skiprows=2)
        tm.assert_frame_equal(data, data3)

        text = u("""ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
""").encode('utf-8')

        s = BytesIO(text)
        if compat.PY3:
            # somewhat False since the code never sees bytes
            from io import TextIOWrapper
            s = TextIOWrapper(s, encoding='utf-8')

        data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2,
                              encoding='utf-8')
        tm.assert_frame_equal(data, data4)
  1903. def test_regex_separator(self):
  1904. data = """ A B C D
  1905. a 1 2 3 4
  1906. b 1 2 3 4
  1907. c 1 2 3 4
  1908. """
  1909. df = self.read_table(StringIO(data), sep='\s+')
  1910. expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)),
  1911. index_col=0)
  1912. self.assertIsNone(expected.index.name)
  1913. tm.assert_frame_equal(df, expected)
    def test_1000_fwf(self):
        # The thousands separator must be honored by the fixed-width reader.
        # NOTE(review): column alignment inside this literal may have been
        # collapsed by reformatting -- verify against the original fixture.
        data = """
1 2,334.0 5
10 13 10.
"""
        expected = [[1, 2334., 5],
                    [10, 13, 10]]
        df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
                      thousands=',')
        tm.assert_almost_equal(df.values, expected)
  1924. def test_1000_sep_with_decimal(self):
  1925. data = """A|B|C
  1926. 1|2,334.01|5
  1927. 10|13|10.
  1928. """
  1929. expected = DataFrame({
  1930. 'A': [1, 10],
  1931. 'B': [2334.01, 13],
  1932. 'C': [5, 10.]
  1933. })
  1934. df = self.read_csv(StringIO(data), sep='|', thousands=',')
  1935. tm.assert_frame_equal(df, expected)
  1936. df = self.read_table(StringIO(data), sep='|', thousands=',')
  1937. tm.assert_frame_equal(df, expected)
    def test_comment_fwf(self):
        # Comments must be honored by the fixed-width reader as well.
        # NOTE(review): column alignment inside this literal may have been
        # collapsed by reformatting -- verify against the original fixture.
        data = """
1 2. 4 #hello world
5 NaN 10.0
"""
        expected = [[1, 2., 4],
                    [5, np.nan, 10.]]
        df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
                      comment='#')
        tm.assert_almost_equal(df.values, expected)
    def test_fwf(self):
        # Core fixed-width parsing: colspecs, widths and filler delimiters
        # must all reproduce an equivalent CSV parse.
        # NOTE(review): intra-string column alignment may have been collapsed
        # by reformatting -- verify against the original fixtures.
        data_expected = """\
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
        expected = self.read_csv(StringIO(data_expected), header=None)

        data1 = """\
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
        colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
        df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
        tm.assert_frame_equal(df, expected)

        data2 = """\
2011 58 360.242940 149.910199 11950.7
2011 59 444.953632 166.985655 11788.4
2011 60 364.136849 183.628767 11806.2
2011 61 413.836124 184.375703 11916.8
2011 62 502.953953 173.237159 12468.3
"""
        df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
        tm.assert_frame_equal(df, expected)

        # From Thomas Kluyver: apparently some non-space filler characters can
        # be seen, this is supported by specifying the 'delimiter' character:
        # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
        data3 = """\
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
        df = read_fwf(
            StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
        tm.assert_frame_equal(df, expected)

        # colspecs and widths are mutually exclusive; at least one required
        with tm.assertRaisesRegexp(ValueError, "must specify only one of"):
            read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7])

        with tm.assertRaisesRegexp(ValueError, "Must specify either"):
            read_fwf(StringIO(data3), colspecs=None, widths=None)
    def test_fwf_colspecs_is_list_or_tuple(self):
        # Passing a dict for colspecs must be rejected with a TypeError.
        with tm.assertRaisesRegexp(TypeError,
                                   'column specifications must be a list or '
                                   'tuple.+'):
            pd.io.parsers.FixedWidthReader(StringIO(self.data1),
                                           {'a': 1}, ',', '#')
  1999. def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
  2000. with tm.assertRaisesRegexp(TypeError,
  2001. 'Each column specification must be.+'):
  2002. read_fwf(StringIO(self.data1), [('a', 1)])
  2003. def test_fwf_colspecs_None(self):
  2004. # GH 7079
  2005. data = """\
  2006. 123456
  2007. 456789
  2008. """
  2009. colspecs = [(0, 3), (3, None)]
  2010. result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
  2011. expected = DataFrame([[123, 456], [456, 789]])
  2012. tm.assert_frame_equal(result, expected)
  2013. colspecs = [(None, 3), (3, 6)]
  2014. result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
  2015. expected = DataFrame([[123, 456], [456, 789]])
  2016. tm.assert_frame_equal(result, expected)
  2017. colspecs = [(0, None), (3, None)]
  2018. result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
  2019. expected = DataFrame([[123456, 456], [456789, 789]])
  2020. tm.assert_frame_equal(result, expected)
  2021. colspecs = [(None, None), (3, 6)]
  2022. result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
  2023. expected = DataFrame([[123456, 456], [456789, 789]])
  2024. tm.assert_frame_equal(result, expected)
    def test_fwf_regression(self):
        # GH 3594
        # #### turns out 'T060' is parsable as a datetime slice!
        # NOTE(review): column alignment inside the data literal may have
        # been collapsed by reformatting -- verify against the original.
        tzlist = [1, 10, 20, 30, 60, 80, 100]
        ntz = len(tzlist)
        tcolspecs = [16] + [8] * ntz
        tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]]
        data = """  2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192
2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869
2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657
2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379
2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039
"""

        # the custom date_parser decodes the YYYYDDDHHMMSS stamps
        df = read_fwf(StringIO(data),
                      index_col=0,
                      header=None,
                      names=tcolnames,
                      widths=tcolspecs,
                      parse_dates=True,
                      date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S'))

        for c in df.columns:
            res = df.loc[:, c]
            self.assertTrue(len(res))
    def test_fwf_compression(self):
        # read_fwf must transparently read gzip- and bz2-compressed input.
        try:
            import gzip
            import bz2
        except ImportError:
            raise nose.SkipTest("Need gzip and bz2 to run this test")

        data = """1111111111
2222222222
3333333333""".strip()
        widths = [5, 5]
        names = ['one', 'two']
        expected = read_fwf(StringIO(data), widths=widths, names=names)
        if compat.PY3:
            data = bytes(data, encoding='utf-8')
        comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
        for comp_name, compresser in comps:
            with tm.ensure_clean() as path:
                tmp = compresser(path, mode='wb')
                tmp.write(data)
                tmp.close()
                result = read_fwf(path, widths=widths, names=names,
                                  compression=comp_name)
                tm.assert_frame_equal(result, expected)
    def test_BytesIO_input(self):
        # Byte streams must be decoded with the given encoding before parsing
        # (utf8 fixed-width and cp1255 delimited cases).
        if not compat.PY3:
            raise nose.SkipTest("Bytes-related test - only needs to work on Python 3")
        result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[2, 2], encoding='utf8')
        expected = pd.DataFrame([["של", "ום"]], columns=["של", "ום"])
        tm.assert_frame_equal(result, expected)
        data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
        result = pd.read_table(data, sep="::", engine='python',
                               encoding='cp1255')
        expected = pd.DataFrame([[562, 123]], columns=["שלום", "1234"])
        tm.assert_frame_equal(result, expected)
    def test_verbose_import(self):
        # verbose=True prints how many NA values were filled per column;
        # stdout is captured to check the exact message.
        text = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""

        buf = StringIO()
        sys.stdout = buf

        try:
            # it works!
            df = self.read_csv(StringIO(text), verbose=True)
            self.assertEqual(buf.getvalue(), 'Filled 3 NA values in column a\n')
        finally:
            # always restore the real stdout
            sys.stdout = sys.__stdout__

        buf = StringIO()
        sys.stdout = buf

        text = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

        try:
            # it works!
            df = self.read_csv(StringIO(text), verbose=True, index_col=0)
            self.assertEqual(buf.getvalue(), 'Filled 1 NA values in column a\n')
        finally:
            sys.stdout = sys.__stdout__
    def test_iteration_open_handle(self):
        # Resuming parsing from a half-consumed open file handle only works
        # with the python engine; the C engine must raise.
        if PY3:
            raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info))

        with tm.ensure_clean() as path:
            with open(path, 'wb') as f:
                f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG')

            with open(path, 'rb') as f:
                # consume the file up to (and including) the CCC line
                for line in f:
                    if 'CCC' in line:
                        break

                try:
                    read_table(f, squeeze=True, header=None, engine='c')
                except Exception:
                    pass
                else:
                    raise ValueError('this should not happen')

                result = read_table(f, squeeze=True, header=None,
                                    engine='python')
                expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
                tm.assert_series_equal(result, expected)
    def test_iterator(self):
        # Iterator/chunked reads must reproduce slices of the one-shot read.
        # GH 6607
        # This is a copy which should eventually be merged into ParserTests
        # when the issue with the C parser is fixed
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True)
        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.read(3)
        tm.assert_frame_equal(chunk, df[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[1:3])

        # test bad parameter (skip_footer)
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True, skip_footer=True)
        self.assertRaises(ValueError, reader.read, 3)

        treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                                  iterator=True)
        tm.assert_isinstance(treader, TextFileReader)

        # stopping iteration when on chunksize is specified, GH 3967
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        reader = self.read_csv(StringIO(data), iterator=True)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[3, 6, 9]), index=['foo', 'bar', 'baz'])
        tm.assert_frame_equal(result[0], expected)

        # chunksize = 1
        reader = self.read_csv(StringIO(data), chunksize=1)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[3, 6, 9]), index=['foo', 'bar', 'baz'])
        self.assertEqual(len(result), 3)
        tm.assert_frame_equal(pd.concat(result), expected)
    def test_single_line(self):
        # Sniffing must cope with a single-line input.
        # GH 6607
        # This is a copy which should eventually be merged into ParserTests
        # when the issue with the C parser is fixed

        # sniff separator
        buf = StringIO()
        sys.stdout = buf

        # printing warning message when engine == 'c' for now

        try:
            # it works!
            df = self.read_csv(StringIO('1,2'), names=['a', 'b'],
                               header=None, sep=None)
            tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df)
        finally:
            sys.stdout = sys.__stdout__
    def test_malformed(self):
        # Rows with too many fields must raise, and the reported line number
        # must account for comments, skiprows and chunked reads.
        # GH 6607
        # This is a copy which should eventually be merged into ParserTests
        # when the issue with the C parser is fixed

        # all
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""

        try:
            df = self.read_table(
                StringIO(data), sep=',', header=1, comment='#')
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 4, saw 5', str(inst))

        # skip_footer
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""

        try:
            df = self.read_table(
                StringIO(data), sep=',', header=1, comment='#',
                skip_footer=1)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 4, saw 5', str(inst))

        # first chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',',
                                 header=1, comment='#', iterator=True, chunksize=1,
                                 skiprows=[2])
            df = it.read(5)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))

        # middle chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',', header=1,
                                 comment='#', iterator=True, chunksize=1,
                                 skiprows=[2])
            df = it.read(1)
            it.read(2)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))

        # last chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',',
                                 header=1, comment='#', iterator=True, chunksize=1,
                                 skiprows=[2])
            df = it.read(1)
            it.read()
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))
    def test_skip_footer(self):
        # skip_footer / skipfooter must drop trailing lines, equivalent to
        # parsing the data with the footer removed.
        # GH 6607
        # This is a copy which should eventually be merged into ParserTests
        # when the issue with the C parser is fixed
        data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
        result = self.read_csv(StringIO(data), skip_footer=2)
        no_footer = '\n'.join(data.split('\n')[:-3])
        expected = self.read_csv(StringIO(no_footer))

        tm.assert_frame_equal(result, expected)

        result = self.read_csv(StringIO(data), nrows=3)
        tm.assert_frame_equal(result, expected)

        # skipfooter alias
        result = self.read_csv(StringIO(data), skipfooter=2)
        no_footer = '\n'.join(data.split('\n')[:-3])
        expected = self.read_csv(StringIO(no_footer))

        tm.assert_frame_equal(result, expected)
  2305. def test_decompression_regex_sep(self):
  2306. # GH 6607
  2307. # This is a copy which should eventually be moved to ParserTests
  2308. # when the issue with the C parser is fixed
  2309. try:
  2310. import gzip
  2311. import bz2
  2312. except ImportError:
  2313. raise nose.SkipTest('need gzip and bz2 to run')
  2314. data = open(self.csv1, 'rb').read()
  2315. data = data.replace(b',', b'::')
  2316. expected = self.read_csv(self.csv1)
  2317. with tm.ensure_clean() as path:
  2318. tmp = gzip.GzipFile(path, mode='wb')
  2319. tmp.write(data)
  2320. tmp.close()
  2321. result = self.read_csv(path, sep='::', compression='gzip')
  2322. tm.assert_frame_equal(result, expected)
  2323. with tm.ensure_clean() as path:
  2324. tmp = bz2.BZ2File(path, mode='wb')
  2325. tmp.write(data)
  2326. tmp.close()
  2327. result = self.read_csv(path, sep='::', compression='bz2')
  2328. tm.assert_frame_equal(result, expected)
  2329. self.assertRaises(ValueError, self.read_csv,
  2330. path, compression='bz3')
    def test_read_table_buglet_4x_multiindex(self):
        # A 4-level row MultiIndex must be inferred from the header shape.
        # GH 6607
        # This is a copy which should eventually be merged into ParserTests
        # when the issue with multi-level index is fixed in the C parser.
        # NOTE(review): column alignment inside these literals may have been
        # collapsed by reformatting -- verify against the original fixture.
        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

        # it works!
        df = self.read_table(StringIO(text), sep='\s+')
        self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))

        # GH 6893
        data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
        expected = DataFrame.from_records([(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
                                          columns=list('abcABC'), index=list('abc'))
        actual = self.read_table(StringIO(data), sep='\s+')
        tm.assert_frame_equal(actual, expected)
class TestFwfColspaceSniffing(tm.TestCase):
    # read_fwf without colspecs/widths must infer the column boundaries and
    # match an explicit-colspecs parse of the same text.

    def test_full_file(self):
        # File with all values
        # NOTE(review): column alignment inside this literal may have been
        # collapsed by reformatting -- verify against the original fixture.
        test = '''index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
2000-01-05T00:00:00 0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0.487094399463 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
2000-01-11T00:00:00 0.157160753327 34 foo'''
        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
    def test_full_file_with_missing(self):
        # File with missing values
        # NOTE(review): column alignment inside this literal may have been
        # collapsed by reformatting -- verify against the original fixture.
        test = '''index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
34'''
        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
  2376. def test_full_file_with_spaces(self):
  2377. # File with spaces in columns
  2378. test = '''
  2379. Account Name Balance CreditLimit AccountCreated
  2380. 101 Keanu Reeves 9315.45 10000.00 1/17/1998
  2381. 312 Gerard Butler 90.00 1000.00 8/6/2003
  2382. 868 Jennifer Love Hewitt 0 17000.00 5/25/1985
  2383. 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
  2384. 317 Bill Murray 789.65 5000.00 2/5/2007
  2385. '''.strip('\r\n')
  2386. colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
  2387. expected = read_fwf(StringIO(test), colspecs=colspecs)
  2388. tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
  2389. def test_full_file_with_spaces_and_missing(self):
  2390. # File with spaces and missing values in columsn
  2391. test = '''
  2392. Account Name Balance CreditLimit AccountCreated
  2393. 101 10000.00 1/17/1998
  2394. 312 Gerard Butler 90.00 1000.00 8/6/2003
  2395. 868 5/25/1985
  2396. 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
  2397. 317 Bill Murray 789.65
  2398. '''.strip('\r\n')
  2399. colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
  2400. expected = read_fwf(StringIO(test), colspecs=colspecs)
  2401. tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
  2402. def test_messed_up_data(self):
  2403. # Completely messed up file
  2404. test = '''
  2405. Account Name Balance Credit Limit Account Created
  2406. 101 10000.00 1/17/1998
  2407. 312 Gerard Butler 90.00 1000.00
  2408. 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
  2409. 317 Bill Murray 789.65
  2410. '''.strip('\r\n')
  2411. colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
  2412. expected = read_fwf(StringIO(test), colspecs=colspecs)
  2413. tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
  2414. def test_multiple_delimiters(self):
  2415. test = r'''
  2416. col1~~~~~col2 col3++++++++++++++++++col4
  2417. ~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
  2418. 33+++122.33\\\bar.........Gerard Butler
  2419. ++44~~~~12.01 baz~~Jennifer Love Hewitt
  2420. ~~55 11+++foo++++Jada Pinkett-Smith
  2421. ..66++++++.03~~~bar Bill Murray
  2422. '''.strip('\r\n')
  2423. colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
  2424. expected = read_fwf(StringIO(test), colspecs=colspecs,
  2425. delimiter=' +~.\\')
  2426. tm.assert_frame_equal(expected, read_fwf(StringIO(test),
  2427. delimiter=' +~.\\'))
  2428. def test_variable_width_unicode(self):
  2429. if not compat.PY3:
  2430. raise nose.SkipTest('Bytes-related test - only needs to work on Python 3')
  2431. test = '''
  2432. שלום שלום
  2433. ום שלל
  2434. של ום
  2435. '''.strip('\r\n')
  2436. expected = pd.read_fwf(BytesIO(test.encode('utf8')),
  2437. colspecs=[(0, 4), (5, 9)], header=None, encoding='utf8')
  2438. tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')),
  2439. header=None, encoding='utf8'))
class TestCParserHighMemory(ParserTests, tm.TestCase):
    """Run the shared ParserTests suite through the C engine with
    ``low_memory=False`` (whole-file, high-memory code path), plus a few
    C-engine-specific cases."""

    def read_csv(self, *args, **kwds):
        # force the high-memory C engine for every shared test
        kwds = kwds.copy()
        kwds['engine'] = 'c'
        kwds['low_memory'] = False
        return read_csv(*args, **kwds)

    def read_table(self, *args, **kwds):
        # same forced options as read_csv above
        kwds = kwds.copy()
        kwds['engine'] = 'c'
        kwds['low_memory'] = False
        return read_table(*args, **kwds)

    def test_compact_ints(self):
        # compact_ints should downcast to the smallest viable int dtype
        data = ('0,1,0,0\n'
                '1,1,0,0\n'
                '0,1,0,1')
        result = read_csv(StringIO(data), delimiter=',', header=None,
                          compact_ints=True, as_recarray=True)
        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
        self.assertEqual(result.dtype, ex_dtype)
        # use_unsigned promotes to the unsigned variants instead
        result = read_csv(StringIO(data), delimiter=',', header=None,
                          as_recarray=True, compact_ints=True,
                          use_unsigned=True)
        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
        self.assertEqual(result.dtype, ex_dtype)

    def test_parse_dates_empty_string(self):
        # GH 2263: an empty date field should parse to null even with
        # na_filter disabled
        s = StringIO("Date, test\n2012-01-01, 1\n,2")
        result = self.read_csv(s, parse_dates=["Date"], na_filter=False)
        self.assertTrue(result['Date'].isnull()[1])

    def test_usecols(self):
        # shadow the shared ParserTests case: not supported on this engine
        raise nose.SkipTest("Usecols is not supported in C High Memory engine.")

    def test_line_comment(self):
        # comment char mid-line truncates the rest of that line
        data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
        expected = [[1., 2., 4.],
                    [5., np.nan, 10.]]
        df = self.read_csv(StringIO(data), comment='#')
        tm.assert_almost_equal(df.values, expected)

    def test_comment_skiprows(self):
        data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        expected = [[1., 2., 4.],
                    [5., np.nan, 10.]]
        # this should ignore the first four lines (including comments)
        df = self.read_csv(StringIO(data), comment='#', skiprows=4)
        tm.assert_almost_equal(df.values, expected)

    def test_comment_header(self):
        data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        expected = [[1., 2., 4.],
                    [5., np.nan, 10.]]
        # header should begin at the second non-comment line
        df = self.read_csv(StringIO(data), comment='#', header=1)
        tm.assert_almost_equal(df.values, expected)

    def test_comment_skiprows_header(self):
        data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        expected = [[1., 2., 4.],
                    [5., np.nan, 10.]]
        # skiprows should skip the first 4 lines (including comments), while
        # header should start from the second non-commented line starting
        # with line 5
        df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
        tm.assert_almost_equal(df.values, expected)

    def test_passing_dtype(self):
        # GH 6607
        # This is a copy which should eventually be merged into ParserTests
        # when the dtype argument is supported by all engines.
        df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E'])
        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
            df.to_csv(path)
            # GH 3795
            # passing 'str' as the dtype
            result = self.read_csv(path, dtype=str, index_col=0)
            tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' }))
            # we expect all object columns, so need to convert to test for equivalence
            result = result.astype(float)
            tm.assert_frame_equal(result,df)
            # invalid dtype
            self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' },
                              index_col=0)
            # valid but we don't support it (date)
            self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' },
                              index_col=0)
            self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' },
                              index_col=0, parse_dates=['B'])
            # valid but we don't support it
            self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' },
                              index_col=0)

    def test_fallback_to_python(self):
        # GH 6607
        data = 'a b c\n1 2 3'
        # specify C engine with unsupported options (raise)
        with tm.assertRaisesRegexp(ValueError, 'does not support'):
            self.read_table(StringIO(data), engine='c', sep=None,
                            delim_whitespace=False)
        with tm.assertRaisesRegexp(ValueError, 'does not support'):
            self.read_table(StringIO(data), engine='c', sep='\s')
        with tm.assertRaisesRegexp(ValueError, 'does not support'):
            self.read_table(StringIO(data), engine='c', skip_footer=1)
  2562. class TestCParserLowMemory(ParserTests, tm.TestCase):
  2563. def read_csv(self, *args, **kwds):
  2564. kwds = kwds.copy()
  2565. kwds['engine'] = 'c'
  2566. kwds['low_memory'] = True
  2567. kwds['buffer_lines'] = 2
  2568. return read_csv(*args, **kwds)
  2569. def read_table(self, *args, **kwds):
  2570. kwds = kwds.copy()
  2571. kwds['engine'] = 'c'
  2572. kwds['low_memory'] = True
  2573. kwds['buffer_lines'] = 2
  2574. return read_table(*args, **kwds)
  2575. def test_compact_ints(self):
  2576. data = ('0,1,0,0\n'
  2577. '1,1,0,0\n'
  2578. '0,1,0,1')
  2579. result = read_csv(StringIO(data), delimiter=',', header=None,
  2580. compact_ints=True, as_recarray=True)
  2581. ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
  2582. self.assertEqual(result.dtype, ex_dtype)
  2583. result = read_csv(StringIO(data), delimiter=',', header=None,
  2584. as_recarray=True, compact_ints=True,
  2585. use_unsigned=True)
  2586. ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
  2587. self.assertEqual(result.dtype, ex_dtype)
  2588. def test_pass_dtype(self):
  2589. data = """\
  2590. one,two
  2591. 1,2.5
  2592. 2,3.5
  2593. 3,4.5
  2594. 4,5.5"""
  2595. result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'},
  2596. as_recarray=True)
  2597. self.assertEqual(result['one'].dtype, 'u1')
  2598. self.assertEqual(result['two'].dtype, 'S1')
  2599. def test_usecols_dtypes(self):
  2600. data = """\
  2601. 1,2,3
  2602. 4,5,6
  2603. 7,8,9
  2604. 10,11,12"""
  2605. result = self.read_csv(StringIO(data), usecols=(0, 1, 2),
  2606. names=('a', 'b', 'c'),
  2607. header=None,
  2608. converters={'a': str},
  2609. dtype={'b': int, 'c': float},
  2610. )
  2611. result2 = self.read_csv(StringIO(data), usecols=(0, 2),
  2612. names=('a', 'b', 'c'),
  2613. header=None,
  2614. converters={'a': str},
  2615. dtype={'b': int, 'c': float},
  2616. )
  2617. self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
  2618. self.assertTrue((result2.dtypes == [object, np.float]).all())
  2619. def test_usecols_implicit_index_col(self):
  2620. # #2654
  2621. data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
  2622. result = self.read_csv(StringIO(data), usecols=['a', 'b'])
  2623. expected = DataFrame({'a': ['apple', 'orange'],
  2624. 'b': ['bat', 'cow']}, index=[4, 8])
  2625. tm.assert_frame_equal(result, expected)
  2626. def test_usecols_with_whitespace(self):
  2627. data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'
  2628. result = self.read_csv(StringIO(data), delim_whitespace=True,
  2629. usecols=('a', 'b'))
  2630. expected = DataFrame({'a': ['apple', 'orange'],
  2631. 'b': ['bat', 'cow']}, index=[4, 8])
  2632. tm.assert_frame_equal(result, expected)
  2633. def test_usecols_regex_sep(self):
  2634. # #2733
  2635. data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'
  2636. df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b'))
  2637. expected = DataFrame({'a': ['apple', 'orange'],
  2638. 'b': ['bat', 'cow']}, index=[4, 8])
  2639. tm.assert_frame_equal(df, expected)
  2640. def test_pure_python_failover(self):
  2641. data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
  2642. result = self.read_csv(StringIO(data), comment='#')
  2643. expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
  2644. tm.assert_frame_equal(result, expected)
  2645. def test_decompression(self):
  2646. try:
  2647. import gzip
  2648. import bz2
  2649. except ImportError:
  2650. raise nose.SkipTest('need gzip and bz2 to run')
  2651. data = open(self.csv1, 'rb').read()
  2652. expected = self.read_csv(self.csv1)
  2653. with tm.ensure_clean() as path:
  2654. tmp = gzip.GzipFile(path, mode='wb')
  2655. tmp.write(data)
  2656. tmp.close()
  2657. result = self.read_csv(path, compression='gzip')
  2658. tm.assert_frame_equal(result, expected)
  2659. result = self.read_csv(open(path, 'rb'), compression='gzip')
  2660. tm.assert_frame_equal(result, expected)
  2661. with tm.ensure_clean() as path:
  2662. tmp = bz2.BZ2File(path, mode='wb')
  2663. tmp.write(data)
  2664. tmp.close()
  2665. result = self.read_csv(path, compression='bz2')
  2666. tm.assert_frame_equal(result, expected)
  2667. # result = self.read_csv(open(path, 'rb'), compression='bz2')
  2668. # tm.assert_frame_equal(result, expected)
  2669. self.assertRaises(ValueError, self.read_csv,
  2670. path, compression='bz3')
  2671. def test_decompression_regex_sep(self):
  2672. try:
  2673. import gzip
  2674. import bz2
  2675. except ImportError:
  2676. raise nose.SkipTest('need gzip and bz2 to run')
  2677. data = open(self.csv1, 'rb').read()
  2678. data = data.replace(b',', b'::')
  2679. expected = self.read_csv(self.csv1)
  2680. with tm.ensure_clean() as path:
  2681. tmp = gzip.GzipFile(path, mode='wb')
  2682. tmp.write(data)
  2683. tmp.close()
  2684. # GH 6607
  2685. # Test currently only valid with the python engine because of
  2686. # regex sep. Temporarily copied to TestPythonParser.
  2687. # Here test for ValueError when passing regex sep:
  2688. with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
  2689. result = self.read_csv(path, sep='::', compression='gzip')
  2690. tm.assert_frame_equal(result, expected)
  2691. with tm.ensure_clean() as path:
  2692. tmp = bz2.BZ2File(path, mode='wb')
  2693. tmp.write(data)
  2694. tmp.close()
  2695. # GH 6607
  2696. with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
  2697. result = self.read_csv(path, sep='::', compression='bz2')
  2698. tm.assert_frame_equal(result, expected)
  2699. self.assertRaises(ValueError, self.read_csv,
  2700. path, compression='bz3')
  2701. def test_memory_map(self):
  2702. # it works!
  2703. result = self.read_csv(self.csv1, memory_map=True)
  2704. def test_disable_bool_parsing(self):
  2705. # #2090
  2706. data = """A,B,C
  2707. Yes,No,Yes
  2708. No,Yes,Yes
  2709. Yes,,Yes
  2710. No,No,No"""
  2711. result = read_csv(StringIO(data), dtype=object)
  2712. self.assertTrue((result.dtypes == object).all())
  2713. result = read_csv(StringIO(data), dtype=object, na_filter=False)
  2714. self.assertEqual(result['B'][2], '')
  2715. def test_int64_overflow(self):
  2716. data = """ID
  2717. 00013007854817840016671868
  2718. 00013007854817840016749251
  2719. 00013007854817840016754630
  2720. 00013007854817840016781876
  2721. 00013007854817840017028824
  2722. 00013007854817840017963235
  2723. 00013007854817840018860166"""
  2724. result = read_csv(StringIO(data))
  2725. self.assertTrue(result['ID'].dtype == object)
  2726. self.assertRaises(OverflowError, read_csv, StringIO(data),
  2727. dtype='i8')
  2728. def test_euro_decimal_format(self):
  2729. data = """Id;Number1;Number2;Text1;Text2;Number3
  2730. 1;1521,1541;187101,9543;ABC;poi;4,738797819
  2731. 2;121,12;14897,76;DEF;uyt;0,377320872
  2732. 3;878,158;108013,434;GHI;rez;2,735694704"""
  2733. df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
  2734. self.assertEqual(df2['Number1'].dtype, float)
  2735. self.assertEqual(df2['Number2'].dtype, float)
  2736. self.assertEqual(df2['Number3'].dtype, float)
  2737. def test_custom_lineterminator(self):
  2738. data = 'a,b,c~1,2,3~4,5,6'
  2739. result = self.read_csv(StringIO(data), lineterminator='~')
  2740. expected = self.read_csv(StringIO(data.replace('~', '\n')))
  2741. tm.assert_frame_equal(result, expected)
  2742. data2 = data.replace('~', '~~')
  2743. result = self.assertRaises(ValueError, read_csv, StringIO(data2),
  2744. lineterminator='~~')
  2745. def test_raise_on_passed_int_dtype_with_nas(self):
  2746. # #2631
  2747. data = """YEAR, DOY, a
  2748. 2001,106380451,10
  2749. 2001,,11
  2750. 2001,106380451,67"""
  2751. self.assertRaises(Exception, read_csv, StringIO(data), sep=",",
  2752. skipinitialspace=True,
  2753. dtype={'DOY': np.int64})
  2754. def test_na_trailing_columns(self):
  2755. data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
  2756. 2012-03-14,USD,AAPL,BUY,1000
  2757. 2012-05-12,USD,SBUX,SELL,500"""
  2758. result = self.read_csv(StringIO(data))
  2759. self.assertEqual(result['Date'][1], '2012-05-12')
  2760. self.assertTrue(result['UnitPrice'].isnull().all())
  2761. def test_parse_ragged_csv(self):
  2762. data = """1,2,3
  2763. 1,2,3,4
  2764. 1,2,3,4,5
  2765. 1,2
  2766. 1,2,3,4"""
  2767. nice_data = """1,2,3,,
  2768. 1,2,3,4,
  2769. 1,2,3,4,5
  2770. 1,2,,,
  2771. 1,2,3,4,"""
  2772. result = self.read_csv(StringIO(data), header=None,
  2773. names=['a', 'b', 'c', 'd', 'e'])
  2774. expected = self.read_csv(StringIO(nice_data), header=None,
  2775. names=['a', 'b', 'c', 'd', 'e'])
  2776. tm.assert_frame_equal(result, expected)
  2777. # too many columns, cause segfault if not careful
  2778. data = "1,2\n3,4,5"
  2779. result = self.read_csv(StringIO(data), header=None,
  2780. names=lrange(50))
  2781. expected = self.read_csv(StringIO(data), header=None,
  2782. names=lrange(3)).reindex(columns=lrange(50))
  2783. tm.assert_frame_equal(result, expected)
  2784. def test_tokenize_CR_with_quoting(self):
  2785. # #3453, this doesn't work with Python parser for some reason
  2786. data = ' a,b,c\r"a,b","e,d","f,f"'
  2787. result = self.read_csv(StringIO(data), header=None)
  2788. expected = self.read_csv(StringIO(data.replace('\r', '\n')),
  2789. header=None)
  2790. tm.assert_frame_equal(result, expected)
  2791. result = self.read_csv(StringIO(data))
  2792. expected = self.read_csv(StringIO(data.replace('\r', '\n')))
  2793. tm.assert_frame_equal(result, expected)
  2794. def test_raise_on_no_columns(self):
  2795. # single newline
  2796. data = "\n"
  2797. self.assertRaises(ValueError, self.read_csv, StringIO(data))
  2798. # test with more than a single newline
  2799. data = "\n\n\n"
  2800. self.assertRaises(ValueError, self.read_csv, StringIO(data))
  2801. def test_warn_if_chunks_have_mismatched_type(self):
  2802. # Issue #3866 If chunks are different types and can't
  2803. # be coerced using numerical types, then issue warning.
  2804. integers = [str(i) for i in range(499999)]
  2805. data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)
  2806. with tm.assert_produces_warning(DtypeWarning):
  2807. df = self.read_csv(StringIO(data))
  2808. self.assertEqual(df.a.dtype, np.object)
  2809. def test_invalid_c_parser_opts_with_not_c_parser(self):
  2810. from pandas.io.parsers import _c_parser_defaults as c_defaults
  2811. data = """1,2,3,,
  2812. 1,2,3,4,
  2813. 1,2,3,4,5
  2814. 1,2,,,
  2815. 1,2,3,4,"""
  2816. engines = 'python', 'python-fwf'
  2817. for default in c_defaults:
  2818. for engine in engines:
  2819. kwargs = {default: object()}
  2820. with tm.assertRaisesRegexp(ValueError,
  2821. 'The %r option is not supported '
  2822. 'with the %r engine' % (default,
  2823. engine)):
  2824. read_csv(StringIO(data), engine=engine, **kwargs)
  2825. def test_passing_dtype(self):
  2826. # GH 6607
  2827. # This is a copy which should eventually be merged into ParserTests
  2828. # when the dtype argument is supported by all engines.
  2829. df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E'])
  2830. with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
  2831. df.to_csv(path)
  2832. # GH 3795
  2833. # passing 'str' as the dtype
  2834. result = self.read_csv(path, dtype=str, index_col=0)
  2835. tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' }))
  2836. # we expect all object columns, so need to convert to test for equivalence
  2837. result = result.astype(float)
  2838. tm.assert_frame_equal(result,df)
  2839. # invalid dtype
  2840. self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' },
  2841. index_col=0)
  2842. # valid but we don't support it (date)
  2843. self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' },
  2844. index_col=0)
  2845. self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' },
  2846. index_col=0, parse_dates=['B'])
  2847. # valid but we don't support it
  2848. self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' },
  2849. index_col=0)
  2850. def test_fallback_to_python(self):
  2851. # GH 6607
  2852. data = 'a b c\n1 2 3'
  2853. # specify C engine with C-unsupported options (raise)
  2854. with tm.assertRaisesRegexp(ValueError, 'does not support'):
  2855. self.read_table(StringIO(data), engine='c', sep=None,
  2856. delim_whitespace=False)
  2857. with tm.assertRaisesRegexp(ValueError, 'does not support'):
  2858. self.read_table(StringIO(data), engine='c', sep='\s')
  2859. with tm.assertRaisesRegexp(ValueError, 'does not support'):
  2860. self.read_table(StringIO(data), engine='c', skip_footer=1)
  2861. def test_raise_on_sep_with_delim_whitespace(self):
  2862. # GH 6607
  2863. data = 'a b c\n1 2 3'
  2864. with tm.assertRaisesRegexp(ValueError, 'you can only specify one'):
  2865. self.read_table(StringIO(data), sep='\s', delim_whitespace=True)
class TestMiscellaneous(tm.TestCase):
    """For tests that don't fit into any of the other classes, e.g. those
    that compare results for different engines or test the behavior when
    'engine' is not passed."""

    def test_compare_whitespace_regex(self):
        # GH 6607: C and python engines must agree on '\s+' separators
        data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9'
        result_c = pd.read_table(StringIO(data), sep='\s+', engine='c')
        result_py = pd.read_table(StringIO(data), sep='\s+', engine='python')
        tm.assert_frame_equal(result_c, result_py)

    def test_fallback_to_python(self):
        # GH 6607
        data = 'a b c\n1 2 3'
        # specify C-unsupported options with python-unsupported option
        # (options will be ignored on fallback, raise)
        with tm.assertRaisesRegexp(ValueError, 'Falling back'):
            pd.read_table(StringIO(data), sep=None,
                          delim_whitespace=False, dtype={'a': float})
        with tm.assertRaisesRegexp(ValueError, 'Falling back'):
            pd.read_table(StringIO(data), sep='\s', dtype={'a': float})
        with tm.assertRaisesRegexp(ValueError, 'Falling back'):
            pd.read_table(StringIO(data), skip_footer=1, dtype={'a': float})
        # specify C-unsupported options without python-unsupported options:
        # fallback succeeds but must emit a ParserWarning
        with tm.assert_produces_warning(parsers.ParserWarning):
            pd.read_table(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            pd.read_table(StringIO(data), sep='\s')
        with tm.assert_produces_warning(parsers.ParserWarning):
            pd.read_table(StringIO(data), skip_footer=1)
class TestParseSQL(tm.TestCase):
    """Tests for lib.convert_sql_column, which coerces object arrays of
    SQL results (with None standing in for NULL) to the best-fitting
    numpy dtype."""

    def test_convert_sql_column_floats(self):
        # None becomes NaN and the column is coerced to float64
        arr = np.array([1.5, None, 3, 4.2], dtype=object)
        result = lib.convert_sql_column(arr)
        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
        assert_same_values_and_dtype(result, expected)

    def test_convert_sql_column_strings(self):
        # strings stay object dtype; None still maps to NaN
        arr = np.array(['1.5', None, '3', '4.2'], dtype=object)
        result = lib.convert_sql_column(arr)
        expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object)
        assert_same_values_and_dtype(result, expected)

    def test_convert_sql_column_unicode(self):
        # same as strings, but with unicode input
        arr = np.array([u('1.5'), None, u('3'), u('4.2')],
                       dtype=object)
        result = lib.convert_sql_column(arr)
        expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')],
                            dtype=object)
        assert_same_values_and_dtype(result, expected)

    def test_convert_sql_column_ints(self):
        arr = np.array([1, 2, 3, 4], dtype='O')
        arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O')
        result = lib.convert_sql_column(arr)
        result2 = lib.convert_sql_column(arr2)
        expected = np.array([1, 2, 3, 4], dtype='i8')
        assert_same_values_and_dtype(result, expected)
        assert_same_values_and_dtype(result2, expected)
        # a None forces promotion from int64 to float64
        arr = np.array([1, 2, 3, None, 4], dtype='O')
        result = lib.convert_sql_column(arr)
        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
        assert_same_values_and_dtype(result, expected)

    def test_convert_sql_column_longs(self):
        # Python longs (PY2) convert the same way as ints
        arr = np.array([long(1), long(2), long(3), long(4)], dtype='O')
        result = lib.convert_sql_column(arr)
        expected = np.array([1, 2, 3, 4], dtype='i8')
        assert_same_values_and_dtype(result, expected)
        arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O')
        result = lib.convert_sql_column(arr)
        expected = np.array([1, 2, 3, np.nan, 4], dtype='f8')
        assert_same_values_and_dtype(result, expected)

    def test_convert_sql_column_bools(self):
        arr = np.array([True, False, True, False], dtype='O')
        result = lib.convert_sql_column(arr)
        expected = np.array([True, False, True, False], dtype=bool)
        assert_same_values_and_dtype(result, expected)
        # a None keeps the column as object (no bool-with-NaN dtype exists)
        arr = np.array([True, False, None, False], dtype='O')
        result = lib.convert_sql_column(arr)
        expected = np.array([True, False, np.nan, False], dtype=object)
        assert_same_values_and_dtype(result, expected)

    def test_convert_sql_column_decimals(self):
        # Decimal values are converted to float64
        from decimal import Decimal
        arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')])
        result = lib.convert_sql_column(arr)
        expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
        assert_same_values_and_dtype(result, expected)
class TestS3(tm.TestCase):
    """Network tests for reading CSV files directly from S3 URLs."""

    def setUp(self):
        # requires boto, which (at this point) only supports Python 2
        try:
            import boto
        except ImportError:
            raise nose.SkipTest("boto not installed")
        if compat.PY3:
            raise nose.SkipTest("boto incompatible with Python 3")

    @tm.network
    def test_parse_public_s3_bucket(self):
        import nose.tools as nt
        df = pd.read_csv('s3://nyqpug/tips.csv')
        nt.assert_true(isinstance(df, pd.DataFrame))
        nt.assert_false(df.empty)
        # the S3 copy must match the local test-data copy of the same file
        tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)

    @tm.network
    def test_s3_fails(self):
        import boto
        # nonexistent key -> 404; private bucket -> 403
        with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
                                   'S3ResponseError: 404 Not Found'):
            pd.read_csv('s3://nyqpug/asdf.csv')
        with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
                                   'S3ResponseError: 403 Forbidden'):
            pd.read_csv('s3://cant_get_it/tips.csv')
  2973. def assert_same_values_and_dtype(res, exp):
  2974. tm.assert_equal(res.dtype, exp.dtype)
  2975. tm.assert_almost_equal(res, exp)
  2976. if __name__ == '__main__':
  2977. nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
  2978. exit=False)