/pandas/io/tests/test_parsers.py
Python | 3569 lines | 3512 code | 35 blank | 22 comment | 15 complexity | d63a8c14b3ed6a7fbb6579f61b69ceca MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- # -*- coding: utf-8 -*-
- # pylint: disable=E1101
- from datetime import datetime
- import csv
- import os
- import sys
- import re
- import nose
- import platform
- from numpy import nan
- import numpy as np
- from pandas.io.common import DtypeWarning
- from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
- from pandas.compat import(
- StringIO, BytesIO, PY3, range, long, lrange, lmap, u
- )
- from pandas.io.common import URLError
- import pandas.io.parsers as parsers
- from pandas.io.parsers import (read_csv, read_table, read_fwf,
- TextFileReader, TextParser)
- import pandas.util.testing as tm
- import pandas as pd
- from pandas.compat import parse_date
- import pandas.lib as lib
- from pandas import compat
- from pandas.lib import Timestamp
- from pandas.tseries.index import date_range
- import pandas.tseries.tools as tools
- from numpy.testing.decorators import slow
- from numpy.testing import assert_array_equal
- from pandas.parser import OverflowError, CParserError
class ParserTests(object):
    """
    Want to be able to test either C+Cython or Python+Cython parsers
    """
    # Shared CSV fixture: a named 'index' column plus four data columns.
    data1 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    def read_csv(self, *args, **kwargs):
        # Abstract hook: concrete subclasses bind this to a parser engine.
        raise NotImplementedError

    def read_table(self, *args, **kwargs):
        # Abstract hook: concrete subclasses bind this to a parser engine.
        raise NotImplementedError

    def setUp(self):
        # Silence FutureWarnings so deprecated options under test don't
        # pollute the output.
        import warnings
        warnings.filterwarnings(action='ignore', category=FutureWarning)

        # Paths to the on-disk fixture files shipped with the test data dir.
        self.dirpath = tm.get_data_path()
        self.csv1 = os.path.join(self.dirpath, 'test1.csv')
        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
        self.xls1 = os.path.join(self.dirpath, 'test.xls')
    def test_converters_type_must_be_dict(self):
        # converters must be a dict mapping column -> callable; anything
        # else raises TypeError.
        with tm.assertRaisesRegexp(TypeError, 'Type converters.+'):
            self.read_csv(StringIO(self.data1), converters=0)
- def test_multi_character_decimal_marker(self):
- data = """A|B|C
- 1|2,334|5
- 10|13|10.
- """
- self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,')
- def test_empty_decimal_marker(self):
- data = """A|B|C
- 1|2,334|5
- 10|13|10.
- """
- self.assertRaises(ValueError, read_csv, StringIO(data), decimal='')
- def test_empty_thousands_marker(self):
- data = """A|B|C
- 1|2,334|5
- 10|13|10.
- """
- self.assertRaises(ValueError, read_csv, StringIO(data), thousands='')
- def test_multi_character_decimal_marker(self):
- data = """A|B|C
- 1|2,334|5
- 10|13|10.
- """
- self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,')
    def test_empty_string(self):
        """NA handling for empty fields under the default and custom
        na_values / keep_default_na combinations."""
        data = """\
One,Two,Three
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        # Defaults: empty fields and 'nan' become NaN.
        df = self.read_csv(StringIO(data))
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # Empty per-column na_values with keep_default_na=False: nothing in
        # those columns is treated as missing.
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []},
                           keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five',
                                  '', 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # Custom na_values list with defaults disabled: only 'a' is NA.
        df = self.read_csv(
            StringIO(data), na_values=['a'], keep_default_na=False)
        xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # Empty per-column na_values with defaults kept: same as defaults.
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []})
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # GH4318, passing na_values=None and keep_default_na=False yields
        # 'None' as a na_value
        data = """\
One,Two,Three
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        df = self.read_csv(
            StringIO(data), keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['None', 'two', 'None', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
    def test_read_csv(self):
        # file:// URL handling — only exercised on Python 2 here.
        if not compat.PY3:
            if 'win' in sys.platform:
                prefix = u("file:///")
            else:
                prefix = u("file://")

            fname = prefix + compat.text_type(self.csv1)
            # it works!
            df1 = read_csv(fname, index_col=0, parse_dates=True)

    def test_dialect(self):
        """A csv.Dialect instance (here QUOTE_NONE) is honored, so the
        opening quote is kept as literal text."""
        data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""
        dia = csv.excel()
        dia.quoting = csv.QUOTE_NONE
        df = self.read_csv(StringIO(data), dialect=dia)

        data = '''\
label1,label2,label3
index1,a,c,e
index2,b,d,f
'''
        exp = self.read_csv(StringIO(data))
        # With QUOTE_NONE the '"' stays attached to the field value.
        exp.replace('a', '"a', inplace=True)
        tm.assert_frame_equal(df, exp)
    def test_1000_sep(self):
        """thousands=',' strips grouping commas in both readers."""
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        expected = DataFrame({
            'A': [1, 10],
            'B': [2334, 13],
            'C': [5, 10.]
        })

        df = self.read_csv(StringIO(data), sep='|', thousands=',')
        tm.assert_frame_equal(df, expected)

        df = self.read_table(StringIO(data), sep='|', thousands=',')
        tm.assert_frame_equal(df, expected)

    def test_1000_sep_with_decimal(self):
        """thousands and decimal markers combine, including the European
        '.'-for-thousands / ','-for-decimal convention."""
        data = """A|B|C
1|2,334.01|5
10|13|10.
"""
        expected = DataFrame({
            'A': [1, 10],
            'B': [2334.01, 13],
            'C': [5, 10.]
        })

        # Sanity-check the dtypes the parser is expected to produce.
        tm.assert_equal(expected.A.dtype, 'int64')
        tm.assert_equal(expected.B.dtype, 'float')
        tm.assert_equal(expected.C.dtype, 'float')

        df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
        tm.assert_frame_equal(df, expected)

        df = self.read_table(StringIO(data), sep='|', thousands=',',
                             decimal='.')
        tm.assert_frame_equal(df, expected)

        # Same values written with swapped marker conventions.
        data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
        df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.',
                           decimal=',')
        tm.assert_frame_equal(df, expected)

        df = self.read_table(StringIO(data_with_odd_sep), sep='|',
                             thousands='.', decimal=',')
        tm.assert_frame_equal(df, expected)

    def test_separator_date_conflict(self):
        # Regression test for issue #4678: make sure thousands separator and
        # date parsing do not conflict.
        data = '06-02-2013;13:00;1-000.215'
        expected = DataFrame(
            [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
            columns=['Date', 2]
        )

        df = self.read_csv(StringIO(data), sep=';', thousands='-',
                           parse_dates={'Date': [0, 1]}, header=None)
        tm.assert_frame_equal(df, expected)
    def test_squeeze(self):
        """squeeze=True collapses a single-column result to a Series."""
        data = """\
a,1
b,2
c,3
"""
        expected = Series([1, 2, 3], ['a', 'b', 'c'])
        result = self.read_table(StringIO(data), sep=',', index_col=0,
                                 header=None, squeeze=True)
        tm.assert_isinstance(result, Series)
        tm.assert_series_equal(result, expected)

    def test_inf_parsing(self):
        """Every capitalization of inf/-inf parses to float infinity, with
        and without the NA filter."""
        data = """\
,A
a,inf
b,-inf
c,Inf
d,-Inf
e,INF
f,-INF
g,INf
h,-INf
i,inF
j,-inF"""
        inf = float('inf')
        expected = Series([inf, -inf] * 5)
        df = read_csv(StringIO(data), index_col=0)
        tm.assert_almost_equal(df['A'].values, expected.values)
        df = read_csv(StringIO(data), index_col=0, na_filter=False)
        tm.assert_almost_equal(df['A'].values, expected.values)
    def test_multiple_date_col(self):
        # Can use multiple date parsers
        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

        def func(*date_cols):
            # Join the raw string columns, then parse the combined values.
            return lib.try_parse_dates(parsers._concat_date_cols(date_cols))

        # dict-form parse_dates: combined columns get the dict keys as names
        # and the source columns are dropped.
        df = self.read_csv(StringIO(data), header=None,
                           date_parser=func,
                           prefix='X',
                           parse_dates={'nominal': [1, 2],
                                        'actual': [1, 3]})
        self.assertIn('nominal', df)
        self.assertIn('actual', df)
        self.assertNotIn('X1', df)
        self.assertNotIn('X2', df)
        self.assertNotIn('X3', df)

        d = datetime(1999, 1, 27, 19, 0)
        self.assertEqual(df.ix[0, 'nominal'], d)

        # keep_date_col=True retains the original source columns.
        df = self.read_csv(StringIO(data), header=None,
                           date_parser=func,
                           parse_dates={'nominal': [1, 2],
                                        'actual': [1, 3]},
                           keep_date_col=True)
        self.assertIn('nominal', df)
        self.assertIn('actual', df)
        self.assertIn(1, df)
        self.assertIn(2, df)
        self.assertIn(3, df)

        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
        # list-of-lists parse_dates: combined columns are named by joining
        # the component column labels with '_'.
        df = read_csv(StringIO(data), header=None,
                      prefix='X',
                      parse_dates=[[1, 2], [1, 3]])

        self.assertIn('X1_X2', df)
        self.assertIn('X1_X3', df)
        self.assertNotIn('X1', df)
        self.assertNotIn('X2', df)
        self.assertNotIn('X3', df)

        d = datetime(1999, 1, 27, 19, 0)
        self.assertEqual(df.ix[0, 'X1_X2'], d)

        df = read_csv(StringIO(data), header=None,
                      parse_dates=[[1, 2], [1, 3]], keep_date_col=True)

        self.assertIn('1_2', df)
        self.assertIn('1_3', df)
        self.assertIn(1, df)
        self.assertIn(2, df)
        self.assertIn(3, df)

        data = '''\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
'''
        # A single already-combined datetime column used as the index.
        df = self.read_csv(StringIO(data), sep=',', header=None,
                           parse_dates=[1], index_col=1)
        d = datetime(1999, 1, 27, 19, 0)
        self.assertEqual(df.index[0], d)
    def test_multiple_date_cols_int_cast(self):
        # The date columns arrive as int-castable strings; the converter in
        # pandas.io.date_converters must handle them.
        data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
                "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
        date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
        import pandas.io.date_converters as conv

        # it works!
        df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
                           date_parser=conv.parse_date_time)
        self.assertIn('nominal', df)

    def test_multiple_date_col_timestamp_parse(self):
        """Timestamp itself can be used directly as the date_parser."""
        data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
        result = self.read_csv(StringIO(data), sep=',', header=None,
                               parse_dates=[[0, 1]], date_parser=Timestamp)

        ex_val = Timestamp('05/31/2012 15:30:00.029')
        self.assertEqual(result['0_1'][0], ex_val)

    def test_single_line(self):
        # GH 6607
        # Test currently only valid with python engine because sep=None and
        # delim_whitespace=False. Temporarily copied to TestPythonParser.
        # Test for ValueError with other engines:
        with tm.assertRaisesRegexp(ValueError,
                                   'sep=None with delim_whitespace=False'):
            # sniff separator
            buf = StringIO()
            sys.stdout = buf

            # printing warning message when engine == 'c' for now
            try:
                # it works!
                df = self.read_csv(StringIO('1,2'), names=['a', 'b'],
                                   header=None, sep=None)
                tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df)
            finally:
                # Always restore stdout, even if the read raises.
                sys.stdout = sys.__stdout__
    def test_multiple_date_cols_with_header(self):
        """With a header row, the merged date column is a real datetime,
        not a string."""
        data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

        df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
        self.assertNotIsInstance(df.nominal[0], compat.string_types)

    # Class-level fixture; referenced as self.ts_data by the collision test
    # below (and potentially others outside this view).
    ts_data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

    def test_multiple_date_col_name_collision(self):
        """Merging date columns into a name that already exists raises."""
        self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data),
                          parse_dates={'ID': [1, 2]})

        data = """\
date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

        # The auto-generated 'date_NominalTime' name collides with column 0.
        self.assertRaises(ValueError, self.read_csv, StringIO(data),
                          parse_dates=[[1, 2]])
    def test_index_col_named(self):
        """index_col may be given by column name, both with a header row and
        with explicit names=..."""
        no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

        h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
        data = h + no_header
        rs = self.read_csv(StringIO(data), index_col='ID')
        xp = self.read_csv(StringIO(data), header=0).set_index('ID')
        tm.assert_frame_equal(rs, xp)

        # Without a header there is no 'ID' column to resolve the name to.
        self.assertRaises(ValueError, self.read_csv, StringIO(no_header),
                          index_col='ID')

        data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
        names = ['a', 'b', 'c', 'd', 'message']
        xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11],
                        'd': [4, 8, 12]},
                       index=Index(['hello', 'world', 'foo'], name='message'))
        # index_col as a one-element list of names...
        rs = self.read_csv(StringIO(data), names=names, index_col=['message'])
        tm.assert_frame_equal(xp, rs)
        self.assertEqual(xp.index.name, rs.index.name)

        # ...and as a bare name string.
        rs = self.read_csv(StringIO(data), names=names, index_col='message')
        tm.assert_frame_equal(xp, rs)
        self.assertEqual(xp.index.name, rs.index.name)
    def test_converter_index_col_bug(self):
        # 1835: a converter on the index column must not break index_col.
        data = "A;B\n1;2\n3;4"
        rs = self.read_csv(StringIO(data), sep=';', index_col='A',
                           converters={'A': lambda x: x})
        xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A'))
        tm.assert_frame_equal(rs, xp)
        self.assertEqual(rs.index.name, xp.index.name)

    def test_date_parser_int_bug(self):
        # #3071: integer-valued date column fed through a custom parser.
        log_file = StringIO(
            'posix_timestamp,elapsed,sys,user,queries,query_time,rows,'
            'accountid,userid,contactid,level,silo,method\n'
            '1343103150,0.062353,0,4,6,0.01690,3,'
            '12345,1,-1,3,invoice_InvoiceResource,search\n'
        )

        def f(posix_string):
            return datetime.utcfromtimestamp(int(posix_string))

        # it works!
        read_csv(log_file, index_col=0, parse_dates=0, date_parser=f)

    def test_multiple_skts_example(self):
        # Placeholder test: the fixture is defined but nothing is asserted
        # yet.
        data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11."
        pass
    def test_malformed(self):
        """Rows with too many fields raise a parse error, both in one-shot
        reads and in every position of a chunked/iterator read."""
        # all
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
        try:
            df = self.read_table(
                StringIO(data), sep=',', header=1, comment='#')
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 4, saw 5', str(inst))

        # skip_footer
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
        # GH 6607
        # Test currently only valid with python engine because
        # skip_footer != 0. Temporarily copied to TestPythonParser.
        # Test for ValueError with other engines:
        try:
            with tm.assertRaisesRegexp(ValueError, 'skip_footer'):  # XXX
                df = self.read_table(
                    StringIO(data), sep=',', header=1, comment='#',
                    skip_footer=1)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 4, saw 5', str(inst))

        # first chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',',
                                 header=1, comment='#', iterator=True,
                                 chunksize=1, skiprows=[2])
            df = it.read(5)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))

        # middle chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',', header=1,
                                 comment='#', iterator=True, chunksize=1,
                                 skiprows=[2])
            df = it.read(1)
            it.read(2)
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))

        # last chunk
        data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
        try:
            it = self.read_table(StringIO(data), sep=',',
                                 header=1, comment='#', iterator=True,
                                 chunksize=1, skiprows=[2])
            df = it.read(1)
            it.read()
            self.assertTrue(False)
        except Exception as inst:
            self.assertIn('Expected 3 fields in line 6, saw 5', str(inst))
    def test_passing_dtype(self):
        # GH 6607
        # Passing dtype is currently only supported by the C engine.
        # Temporarily copied to TestCParser*.
        # Test for ValueError with other engines:
        with tm.assertRaisesRegexp(ValueError,
                                   "The 'dtype' option is not supported"):

            df = DataFrame(np.random.rand(5, 2), columns=list('AB'),
                           index=['1A', '1B', '1C', '1D', '1E'])

            with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
                df.to_csv(path)

                # GH 3795
                # passing 'str' as the dtype
                result = self.read_csv(path, dtype=str, index_col=0)
                tm.assert_series_equal(result.dtypes,
                                       Series({'A': 'object', 'B': 'object'}))

                # we expect all object columns, so need to convert to test
                # for equivalence
                result = result.astype(float)
                tm.assert_frame_equal(result, df)

                # invalid dtype
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'foo', 'B': 'float64'},
                                  index_col=0)

                # valid but we don't support it (date)
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'datetime64', 'B': 'float64'},
                                  index_col=0)
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'datetime64', 'B': 'float64'},
                                  index_col=0, parse_dates=['B'])

                # valid but we don't support it
                self.assertRaises(TypeError, self.read_csv, path,
                                  dtype={'A': 'timedelta64', 'B': 'float64'},
                                  index_col=0)

    def test_quoting(self):
        """An unterminated quoted field raises; terminating it makes the
        remaining rows readable."""
        bad_line_small = """printer\tresult\tvariant_name
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jacob
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jakob
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\t"Furststiftische Hofdruckerei, <Kempten""
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tGaller, Alois
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tHochfurstliche Buchhandlung <Kempten>"""
        self.assertRaises(Exception, self.read_table, StringIO(bad_line_small),
                          sep='\t')

        good_line_small = bad_line_small + '"'
        df = self.read_table(StringIO(good_line_small), sep='\t')
        self.assertEqual(len(df), 3)
    def test_non_string_na_values(self):
        # GH3611, na_values that are not a string are an issue
        with tm.ensure_clean('__non_string_na_values__.csv') as path:
            df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
            df.to_csv(path, sep=' ', index=False)
            # Numeric and string spellings of the sentinel must all match.
            result1 = read_csv(path, sep=' ', header=0,
                               na_values=['-999.0', '-999'])
            result2 = read_csv(path, sep=' ', header=0,
                               na_values=[-999, -999.0])
            result3 = read_csv(path, sep=' ', header=0,
                               na_values=[-999.0, -999])
            tm.assert_frame_equal(result1, result2)
            tm.assert_frame_equal(result2, result3)

            result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0'])
            result5 = read_csv(path, sep=' ', header=0, na_values=['-999'])
            result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0])
            result7 = read_csv(path, sep=' ', header=0, na_values=[-999])
            tm.assert_frame_equal(result4, result3)
            tm.assert_frame_equal(result5, result3)
            tm.assert_frame_equal(result6, result3)
            tm.assert_frame_equal(result7, result3)

            good_compare = result3

            # with an odd float format, so we can't match the string 999.0
            # exactly, but need float matching
            df.to_csv(path, sep=' ', index=False, float_format='%.3f')
            result1 = read_csv(path, sep=' ', header=0,
                               na_values=['-999.0', '-999'])
            result2 = read_csv(path, sep=' ', header=0,
                               na_values=[-999, -999.0])
            result3 = read_csv(path, sep=' ', header=0,
                               na_values=[-999.0, -999])
            tm.assert_frame_equal(result1, good_compare)
            tm.assert_frame_equal(result2, good_compare)
            tm.assert_frame_equal(result3, good_compare)

            result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0'])
            result5 = read_csv(path, sep=' ', header=0, na_values=['-999'])
            result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0])
            result7 = read_csv(path, sep=' ', header=0, na_values=[-999])
            tm.assert_frame_equal(result4, good_compare)
            tm.assert_frame_equal(result5, good_compare)
            tm.assert_frame_equal(result6, good_compare)
            tm.assert_frame_equal(result7, good_compare)

    def test_default_na_values(self):
        """Each default NA token, placed in every column position, parses
        to NaN."""
        _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                          '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'NaN',
                          'nan', '-NaN', '-nan', '#N/A N/A', ''])
        assert_array_equal(_NA_VALUES, parsers._NA_VALUES)
        nv = len(_NA_VALUES)

        def f(i, v):
            # Build a row that places NA token v in column i of nv columns.
            if i == 0:
                buf = ''
            elif i > 0:
                buf = ''.join([','] * i)

            buf = "{0}{1}".format(buf, v)

            if i < nv - 1:
                buf = "{0}{1}".format(buf, ''.join([','] * (nv - i - 1)))

            return buf

        data = StringIO('\n'.join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
        expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
        df = self.read_csv(data, header=None)
        tm.assert_frame_equal(df, expected)
    def test_custom_na_values(self):
        """Custom na_values accepts a list or a bare string, and combines
        with the built-in defaults."""
        data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
        expected = [[1., nan, 3],
                    [nan, 5, nan],
                    [7, 8, nan]]

        df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
        tm.assert_almost_equal(df.values, expected)

        df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
                              skiprows=[1])
        tm.assert_almost_equal(df2.values, expected)

        df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
                              skiprows=[1])
        tm.assert_almost_equal(df3.values, expected)

    def test_nat_parse(self):
        # GH 3062: NaT round-trips through to_csv / read_csv.
        df = DataFrame(dict({
            'A': np.asarray(lrange(10), dtype='float64'),
            'B': pd.Timestamp('20010101')}))
        df.iloc[3:6, :] = np.nan

        with tm.ensure_clean('__nat_parse_.csv') as path:
            df.to_csv(path)
            result = read_csv(path, index_col=0, parse_dates=['B'])
            tm.assert_frame_equal(result, df)

            expected = Series(dict(A='float64', B='datetime64[ns]'))
            tm.assert_series_equal(expected, result.dtypes)

            # test with NaT for the nan_rep
            # we don't have a method to specify the Datetime na_rep (it
            # defaults to '')
            df.to_csv(path)
            result = read_csv(path, index_col=0, parse_dates=['B'])
            tm.assert_frame_equal(result, df)

    def test_skiprows_bug(self):
        # GH #505: skiprows as a list and as an int must agree.
        text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
        data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None,
                             index_col=0, parse_dates=True)

        data2 = self.read_csv(StringIO(text), skiprows=6, header=None,
                              index_col=0, parse_dates=True)

        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
                             columns=[1, 2, 3],
                             index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
                                    datetime(2000, 1, 3)])
        expected.index.name = 0
        tm.assert_frame_equal(data, expected)
        tm.assert_frame_equal(data, data2)

    def test_deep_skiprows(self):
        # GH #4382: skipping rows deep in the file matches a hand-condensed
        # version of the same data.
        text = "a,b,c\n" + "\n".join([",".join([str(i), str(i + 1),
                                                str(i + 2)])
                                      for i in range(10)])
        condensed_text = "a,b,c\n" + "\n".join([",".join([str(i), str(i + 1),
                                                          str(i + 2)])
                                                for i in
                                                [0, 1, 2, 3, 4, 6, 8, 9]])
        data = self.read_csv(StringIO(text), skiprows=[6, 8])
        condensed_data = self.read_csv(StringIO(condensed_text))
        tm.assert_frame_equal(data, condensed_data)
    def test_detect_string_na(self):
        """Default NA tokens are detected inside string columns too."""
        data = """A,B
foo,bar
NA,baz
NaN,nan
"""
        expected = [['foo', 'bar'],
                    [nan, 'baz'],
                    [nan, nan]]

        df = self.read_csv(StringIO(data))
        tm.assert_almost_equal(df.values, expected)

    def test_unnamed_columns(self):
        """Header cells that are empty get 'Unnamed: N' column labels."""
        data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        expected = [[1, 2, 3, 4, 5.],
                    [6, 7, 8, 9, 10],
                    [11, 12, 13, 14, 15]]
        df = self.read_table(StringIO(data), sep=',')
        tm.assert_almost_equal(df.values, expected)
        self.assert_numpy_array_equal(df.columns,
                                      ['A', 'B', 'C', 'Unnamed: 3',
                                       'Unnamed: 4'])

    def test_string_nas(self):
        """Empty fields inside string columns become NaN."""
        data = """A,B,C
a,b,c
d,,f
,g,h
"""
        result = self.read_csv(StringIO(data))
        expected = DataFrame([['a', 'b', 'c'],
                              ['d', np.nan, 'f'],
                              [np.nan, 'g', 'h']],
                             columns=['A', 'B', 'C'])

        tm.assert_frame_equal(result, expected)

    def test_duplicate_columns(self):
        """Duplicate header names are mangled ('A.1', 'B.2', ...) unless
        mangle_dupe_cols=False, in both engines."""
        for engine in ['python', 'c']:
            data = """A,A,B,B,B
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
            # check default beahviour
            df = self.read_table(StringIO(data), sep=',', engine=engine)
            self.assertEqual(list(df.columns),
                             ['A', 'A.1', 'B', 'B.1', 'B.2'])

            df = self.read_table(StringIO(data), sep=',', engine=engine,
                                 mangle_dupe_cols=False)
            self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])

            df = self.read_table(StringIO(data), sep=',', engine=engine,
                                 mangle_dupe_cols=True)
            self.assertEqual(list(df.columns),
                             ['A', 'A.1', 'B', 'B.1', 'B.2'])
    def test_csv_mixed_type(self):
        # Smoke test: mixed string/int columns parse without error.
        data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
        df = self.read_csv(StringIO(data))
        # TODO

    def test_csv_custom_parser(self):
        """A custom date_parser gives the same result as parse_dates=True
        for %Y%m%d index values."""
        data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        f = lambda x: datetime.strptime(x, '%Y%m%d')
        df = self.read_csv(StringIO(data), date_parser=f)
        expected = self.read_csv(StringIO(data), parse_dates=True)
        tm.assert_frame_equal(df, expected)

    def test_parse_dates_implicit_first_col(self):
        """With one more data field than header names, the first column is
        the implicit index, and parse_dates=True parses it."""
        data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        df = self.read_csv(StringIO(data), parse_dates=True)
        expected = self.read_csv(StringIO(data), index_col=0,
                                 parse_dates=True)
        self.assertIsInstance(
            df.index[0], (datetime, np.datetime64, Timestamp))
        tm.assert_frame_equal(df, expected)

    def test_parse_dates_string(self):
        """parse_dates / index_col can both be given as a column name."""
        data = """date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        rs = self.read_csv(
            StringIO(data), index_col='date', parse_dates='date')
        idx = date_range('1/1/2009', periods=3)
        idx.name = 'date'
        xp = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [1, 3, 4],
                        'C': [2, 4, 5]}, idx)
        tm.assert_frame_equal(rs, xp)

    def test_yy_format(self):
        """Two-digit years in combined date+time columns are expanded to
        the 2000s, addressed by name and by position."""
        data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""
        rs = self.read_csv(StringIO(data), index_col=0,
                           parse_dates=[['date', 'time']])
        idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                             datetime(2009, 2, 28, 10, 20, 0),
                             datetime(2009, 3, 31, 8, 30, 0)]).asobject
        idx.name = 'date_time'
        xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
        tm.assert_frame_equal(rs, xp)

        rs = self.read_csv(StringIO(data), index_col=0,
                           parse_dates=[[0, 1]])
        idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                             datetime(2009, 2, 28, 10, 20, 0),
                             datetime(2009, 3, 31, 8, 30, 0)]).asobject
        idx.name = 'date_time'
        xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
        tm.assert_frame_equal(rs, xp)
    def test_parse_dates_column_list(self):
        """parse_dates accepts positions or names for columns that are part
        of a MultiIndex, with dayfirst dates."""
        from pandas.core.datetools import to_datetime

        data = '''date;destination;ventilationcode;unitcode;units;aux_date

01/01/2010;P;P;50;1;12/1/2011

01/01/2010;P;R;50;1;13/1/2011

15/01/2010;P;P;50;1;14/1/2011

01/05/2010;P;P;50;1;15/1/2011'''

        expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4))

        lev = expected.index.levels[0]
        levels = list(expected.index.levels)
        levels[0] = lev.to_datetime(dayfirst=True)
        # hack to get this to work - remove for final test
        levels[0].name = lev.name
        expected.index.set_levels(levels, inplace=True)
        expected['aux_date'] = to_datetime(expected['aux_date'],
                                           dayfirst=True)
        expected['aux_date'] = lmap(Timestamp, expected['aux_date'])
        tm.assert_isinstance(expected['aux_date'][0], datetime)

        df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4),
                           parse_dates=[0, 5], dayfirst=True)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4),
                           parse_dates=['date', 'aux_date'], dayfirst=True)
        tm.assert_frame_equal(df, expected)

    def test_no_header(self):
        """header=None yields integer column labels; names=... and
        prefix=... override them."""
        data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        df = self.read_table(StringIO(data), sep=',', header=None)
        df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                                  header=None)

        names = ['foo', 'bar', 'baz', 'quux', 'panda']
        df2 = self.read_table(StringIO(data), sep=',', names=names)
        expected = [[1, 2, 3, 4, 5.],
                    [6, 7, 8, 9, 10],
                    [11, 12, 13, 14, 15]]
        tm.assert_almost_equal(df.values, expected)
        tm.assert_almost_equal(df.values, df2.values)

        self.assert_numpy_array_equal(df_pref.columns,
                                      ['X0', 'X1', 'X2', 'X3', 'X4'])
        self.assert_numpy_array_equal(df.columns, lrange(5))

        self.assert_numpy_array_equal(df2.columns, names)

    def test_no_header_prefix(self):
        """A custom prefix is applied to auto-generated column labels."""
        data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        df_pref = self.read_table(StringIO(data), sep=',', prefix='Field',
                                  header=None)

        expected = [[1, 2, 3, 4, 5.],
                    [6, 7, 8, 9, 10],
                    [11, 12, 13, 14, 15]]
        tm.assert_almost_equal(df_pref.values, expected)

        self.assert_numpy_array_equal(df_pref.columns,
                                      ['Field0', 'Field1', 'Field2',
                                       'Field3', 'Field4'])
    def test_header_with_index_col(self):
        """names=... with one fewer name than fields makes the last column
        the index; the names list itself is not mutated."""
        data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        names = ['A', 'B', 'C']
        df = self.read_csv(StringIO(data), names=names)

        self.assertEqual(names, ['A', 'B', 'C'])

        values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        expected = DataFrame(values, index=['foo', 'bar', 'baz'],
                             columns=['A', 'B', 'C'])
        tm.assert_frame_equal(df, expected)

    def test_read_csv_dataframe(self):
        """read_csv and read_table agree on the on-disk test1.csv fixture."""
        df = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        df2 = self.read_table(self.csv1, sep=',', index_col=0,
                              parse_dates=True)
        self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D'])
        self.assertEqual(df.index.name, 'index')
        self.assertIsInstance(
            df.index[0], (datetime, np.datetime64, Timestamp))
        self.assertEqual(df.values.dtype, np.float64)
        tm.assert_frame_equal(df, df2)

    def test_read_csv_no_index_name(self):
        """test2.csv has an unnamed index column; both readers handle it."""
        df = self.read_csv(self.csv2, index_col=0, parse_dates=True)
        df2 = self.read_table(self.csv2, sep=',', index_col=0,
                              parse_dates=True)
        self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E'])
        self.assertIsInstance(
            df.index[0], (datetime, np.datetime64, Timestamp))
        self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype,
                         np.float64)
        tm.assert_frame_equal(df, df2)

    def test_read_table_unicode(self):
        """UTF-8 encoded bytes decode to text values."""
        fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
        df1 = read_table(fin, sep=";", encoding="utf-8", header=None)
        tm.assert_isinstance(df1[0].values[0], compat.text_type)

    def test_read_table_wrong_num_columns(self):
        # too few!
        data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
        self.assertRaises(Exception, self.read_csv, StringIO(data))

    def test_read_table_duplicate_index(self):
        """Duplicate index labels are allowed via index_col and match
        set_index without integrity checking."""
        data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
        result = self.read_csv(StringIO(data), index_col=0)
        expected = self.read_csv(StringIO(data)).set_index('index',
                                                           verify_integrity=False)
        tm.assert_frame_equal(result, expected)

    def test_read_table_duplicate_index_implicit(self):
        """Duplicate labels in the implicit first-column index parse
        without error."""
        data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
        # it works!
        result = self.read_csv(StringIO(data))
- def test_parse_bools(self):
- data = """A,B
- True,1
- False,2
- True,3
- """
- data = self.read_csv(StringIO(data))
- self.assertEqual(data['A'].dtype, np.bool_)
- data = """A,B
- YES,1
- no,2
- yes,3
- No,3
- Yes,3
- """
- data = self.read_csv(StringIO(data),
- true_values=['yes', 'Yes', 'YES'],
- false_values=['no', 'NO', 'No'])
- self.assertEqual(data['A'].dtype, np.bool_)
- data = """A,B
- TRUE,1
- FALSE,2
- TRUE,3
- """
- data = self.read_csv(StringIO(data))
- self.assertEqual(data['A'].dtype, np.bool_)
- data = """A,B
- foo,bar
- bar,foo"""
- result = self.read_csv(StringIO(data), true_values=['foo'],
- false_values=['bar'])
- expected = DataFrame({'A': [True, False], 'B': [False, True]})
- tm.assert_frame_equal(result, expected)
- def test_int_conversion(self):
- data = """A,B
- 1.0,1
- 2.0,2
- 3.0,3
- """
- data = self.read_csv(StringIO(data))
- self.assertEqual(data['A'].dtype, np.float64)
- self.assertEqual(data['B'].dtype, np.int64)
- def test_infer_index_col(self):
- data = """A,B,C
- foo,1,2,3
- bar,4,5,6
- baz,7,8,9
- """
- data = self.read_csv(StringIO(data))
- self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz'])))
- def test_read_nrows(self):
- df = self.read_csv(StringIO(self.data1), nrows=3)
- expected = self.read_csv(StringIO(self.data1))[:3]
- tm.assert_frame_equal(df, expected)
- def test_read_chunksize(self):
- reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
- df = self.read_csv(StringIO(self.data1), index_col=0)
- chunks = list(reader)
- tm.assert_frame_equal(chunks[0], df[:2])
- tm.assert_frame_equal(chunks[1], df[2:4])
- tm.assert_frame_equal(chunks[2], df[4:])
- def test_read_chunksize_named(self):
- reader = self.read_csv(
- StringIO(self.data1), index_col='index', chunksize=2)
- df = self.read_csv(StringIO(self.data1), index_col='index')
- chunks = list(reader)
- tm.assert_frame_equal(chunks[0], df[:2])
- tm.assert_frame_equal(chunks[1], df[2:4])
- tm.assert_frame_equal(chunks[2], df[4:])
- def test_get_chunk_passed_chunksize(self):
- data = """A,B,C
- 1,2,3
- 4,5,6
- 7,8,9
- 1,2,3"""
- result = self.read_csv(StringIO(data), chunksize=2)
- piece = result.get_chunk()
- self.assertEqual(len(piece), 2)
- def test_read_text_list(self):
- data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
- as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',
- '4', '5', '6']]
- df = self.read_csv(StringIO(data), index_col=0)
- parser = TextParser(as_list, index_col=0, chunksize=2)
- chunk = parser.read(None)
- tm.assert_frame_equal(chunk, df)
    def test_iterator(self):
        # GH 6607
        # Test currently only valid with python engine because
        # skip_footer != 0. Temporarily copied to TestPythonParser.
        # Test for ValueError with other engines:
        # NOTE(review): the whole body is guarded by assertRaisesRegexp.
        # Presumably, for engines that reject skip_footer, the read_csv
        # call below with skip_footer=True raises ValueError and the
        # remainder of the body is skipped -- confirm against the
        # TestPythonParser copy.
        with tm.assertRaisesRegexp(ValueError, 'skip_footer'):
            reader = self.read_csv(StringIO(self.data1), index_col=0,
                                   iterator=True)
            df = self.read_csv(StringIO(self.data1), index_col=0)

            # read(n) returns the next n rows
            chunk = reader.read(3)
            tm.assert_frame_equal(chunk, df[:3])

            # asking for more rows than remain returns just the rest
            last_chunk = reader.read(5)
            tm.assert_frame_equal(last_chunk, df[3:])

            # pass list
            lines = list(csv.reader(StringIO(self.data1)))
            parser = TextParser(lines, index_col=0, chunksize=2)

            df = self.read_csv(StringIO(self.data1), index_col=0)

            chunks = list(parser)
            tm.assert_frame_equal(chunks[0], df[:2])
            tm.assert_frame_equal(chunks[1], df[2:4])
            tm.assert_frame_equal(chunks[2], df[4:])

            # pass skiprows
            parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
            chunks = list(parser)
            tm.assert_frame_equal(chunks[0], df[1:3])

            # test bad parameter (skip_footer)
            reader = self.read_csv(StringIO(self.data1), index_col=0,
                                   iterator=True, skip_footer=True)
            self.assertRaises(ValueError, reader.read, 3)

            treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                                      iterator=True)
            tm.assert_isinstance(treader, TextFileReader)

            # stopping iteration when on chunksize is specified, GH 3967
            data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
            # iterator=True with no chunksize: iterating yields the whole
            # frame as a single chunk
            reader = self.read_csv(StringIO(data), iterator=True)
            result = list(reader)
            expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
            tm.assert_frame_equal(result[0], expected)

            # chunksize = 1
            reader = self.read_csv(StringIO(data), chunksize=1)
            result = list(reader)
            expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
            self.assertEqual(len(result), 3)
            tm.assert_frame_equal(pd.concat(result), expected)
- def test_header_not_first_line(self):
- data = """got,to,ignore,this,line
- got,to,ignore,this,line
- index,A,B,C,D
- foo,2,3,4,5
- bar,7,8,9,10
- baz,12,13,14,15
- """
- data2 = """index,A,B,C,D
- foo,2,3,4,5
- bar,7,8,9,10
- baz,12,13,14,15
- """
- df = self.read_csv(StringIO(data), header=2, index_col=0)
- expected = self.read_csv(StringIO(data2), header=0, index_col=0)
- tm.assert_frame_equal(df, expected)
    def test_header_multi_index(self):
        # Multi-row header (4 column levels) combined with a 2-level row
        # index, parsed with tupleize_cols=False; also checks that
        # incompatible options are rejected for multi-row headers.
        expected = tm.makeCustomDataframe(5,3,r_idx_nlevels=2,c_idx_nlevels=4)

        data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
        df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
        tm.assert_frame_equal(df, expected)

        # skipping lines in the header
        # NOTE(review): this call is identical to the one above; it was
        # presumably meant to exercise a different header= row selection.
        # Confirm the intended header values against upstream history.
        df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
        tm.assert_frame_equal(df, expected)

        #### invalid options ####

        # no as_recarray
        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                          index_col=[0,1], as_recarray=True, tupleize_cols=False)

        # names
        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                          index_col=[0,1], names=['foo','bar'], tupleize_cols=False)

        # usecols
        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                          index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False)

        # non-numeric index_col
        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                          index_col=['foo','bar'], tupleize_cols=False)
    def test_header_multiindex_common_format(self):
        # Two-row column header in the "common" format that to_csv emits,
        # with and without an index column, plus malformed variants where
        # header cells spill into the index-name slot.
        df = DataFrame([[1,2,3,4,5,6],[7,8,9,10,11,12]],
                       index=['one','two'],
                       columns=MultiIndex.from_tuples([('a','q'),('a','r'),('a','s'),
                                                       ('b','t'),('c','u'),('c','v')]))

        # to_csv: blank separator line between header rows and data
        data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
        tm.assert_frame_equal(df,result)

        # common: same header, no separator line
        data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
        tm.assert_frame_equal(df,result)

        # common, no index_col: compare against the frame with its index
        # dropped
        data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data),header=[0,1],index_col=None)
        tm.assert_frame_equal(df.reset_index(drop=True),result)

        # malformed case 1: no blank leading cells in the header rows,
        # so the first header column is consumed as the index names
        # ('a', 'q') -- see the names= in the expected MultiIndex
        expected = DataFrame(np.array([[2, 3, 4, 5, 6],
                                       [8, 9, 10, 11, 12]], dtype='int64'),
                             index=Index([1, 7]),
                             columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]],
                                                labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                                                names=[u('a'), u('q')]))

        data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
        tm.assert_frame_equal(expected,result)

        # malformed case 2: blank cell only in the first header row, so
        # only the second level gets an index name ('q')
        expected = DataFrame(np.array([[2, 3, 4, 5, 6],
                                       [8, 9, 10, 11, 12]], dtype='int64'),
                             index=Index([1, 7]),
                             columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]],
                                                labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                                                names=[None, u('q')]))

        data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data),header=[0,1],index_col=0)
        tm.assert_frame_equal(expected,result)

        # mi on columns and index (malformed): two index columns eat the
        # first two header cells
        expected = DataFrame(np.array([[ 3, 4, 5, 6],
                                       [ 9, 10, 11, 12]], dtype='int64'),
                             index=MultiIndex(levels=[[1, 7], [2, 8]],
                                              labels=[[0, 1], [0, 1]]),
                             columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('s'), u('t'), u('u'), u('v')]],
                                                labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
                                                names=[None, u('q')]))

        data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data),header=[0,1],index_col=[0, 1])
        tm.assert_frame_equal(expected,result)
- def test_pass_names_with_index(self):
- lines = self.data1.split('\n')
- no_header = '\n'.join(lines[1:])
- # regular index
- names = ['index', 'A', 'B', 'C', 'D']
- df = self.read_csv(StringIO(no_header), index_col=0, names=names)
- expected = self.read_csv(StringIO(self.data1), index_col=0)
- tm.assert_fram…
Large files files are truncated, but you can click here to view the full file