PageRenderTime 42ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/itmi_vcfqc_optimized/pymodules/python2.7/lib/python/pandas-0.17.1-py2.7-linux-x86_64.egg/pandas/io/tests/test_cparser.py

https://gitlab.com/pooja043/Globus_Docker_4
Python | 407 lines | 349 code | 51 blank | 7 comment | 7 complexity | c950cdc7ca4ced0c511aeaa3a08eaacd MD5 | raw file
  1. """
  2. C/Cython ascii file parser tests
  3. """
  4. from pandas.compat import StringIO, BytesIO, map
  5. from datetime import datetime
  6. from pandas import compat
  7. import csv
  8. import os
  9. import sys
  10. import re
  11. import nose
  12. from numpy import nan
  13. import numpy as np
  14. from pandas import DataFrame, Series, Index, isnull, MultiIndex
  15. import pandas.io.parsers as parsers
  16. from pandas.io.parsers import (read_csv, read_table, read_fwf,
  17. TextParser, TextFileReader)
  18. from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
  19. assert_series_equal, network)
  20. import pandas.lib as lib
  21. from pandas import compat
  22. from pandas.lib import Timestamp
  23. import pandas.util.testing as tm
  24. from pandas.parser import TextReader
  25. import pandas.parser as parser
  26. class TestCParser(tm.TestCase):
  27. def setUp(self):
  28. self.dirpath = tm.get_data_path()
  29. self.csv1 = os.path.join(self.dirpath, 'test1.csv')
  30. self.csv2 = os.path.join(self.dirpath, 'test2.csv')
  31. self.xls1 = os.path.join(self.dirpath, 'test.xls')
  32. def test_file_handle(self):
  33. try:
  34. f = open(self.csv1, 'rb')
  35. reader = TextReader(f)
  36. result = reader.read()
  37. finally:
  38. f.close()
  39. def test_string_filename(self):
  40. reader = TextReader(self.csv1, header=None)
  41. result = reader.read()
  42. def test_file_handle_mmap(self):
  43. try:
  44. f = open(self.csv1, 'rb')
  45. reader = TextReader(f, memory_map=True, header=None)
  46. result = reader.read()
  47. finally:
  48. f.close()
  49. def test_StringIO(self):
  50. text = open(self.csv1, 'rb').read()
  51. src = BytesIO(text)
  52. reader = TextReader(src, header=None)
  53. result = reader.read()
  54. def test_string_factorize(self):
  55. # should this be optional?
  56. data = 'a\nb\na\nb\na'
  57. reader = TextReader(StringIO(data), header=None)
  58. result = reader.read()
  59. self.assertEqual(len(set(map(id, result[0]))), 2)
  60. def test_skipinitialspace(self):
  61. data = ('a, b\n'
  62. 'a, b\n'
  63. 'a, b\n'
  64. 'a, b')
  65. reader = TextReader(StringIO(data), skipinitialspace=True,
  66. header=None)
  67. result = reader.read()
  68. self.assert_numpy_array_equal(result[0], ['a', 'a', 'a', 'a'])
  69. self.assert_numpy_array_equal(result[1], ['b', 'b', 'b', 'b'])
  70. def test_parse_booleans(self):
  71. data = 'True\nFalse\nTrue\nTrue'
  72. reader = TextReader(StringIO(data), header=None)
  73. result = reader.read()
  74. self.assertEqual(result[0].dtype, np.bool_)
  75. def test_delimit_whitespace(self):
  76. data = 'a b\na\t\t "b"\n"a"\t \t b'
  77. reader = TextReader(StringIO(data), delim_whitespace=True,
  78. header=None)
  79. result = reader.read()
  80. self.assert_numpy_array_equal(result[0], ['a', 'a', 'a'])
  81. self.assert_numpy_array_equal(result[1], ['b', 'b', 'b'])
  82. def test_embedded_newline(self):
  83. data = 'a\n"hello\nthere"\nthis'
  84. reader = TextReader(StringIO(data), header=None)
  85. result = reader.read()
  86. expected = ['a', 'hello\nthere', 'this']
  87. self.assert_numpy_array_equal(result[0], expected)
  88. def test_euro_decimal(self):
  89. data = '12345,67\n345,678'
  90. reader = TextReader(StringIO(data), delimiter=':',
  91. decimal=',', header=None)
  92. result = reader.read()
  93. expected = [12345.67, 345.678]
  94. tm.assert_almost_equal(result[0], expected)
  95. def test_integer_thousands(self):
  96. data = '123,456\n12,500'
  97. reader = TextReader(StringIO(data), delimiter=':',
  98. thousands=',', header=None)
  99. result = reader.read()
  100. expected = [123456, 12500]
  101. tm.assert_almost_equal(result[0], expected)
  102. def test_integer_thousands_alt(self):
  103. data = '123.456\n12.500'
  104. reader = TextFileReader(StringIO(data), delimiter=':',
  105. thousands='.', header=None)
  106. result = reader.read()
  107. expected = [123456, 12500]
  108. tm.assert_almost_equal(result[0], expected)
  109. def test_skip_bad_lines(self):
  110. # too many lines, see #2430 for why
  111. data = ('a:b:c\n'
  112. 'd:e:f\n'
  113. 'g:h:i\n'
  114. 'j:k:l:m\n'
  115. 'l:m:n\n'
  116. 'o:p:q:r')
  117. reader = TextReader(StringIO(data), delimiter=':',
  118. header=None)
  119. self.assertRaises(parser.CParserError, reader.read)
  120. reader = TextReader(StringIO(data), delimiter=':',
  121. header=None,
  122. error_bad_lines=False,
  123. warn_bad_lines=False)
  124. result = reader.read()
  125. expected = {0: ['a', 'd', 'g', 'l'],
  126. 1: ['b', 'e', 'h', 'm'],
  127. 2: ['c', 'f', 'i', 'n']}
  128. assert_array_dicts_equal(result, expected)
  129. stderr = sys.stderr
  130. sys.stderr = StringIO()
  131. try:
  132. reader = TextReader(StringIO(data), delimiter=':',
  133. header=None,
  134. error_bad_lines=False,
  135. warn_bad_lines=True)
  136. reader.read()
  137. val = sys.stderr.getvalue()
  138. self.assertTrue('Skipping line 4' in val)
  139. self.assertTrue('Skipping line 6' in val)
  140. finally:
  141. sys.stderr = stderr
  142. def test_header_not_enough_lines(self):
  143. data = ('skip this\n'
  144. 'skip this\n'
  145. 'a,b,c\n'
  146. '1,2,3\n'
  147. '4,5,6')
  148. reader = TextReader(StringIO(data), delimiter=',', header=2)
  149. header = reader.header
  150. expected = [['a', 'b', 'c']]
  151. self.assertEqual(header, expected)
  152. recs = reader.read()
  153. expected = {0 : [1, 4], 1 : [2, 5], 2 : [3, 6]}
  154. assert_array_dicts_equal(expected, recs)
  155. # not enough rows
  156. self.assertRaises(parser.CParserError, TextReader, StringIO(data),
  157. delimiter=',', header=5, as_recarray=True)
  158. def test_header_not_enough_lines_as_recarray(self):
  159. if compat.is_platform_windows():
  160. raise nose.SkipTest("segfaults on win-64, only when all tests are run")
  161. data = ('skip this\n'
  162. 'skip this\n'
  163. 'a,b,c\n'
  164. '1,2,3\n'
  165. '4,5,6')
  166. reader = TextReader(StringIO(data), delimiter=',', header=2,
  167. as_recarray=True)
  168. header = reader.header
  169. expected = [['a', 'b', 'c']]
  170. self.assertEqual(header, expected)
  171. recs = reader.read()
  172. expected = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}
  173. assert_array_dicts_equal(expected, recs)
  174. # not enough rows
  175. self.assertRaises(parser.CParserError, TextReader, StringIO(data),
  176. delimiter=',', header=5, as_recarray=True)
  177. def test_escapechar(self):
  178. data = ('\\"hello world\"\n'
  179. '\\"hello world\"\n'
  180. '\\"hello world\"')
  181. reader = TextReader(StringIO(data), delimiter=',', header=None,
  182. escapechar='\\')
  183. result = reader.read()
  184. expected = {0: ['"hello world"'] * 3}
  185. assert_array_dicts_equal(result, expected)
  186. def test_eof_has_eol(self):
  187. # handling of new line at EOF
  188. pass
  189. def test_na_substitution(self):
  190. pass
  191. def test_numpy_string_dtype(self):
  192. data = """\
  193. a,1
  194. aa,2
  195. aaa,3
  196. aaaa,4
  197. aaaaa,5"""
  198. def _make_reader(**kwds):
  199. return TextReader(StringIO(data), delimiter=',', header=None,
  200. **kwds)
  201. reader = _make_reader(dtype='S5,i4')
  202. result = reader.read()
  203. self.assertEqual(result[0].dtype, 'S5')
  204. ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
  205. self.assertTrue((result[0] == ex_values).all())
  206. self.assertEqual(result[1].dtype, 'i4')
  207. reader = _make_reader(dtype='S4')
  208. result = reader.read()
  209. self.assertEqual(result[0].dtype, 'S4')
  210. ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
  211. self.assertTrue((result[0] == ex_values).all())
  212. self.assertEqual(result[1].dtype, 'S4')
  213. def test_numpy_string_dtype_as_recarray(self):
  214. data = """\
  215. a,1
  216. aa,2
  217. aaa,3
  218. aaaa,4
  219. aaaaa,5"""
  220. if compat.is_platform_windows():
  221. raise nose.SkipTest("segfaults on win-64, only when all tests are run")
  222. def _make_reader(**kwds):
  223. return TextReader(StringIO(data), delimiter=',', header=None,
  224. **kwds)
  225. reader = _make_reader(dtype='S4', as_recarray=True)
  226. result = reader.read()
  227. self.assertEqual(result['0'].dtype, 'S4')
  228. ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
  229. self.assertTrue((result['0'] == ex_values).all())
  230. self.assertEqual(result['1'].dtype, 'S4')
  231. def test_pass_dtype(self):
  232. data = """\
  233. one,two
  234. 1,a
  235. 2,b
  236. 3,c
  237. 4,d"""
  238. def _make_reader(**kwds):
  239. return TextReader(StringIO(data), delimiter=',', **kwds)
  240. reader = _make_reader(dtype={'one': 'u1', 1: 'S1'})
  241. result = reader.read()
  242. self.assertEqual(result[0].dtype, 'u1')
  243. self.assertEqual(result[1].dtype, 'S1')
  244. reader = _make_reader(dtype={'one': np.uint8, 1: object})
  245. result = reader.read()
  246. self.assertEqual(result[0].dtype, 'u1')
  247. self.assertEqual(result[1].dtype, 'O')
  248. reader = _make_reader(dtype={'one': np.dtype('u1'),
  249. 1: np.dtype('O')})
  250. result = reader.read()
  251. self.assertEqual(result[0].dtype, 'u1')
  252. self.assertEqual(result[1].dtype, 'O')
  253. def test_usecols(self):
  254. data = """\
  255. a,b,c
  256. 1,2,3
  257. 4,5,6
  258. 7,8,9
  259. 10,11,12"""
  260. def _make_reader(**kwds):
  261. return TextReader(StringIO(data), delimiter=',', **kwds)
  262. reader = _make_reader(usecols=(1, 2))
  263. result = reader.read()
  264. exp = _make_reader().read()
  265. self.assertEqual(len(result), 2)
  266. self.assertTrue((result[1] == exp[1]).all())
  267. self.assertTrue((result[2] == exp[2]).all())
  268. def test_cr_delimited(self):
  269. def _test(text, **kwargs):
  270. nice_text = text.replace('\r', '\r\n')
  271. result = TextReader(StringIO(text), **kwargs).read()
  272. expected = TextReader(StringIO(nice_text), **kwargs).read()
  273. assert_array_dicts_equal(result, expected)
  274. data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
  275. _test(data, delimiter=',')
  276. data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12'
  277. _test(data, delim_whitespace=True)
  278. data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
  279. _test(data, delimiter=',')
  280. sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
  281. 'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
  282. ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
  283. _test(sample, delimiter=',')
  284. data = 'A B C\r 2 3\r4 5 6'
  285. _test(data, delim_whitespace=True)
  286. data = 'A B C\r2 3\r4 5 6'
  287. _test(data, delim_whitespace=True)
  288. def test_empty_field_eof(self):
  289. data = 'a,b,c\n1,2,3\n4,,'
  290. result = TextReader(StringIO(data), delimiter=',').read()
  291. expected = {0: np.array([1, 4]),
  292. 1: np.array(['2', ''], dtype=object),
  293. 2: np.array(['3', ''], dtype=object)}
  294. assert_array_dicts_equal(result, expected)
  295. # GH5664
  296. a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
  297. b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
  298. columns=list('abcd'),
  299. index=[1, 1])
  300. c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
  301. [8, 9, 10, 11], [13, 14, nan, nan]],
  302. columns=list('abcd'),
  303. index=[0, 5, 7, 12])
  304. for _ in range(100):
  305. df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
  306. names=['a'], engine='c')
  307. assert_frame_equal(df, a)
  308. df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2),
  309. names=list("abcd"), engine='c')
  310. assert_frame_equal(df, b)
  311. df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
  312. names=list('abcd'), engine='c')
  313. assert_frame_equal(df, c)
  314. def assert_array_dicts_equal(left, right):
  315. for k, v in compat.iteritems(left):
  316. assert(np.array_equal(v, right[k]))
  317. if __name__ == '__main__':
  318. nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
  319. exit=False)