
/pandas/io/parsers.py

http://github.com/wesm/pandas
  1. """
  2. Module contains tools for processing files into DataFrames or other objects
  3. """
  4. from __future__ import print_function
  5. from collections import defaultdict
  6. import csv
  7. import datetime
  8. import re
  9. import sys
  10. from textwrap import fill
  11. import warnings
  12. import numpy as np
  13. import pandas._libs.lib as lib
  14. import pandas._libs.ops as libops
  15. import pandas._libs.parsers as parsers
  16. from pandas._libs.tslibs import parsing
  17. import pandas.compat as compat
  18. from pandas.compat import (
  19. PY3, StringIO, lrange, lzip, map, range, string_types, u, zip)
  20. from pandas.errors import (
  21. AbstractMethodError, EmptyDataError, ParserError, ParserWarning)
  22. from pandas.util._decorators import Appender
  23. from pandas.core.dtypes.cast import astype_nansafe
  24. from pandas.core.dtypes.common import (
  25. ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
  26. is_extension_array_dtype, is_float, is_integer, is_integer_dtype,
  27. is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype)
  28. from pandas.core.dtypes.dtypes import CategoricalDtype
  29. from pandas.core.dtypes.missing import isna
  30. from pandas.core import algorithms
  31. from pandas.core.arrays import Categorical
  32. from pandas.core.frame import DataFrame
  33. from pandas.core.index import (
  34. Index, MultiIndex, RangeIndex, ensure_index_from_sequences)
  35. from pandas.core.series import Series
  36. from pandas.core.tools import datetimes as tools
  37. from pandas.io.common import (
  38. _NA_VALUES, BaseIterator, UnicodeReader, UTF8Recoder, _get_handle,
  39. _infer_compression, _validate_header_arg, get_filepath_or_buffer,
  40. is_file_like)
  41. from pandas.io.date_converters import generic_parser
  42. # BOM character (byte order mark)
  43. # This exists at the beginning of a file to indicate endianness
  44. # of a file (stream). Unfortunately, this marker screws up parsing,
  45. # so we need to remove it if we see it.
  46. _BOM = u('\ufeff')

_doc_read_csv_and_table = r"""
{summary}

Also supports optionally iterating or breaking of the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <http://pandas.pydata.org/pandas-docs/stable/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object, or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts either
    ``pathlib.Path`` or ``py._path.local.LocalPath``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and automatically detect the separator by Python's builtin sniffer
    tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
    different from ``'\s+'`` will be interpreted as regular expressions and
    will also force the use of the Python parsing engine. Note that regex
    delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If file contains no header row, then you
    should explicitly pass ``header=None``. Duplicates in this list will cause
    a ``UserWarning`` to be issued.
index_col : int, sequence or bool, optional
    Column to use as the row labels of the DataFrame. If a sequence is given, a
    MultiIndex is used. If you have a malformed file with delimiters at the end
    of each line, you might consider ``index_col=False`` to force pandas to
    not use the first column as the index (row names).
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : {{'c', 'python'}}, optional
    Parser engine to use. The C engine is faster while the python engine is
    currently more feature-complete.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '""" + fill("', '".join(sorted(_NA_VALUES)),
                     70, subsequent_indent="    ") + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparseable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.

    .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
tupleize_cols : bool, default False
    Leave a list of tuples on columns as is (default is to convert to
    a MultiIndex on the columns).

    .. deprecated:: 0.21.0
       This argument will be removed and will always convert to MultiIndex
error_bad_lines : bool, default True
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.
warn_bad_lines : bool, default True
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.
delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.

    .. versionadded:: 0.18.1 support for the Python parser.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless,
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are `None` for the ordinary converter,
    `high` for the high-precision converter, and `round_trip` for the
    round-trip converter.

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as two-dimensional
    data structure with labeled axes.

See Also
--------
to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""


def _validate_integer(name, val, min_val=0):
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : string
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
                                                               min_val=min_val)

    if val is not None:
        if is_float(val):
            if int(val) != val:
                raise ValueError(msg)
            val = int(val)
        elif not (is_integer(val) and val >= min_val):
            raise ValueError(msg)

    return val
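
# Illustrative sketch (not part of the upstream file): how _validate_integer
# treats int-like floats versus lossy floats.
#
#   _validate_integer('nrows', 3.0)               # -> 3 (float safely cast)
#   _validate_integer('nrows', 3.5)               # -> ValueError
#   _validate_integer('chunksize', 0, min_val=1)  # -> ValueError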


def _validate_names(names):
    """
    Check if the `names` parameter contains duplicates.

    If duplicates are found, we issue a warning before returning.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Returns
    -------
    names : array-like or None
        The original `names` parameter.
    """
    if names is not None:
        if len(names) != len(set(names)):
            msg = ("Duplicate names specified. This "
                   "will raise an error in the future.")
            warnings.warn(msg, UserWarning, stacklevel=3)

    return names
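
# Illustrative sketch (not part of the upstream file): duplicate names only
# warn today; they do not raise.
#
#   _validate_names(['a', 'b', 'a'])  # emits UserWarning, returns the list
#   _validate_names(['a', 'b', 'c'])  # silent, returns the list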


def _read(filepath_or_buffer, kwds):
    """Generic reader of line files."""
    encoding = kwds.get('encoding', None)
    if encoding is not None:
        encoding = re.sub('_', '-', encoding).lower()
        kwds['encoding'] = encoding

    compression = kwds.get('compression', 'infer')
    compression = _infer_compression(filepath_or_buffer, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
        filepath_or_buffer, encoding, compression)
    kwds['compression'] = compression

    if kwds.get('date_parser', None) is not None:
        if isinstance(kwds['parse_dates'], bool):
            kwds['parse_dates'] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get('iterator', False)
    chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
    nrows = kwds.get('nrows', None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        return parser

    try:
        data = parser.read(nrows)
    finally:
        parser.close()

    if should_close:
        try:
            filepath_or_buffer.close()
        except ValueError:
            pass

    return data
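
# Illustrative sketch (not part of the upstream file): the two return paths
# of _read. With iterator/chunksize the TextFileReader itself is returned;
# otherwise it is fully consumed and closed.
#
#   _read(buf, {'chunksize': 1000, ...})  # -> TextFileReader
#   _read(buf, {...})                     # -> DataFrame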


_parser_defaults = {
    'delimiter': None,
    'escapechar': None,
    'quotechar': '"',
    'quoting': csv.QUOTE_MINIMAL,
    'doublequote': True,
    'skipinitialspace': False,
    'lineterminator': None,
    'header': 'infer',
    'index_col': None,
    'names': None,
    'prefix': None,
    'skiprows': None,
    'skipfooter': 0,
    'nrows': None,
    'na_values': None,
    'keep_default_na': True,
    'true_values': None,
    'false_values': None,
    'converters': None,
    'dtype': None,
    'thousands': None,
    'comment': None,
    'decimal': b'.',
    # 'engine': 'c',
    'parse_dates': False,
    'keep_date_col': False,
    'dayfirst': False,
    'date_parser': None,
    'usecols': None,
    # 'iterator': False,
    'chunksize': None,
    'verbose': False,
    'encoding': None,
    'squeeze': False,
    'compression': None,
    'mangle_dupe_cols': True,
    'tupleize_cols': False,
    'infer_datetime_format': False,
    'skip_blank_lines': True
}


_c_parser_defaults = {
    'delim_whitespace': False,
    'na_filter': True,
    'low_memory': True,
    'memory_map': False,
    'error_bad_lines': True,
    'warn_bad_lines': True,
    'tupleize_cols': False,
    'float_precision': None
}

_fwf_defaults = {
    'colspecs': 'infer',
    'infer_nrows': 100,
    'widths': None,
}

_c_unsupported = {'skipfooter'}
_python_unsupported = {
    'low_memory',
    'float_precision',
}

_deprecated_defaults = {
    'tupleize_cols': None
}
_deprecated_args = {
    'tupleize_cols',
}


def _make_parser_function(name, default_sep=','):

    # prepare read_table deprecation
    if name == "read_table":
        sep = False
    else:
        sep = default_sep

    def parser_f(filepath_or_buffer,
                 sep=sep,
                 delimiter=None,

                 # Column and Index Locations and Names
                 header='infer',
                 names=None,
                 index_col=None,
                 usecols=None,
                 squeeze=False,
                 prefix=None,
                 mangle_dupe_cols=True,

                 # General Parsing Configuration
                 dtype=None,
                 engine=None,
                 converters=None,
                 true_values=None,
                 false_values=None,
                 skipinitialspace=False,
                 skiprows=None,
                 skipfooter=0,
                 nrows=None,

                 # NA and Missing Data Handling
                 na_values=None,
                 keep_default_na=True,
                 na_filter=True,
                 verbose=False,
                 skip_blank_lines=True,

                 # Datetime Handling
                 parse_dates=False,
                 infer_datetime_format=False,
                 keep_date_col=False,
                 date_parser=None,
                 dayfirst=False,

                 # Iteration
                 iterator=False,
                 chunksize=None,

                 # Quoting, Compression, and File Format
                 compression='infer',
                 thousands=None,
                 decimal=b'.',
                 lineterminator=None,
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL,
                 doublequote=True,
                 escapechar=None,
                 comment=None,
                 encoding=None,
                 dialect=None,
                 tupleize_cols=None,

                 # Error Handling
                 error_bad_lines=True,
                 warn_bad_lines=True,

                 # Internal
                 delim_whitespace=False,
                 low_memory=_c_parser_defaults['low_memory'],
                 memory_map=False,
                 float_precision=None):

        # deprecate read_table GH21948
        if name == "read_table":
            if sep is False and delimiter is None:
                warnings.warn("read_table is deprecated, use read_csv "
                              "instead, passing sep='\\t'.",
                              FutureWarning, stacklevel=2)
            else:
                warnings.warn("read_table is deprecated, use read_csv "
                              "instead.",
                              FutureWarning, stacklevel=2)
            if sep is False:
                sep = default_sep

        # gh-23761
        #
        # When a dialect is passed, it overrides any of the overlapping
        # parameters passed in directly. We don't want to warn if the
        # default parameters were passed in (since it probably means
        # that the user didn't pass them in explicitly in the first place).
        #
        # "delimiter" is the annoying corner case because we alias it to
        # "sep" before doing comparison to the dialect values later on.
        # Thus, we need a flag to indicate that we need to "override"
        # the comparison to dialect values by checking if default values
        # for BOTH "delimiter" and "sep" were provided.
        if dialect is not None:
            sep_override = delimiter is None and sep == default_sep
            kwds = dict(sep_override=sep_override)
        else:
            kwds = dict()

        # Alias sep -> delimiter.
        if delimiter is None:
            delimiter = sep

        if delim_whitespace and delimiter != default_sep:
            raise ValueError("Specified a delimiter with both sep and"
                             " delim_whitespace=True; you can only"
                             " specify one.")

        if engine is not None:
            engine_specified = True
        else:
            engine = 'c'
            engine_specified = False

        kwds.update(delimiter=delimiter,
                    engine=engine,
                    dialect=dialect,
                    compression=compression,
                    engine_specified=engine_specified,
                    doublequote=doublequote,
                    escapechar=escapechar,
                    quotechar=quotechar,
                    quoting=quoting,
                    skipinitialspace=skipinitialspace,
                    lineterminator=lineterminator,
                    header=header,
                    index_col=index_col,
                    names=names,
                    prefix=prefix,
                    skiprows=skiprows,
                    skipfooter=skipfooter,
                    na_values=na_values,
                    true_values=true_values,
                    false_values=false_values,
                    keep_default_na=keep_default_na,
                    thousands=thousands,
                    comment=comment,
                    decimal=decimal,
                    parse_dates=parse_dates,
                    keep_date_col=keep_date_col,
                    dayfirst=dayfirst,
                    date_parser=date_parser,
                    nrows=nrows,
                    iterator=iterator,
                    chunksize=chunksize,
                    converters=converters,
                    dtype=dtype,
                    usecols=usecols,
                    verbose=verbose,
                    encoding=encoding,
                    squeeze=squeeze,
                    memory_map=memory_map,
                    float_precision=float_precision,
                    na_filter=na_filter,
                    delim_whitespace=delim_whitespace,
                    warn_bad_lines=warn_bad_lines,
                    error_bad_lines=error_bad_lines,
                    low_memory=low_memory,
                    mangle_dupe_cols=mangle_dupe_cols,
                    tupleize_cols=tupleize_cols,
                    infer_datetime_format=infer_datetime_format,
                    skip_blank_lines=skip_blank_lines)

        return _read(filepath_or_buffer, kwds)

    parser_f.__name__ = name

    return parser_f


read_csv = _make_parser_function('read_csv', default_sep=',')
read_csv = Appender(_doc_read_csv_and_table.format(
    func_name='read_csv',
    summary=('Read a comma-separated values (csv) file '
             'into DataFrame.'),
    _default_sep="','")
)(read_csv)

read_table = _make_parser_function('read_table', default_sep='\t')
read_table = Appender(_doc_read_csv_and_table.format(
    func_name='read_table',
    summary="""Read general delimited file into DataFrame.

.. deprecated:: 0.24.0
    Use :func:`pandas.read_csv` instead, passing ``sep='\\t'`` if necessary.""",
    _default_sep=r"'\\t' (tab-stop)")
)(read_table)
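
# Illustrative usage sketch (not part of the upstream file), parsing an
# in-memory buffer with the public entry point built above:
#
#   from pandas.compat import StringIO
#   df = read_csv(StringIO("a,b\n1,2\n3,4"))
#   reader = read_csv(StringIO("a,b\n1,2\n3,4"), chunksize=1)
#   for chunk in reader:   # each chunk is a one-row DataFrame
#       pass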


def read_fwf(filepath_or_buffer, colspecs='infer', widths=None,
             infer_nrows=100, **kwds):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <http://pandas.pydata.org/pandas-docs/stable/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.csv.

        If you want to pass in a path object, pandas accepts either
        ``pathlib.Path`` or ``py._path.local.LocalPath``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer'. optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e.,  [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.

        .. versionadded:: 0.24.0
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, 'infer') and widths is not None:
        raise ValueError("You must specify only one of 'widths' and "
                         "'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    kwds['colspecs'] = colspecs
    kwds['infer_nrows'] = infer_nrows
    kwds['engine'] = 'python-fwf'
    return _read(filepath_or_buffer, kwds)
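
# Illustrative sketch (not part of the upstream file): how contiguous
# 'widths' are translated into half-open 'colspecs' intervals above.
#
#   widths = [3, 5, 2]  ->  colspecs = [(0, 3), (3, 8), (8, 10)]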


class TextFileReader(BaseIterator):
    """
    Passed dialect overrides any of the related parser options
    """

    def __init__(self, f, engine=None, **kwds):

        self.f = f

        if engine is not None:
            engine_specified = True
        else:
            engine = 'python'
            engine_specified = False

        self._engine_specified = kwds.get('engine_specified',
                                          engine_specified)

        if kwds.get('dialect') is not None:
            dialect = kwds['dialect']
            if dialect in csv.list_dialects():
                dialect = csv.get_dialect(dialect)

            # Any valid dialect should have these attributes.
            # If any are missing, we will raise automatically.
            for param in ('delimiter', 'doublequote', 'escapechar',
                          'skipinitialspace', 'quotechar', 'quoting'):
                try:
                    dialect_val = getattr(dialect, param)
                except AttributeError:
                    raise ValueError("Invalid dialect '{dialect}' provided"
                                     .format(dialect=kwds['dialect']))
                parser_default = _parser_defaults[param]
                provided = kwds.get(param, parser_default)

                # Messages for conflicting values between the dialect
                # instance and the actual parameters provided.
                conflict_msgs = []

                # Don't warn if the default parameter was passed in,
                # even if it conflicts with the dialect (gh-23761).
                if provided != parser_default and provided != dialect_val:
                    msg = ("Conflicting values for '{param}': '{val}' was "
                           "provided, but the dialect specifies '{diaval}'. "
                           "Using the dialect-specified value.".format(
                               param=param, val=provided, diaval=dialect_val))

                    # Annoying corner case for not warning about
                    # conflicts between dialect and delimiter parameter.
                    # Refer to the outer "_read_" function for more info.
                    if not (param == "delimiter" and
                            kwds.pop("sep_override", False)):
                        conflict_msgs.append(msg)

                if conflict_msgs:
                    warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
                                  stacklevel=2)
                kwds[param] = dialect_val

        if kwds.get("skipfooter"):
            if kwds.get("iterator") or kwds.get("chunksize"):
                raise ValueError("'skipfooter' not supported for 'iteration'")
            if kwds.get("nrows"):
                raise ValueError("'skipfooter' not supported with 'nrows'")

        if kwds.get('header', 'infer') == 'infer':
            kwds['header'] = 0 if kwds.get('names') is None else None

        self.orig_options = kwds

        # miscellanea
        self.engine = engine
        self._engine = None
        self._currow = 0

        options = self._get_options_with_defaults(engine)

        self.chunksize = options.pop('chunksize', None)
        self.nrows = options.pop('nrows', None)
        self.squeeze = options.pop('squeeze', False)

        # might mutate self.engine
        self.engine = self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if 'has_index_names' in kwds:
            self.options['has_index_names'] = kwds['has_index_names']

        self._make_engine(self.engine)

    def close(self):
        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options

        options = {}

        for argname, default in compat.iteritems(_parser_defaults):
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == 'mangle_dupe_cols' and not value:
                raise ValueError('Setting mangle_dupe_cols=False is '
                                 'not supported yet')
            else:
                options[argname] = value

        for argname, default in compat.iteritems(_c_parser_defaults):
            if argname in kwds:
                value = kwds[argname]

                if engine != 'c' and value != default:
                    if ('python' in engine and
                            argname not in _python_unsupported):
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            'The %r option is not supported with the'
                            ' %r engine' % (argname, engine))
            else:
                value = _deprecated_defaults.get(argname, default)

            options[argname] = value

        if engine == 'python-fwf':
            for argname, default in compat.iteritems(_fwf_defaults):
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine):
        # see gh-16530
        if is_file_like(f):
            next_attr = "__next__" if PY3 else "next"

            # The C engine doesn't need the file-like to have the "next" or
            # "__next__" attribute. However, the Python engine explicitly calls
            # "next(...)" when iterating through such an object, meaning it
            # needs to have that attribute ("next" for Python 2.x, "__next__"
            # for Python 3.x)
            if engine != "c" and not hasattr(f, next_attr):
                msg = ("The 'python' engine cannot iterate "
                       "through this file buffer.")
                raise ValueError(msg)

        return engine

    def _clean_options(self, options, engine):
        result = options.copy()

        engine_specified = self._engine_specified
        fallback_reason = None

        sep = options['delimiter']
        delim_whitespace = options['delim_whitespace']

        # C engine not supported yet
        if engine == 'c':
            if options['skipfooter'] > 0:
                fallback_reason = ("the 'c' engine does not support"
                                   " skipfooter")
                engine = 'python'

        encoding = sys.getfilesystemencoding() or 'utf-8'
        if sep is None and not delim_whitespace:
            if engine == 'c':
                fallback_reason = ("the 'c' engine does not support"
                                   " sep=None with delim_whitespace=False")
                engine = 'python'
        elif sep is not None and len(sep) > 1:
            if engine == 'c' and sep == r'\s+':
                result['delim_whitespace'] = True
                del result['delimiter']
            elif engine not in ('python', 'python-fwf'):
                # wait until regex engine integrated
                fallback_reason = ("the 'c' engine does not support"
                                   " regex separators (separators > 1 char and"
                                   r" different from '\s+' are"
                                   " interpreted as regex)")
                engine = 'python'
        elif delim_whitespace:
            if 'python' in engine:
                result['delimiter'] = r'\s+'
        elif sep is not None:
            encodeable = True
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ('python', 'python-fwf'):
                fallback_reason = ("the separator encoded in {encoding}"
                                   " is > 1 char long, and the 'c' engine"
                                   " does not support such separators"
                                   .format(encoding=encoding))
                engine = 'python'

        quotechar = options['quotechar']
        if (quotechar is not None and
                isinstance(quotechar, (str, compat.text_type, bytes))):
            if (len(quotechar) == 1 and ord(quotechar) > 127 and
                    engine not in ('python', 'python-fwf')):
                fallback_reason = ("ord(quotechar) > 127, meaning the "
                                   "quotechar is larger than one byte, "
                                   "and the 'c' engine does not support "
                                   "such quotechars")
                engine = 'python'

        if fallback_reason and engine_specified:
            raise ValueError(fallback_reason)

        if engine == 'c':
            for arg in _c_unsupported:
                del result[arg]

        if 'python' in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    msg = ("Falling back to the 'python' engine because"
                           " {reason}, but this causes {option!r} to be"
                           " ignored as it is not supported by the 'python'"
                           " engine.").format(reason=fallback_reason,
                                              option=arg)
                    raise ValueError(msg)
                del result[arg]

        if fallback_reason:
            warnings.warn(("Falling back to the 'python' engine because"
                           " {0}; you can avoid this warning by specifying"
                           " engine='python'.").format(fallback_reason),
                          ParserWarning, stacklevel=5)

        index_col = options['index_col']
        names = options['names']
        converters = options['converters']
        na_values = options['na_values']
        skiprows = options['skiprows']

        _validate_header_arg(options['header'])

        depr_warning = ''

        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]

            msg = ("The '{arg}' argument has been deprecated "
                   "and will be removed in a future version."
                   .format(arg=arg))

            if arg == 'tupleize_cols':
                msg += (' Column tuples will then '
                        'always be converted to MultiIndex.')

            if result.get(arg, depr_default) != depr_default:
                # raise Exception(result.get(arg, depr_default), depr_default)
                depr_warning += msg + '\n\n'
            else:
                result[arg] = parser_default

        if depr_warning != '':
            warnings.warn(depr_warning, FutureWarning, stacklevel=2)

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if _is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result['index_col'] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError('Type converters must be a dict or'
                                ' subclass, input was '
                                'a {0!r}'.format(type(converters).__name__))
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options['keep_default_na']
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != 'c':
            if is_integer(skiprows):
                skiprows = lrange(skiprows)
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result['names'] = names
        result['converters'] = converters
        result['na_values'] = na_values
        result['na_fvalues'] = na_fvalues
        result['skiprows'] = skiprows

        return result, engine

    def __next__(self):
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(self, engine='c'):
        if engine == 'c':
            self._engine = CParserWrapper(self.f, **self.options)
        else:
            if engine == 'python':
                klass = PythonParser
            elif engine == 'python-fwf':
                klass = FixedWidthFieldParser
            else:
                raise ValueError('Unknown engine: {engine} (valid options are'
                                 ' "c", "python", or "python-fwf")'.format(
                                     engine=engine))
            self._engine = klass(self.f, **self.options)

    def _failover_to_python(self):
        raise AbstractMethodError(self)

    def read(self, nrows=None):
        nrows = _validate_integer('nrows', nrows)
        ret = self._engine.read(nrows)

        # May alter columns / col_dict
        index, columns, col_dict = self._create_index(ret)

        if index is None:
            if col_dict:
                # Any column is actually fine:
                new_rows = len(compat.next(compat.itervalues(col_dict)))
                index = RangeIndex(self._currow, self._currow + new_rows)
            else:
                new_rows = 0
        else:
            new_rows = len(index)

        df = DataFrame(col_dict, columns=columns, index=index)

        self._currow += new_rows

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]].copy()
        return df

    def _create_index(self, ret):
        index, columns, col_dict = ret
        return index, columns, col_dict

    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)
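
# Illustrative sketch (not part of the upstream file): TextFileReader is an
# iterator of DataFrame chunks; get_chunk() honors both chunksize and nrows.
#
#   reader = TextFileReader(StringIO("a,b\n1,2\n3,4\n5,6"), chunksize=2)
#   first = reader.get_chunk()   # 2-row DataFrame
#   second = reader.get_chunk()  # remaining 1-row DataFrame
#   reader.close()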


def _is_index_col(col):
    return col is not None and col is not False


def _is_potential_multi_index(columns):
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex

    Returns
    -------
    boolean : Whether or not columns could become a MultiIndex
    """
    return (len(columns) and not isinstance(columns, MultiIndex) and
            all(isinstance(c, tuple) for c in columns))
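
# Illustrative sketch (not part of the upstream file):
#
#   _is_potential_multi_index([('a', 'x'), ('a', 'y')])  # truthy
#   _is_potential_multi_index(['a', 'b'])                # falsey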


def _evaluate_usecols(usecols, names):
    """
    Check whether or not the 'usecols' parameter
    is a callable. If so, enumerates the 'names'
    parameter and returns a set of indices for
    each entry in 'names' that evaluates to True.

    If not a callable, returns 'usecols'.
    """
    if callable(usecols):
        return {i for i, name in enumerate(names) if usecols(name)}
    return usecols
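
# Illustrative sketch (not part of the upstream file): a callable usecols is
# resolved to a set of positional indices against the parsed header.
#
#   _evaluate_usecols(lambda c: c.startswith('f'), ['foo', 'bar', 'faz'])
#   # -> {0, 2}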


def _validate_usecols_names(usecols, names):
    """
    Validates that all usecols are present in a given
    list of names. If not, raise a ValueError that
    shows what usecols are missing.

    Parameters
    ----------
    usecols : iterable of usecols
        The columns to validate are present in names.
    names : iterable of names
        The column names to check against.

    Returns
    -------
    usecols : iterable of usecols
        The `usecols` parameter if the validation succeeds.

    Raises
    ------
    ValueError : Columns were missing. Error message will list them.
    """
    missing = [c for c in usecols if c not in names]
    if len(missing) > 0:
        raise ValueError(
            "Usecols do not match columns, "
            "columns expected but not found: {missing}".format(missing=missing)
        )

    return usecols
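
# Illustrative sketch (not part of the upstream file):
#
#   _validate_usecols_names(['a', 'z'], ['a', 'b', 'c'])
#   # -> ValueError: Usecols do not match columns, columns expected
#   #    but not found: ['z']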


def _validate_skipfooter_arg(skipfooter):
    """
    Validate the 'skipfooter' parameter.

    Checks whether 'skipfooter' is a non-negative integer.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    skipfooter : non-negative integer
        The number of rows to skip at the end of the file.

    Returns
    -------
    validated_skipfooter : non-negative integer
        The original input if the validation succeeds.

    Raises
    ------
    ValueError : 'skipfooter' was not a non-negative integer.
    """
    if not is_integer(skipfooter):
        raise ValueError("skipfooter must be an integer")

    if skipfooter < 0:
        raise ValueError("skipfooter cannot be negative")

    return skipfooter


def _validate_usecols_arg(usecols):
    """
    Validate the 'usecols' parameter.

    Checks whether or not the 'usecols' parameter contains all integers
    (column selection by index), strings (column by name) or is a callable.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    usecols : list-like, callable, or None
        List of columns to use when parsing or a callable that can be used
        to filter a list of table columns.

    Returns
    -------
    usecols_tuple : tuple
        A tuple of (verified_usecols, usecols_dtype).

        'verified_usecols' is either a set if an array-like is passed in or
        'usecols' if a callable or None is passed in.

        'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
        is passed in or None if a callable or None is passed in.
    """
    msg = ("'usecols' must either be list-like of all strings, all unicode, "
           "all integers or a callable.")
    if usecols is not None:
        if callable(usecols):
            return usecols, None

        if not is_list_like(usecols):
            # see gh-20529
            #
            # Ensure it is iterable container but not string.
            raise ValueError(msg)

        usecols_dtype = lib.infer_dtype(usecols, skipna=False)

        if usecols_dtype not in ("empty", "integer",
                                 "string", "unicode"):
            raise ValueError(msg)

        usecols = set(usecols)

        if usecols_dtype == "unicode":
            # see gh-13253
            #
            # Python 2.x compatibility
            usecols = {col.encode("utf-8") for col in usecols}

        return usecols, usecols_dtype
    return usecols, None
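
# Illustrative sketch (not part of the upstream file): list-likes come back
# as a set plus their inferred dtype; callables and None pass through.
#
#   _validate_usecols_arg([0, 2, 1])   # -> ({0, 1, 2}, 'integer')
#   _validate_usecols_arg(['a', 'b'])  # -> ({'a', 'b'}, 'string')
#   _validate_usecols_arg(None)        # -> (None, None)
#   _validate_usecols_arg([0, 'a'])    # -> ValueError (mixed types)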


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = ("Only booleans, lists, and "
           "dictionaries are accepted "
           "for the 'parse_dates' parameter")

    if parse_dates is not None:
        if is_scalar(parse_dates):
            if not lib.is_bool(parse_dates):
                raise TypeError(msg)

        elif not isinstance(parse_dates, (list, dict)):
            raise TypeError(msg)

    return parse_dates
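
# Illustrative sketch (not part of the upstream file):
#
#   _validate_parse_dates_arg(True)     # -> True
#   _validate_parse_dates_arg([0, 1])   # -> [0, 1]
#   _validate_parse_dates_arg('a')      # -> TypeError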


class ParserBase(object):

    def __init__(self, kwds):
        self.names = kwds.get('names')
        self.orig_names = None
        self.prefix = kwds.pop('prefix', None)

        self.index_col = kwds.get('index_col', None)
        self.unnamed_cols = set()
        self.index_names = None
        self.col_names = None

        self.parse_dates = _validate_parse_dates_arg(
            kwds.pop('parse_dates', False))
        self.date_parser = kwds.pop('date_parser', None)
        self.dayfirst = kwds.pop('dayfirst', False)
        self.keep_date_col = kwds.pop('keep_date_col', False)

        self.na_values = kwds.get('na_values')
        self.na_fvalues = kwds.get('na_fvalues')
        self.na_filter = kwds.get('na_filter', False)
        self.keep_default_na = kwds.get('keep_default_na', True)

        self.true_values = kwds.get('true_values')
        self.false_values = kwds.get('false_values')
        self.tupleize_cols = kwds.get('tupleize_cols', False)
        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
        self.infer_datetime_format = kwds.pop('infer_datetime_format', False)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            dayfirst=self.dayfirst,
            infer_datetime_format=self.infer_datetime_format
        )

        # validate header options for mi
        self.header = kwds.get('header')
        if isinstance(self.header, (list, tuple, np.ndarray)):
            if not all(map(is_integer, self.header)):
                raise ValueError("header must be integer or list of integers")
            if kwds.get('usecols'):
                raise ValueError("cannot specify usecols when "
                                 "specifying a multi-index header")
            if kwds.get('names'):
                raise ValueError("cannot specify names when "
                                 "specifying a multi-index header")

            # validate index_col that only contains integers
            if self.index_col is not None:
                is_sequence = isinstance(self.index_col, (list, tuple,
                                                          np.ndarray))
                if not (is_sequence and
                        all(map(is_integer, self.index_col)) or
                        is_integer(self.index_col)):
                    raise ValueError("index_col must only contain row numbers "
                                     "when specifying a multi-index header")

        # GH 16338
        elif self.header is not None and not is_integer(self.header):
            raise ValueError("header must be integer or list of integers")

        self._name_processed = False

        self._first_chunk = True

        # GH 13932
        # keep references to file handles opened by the parser itself
        self.handles = []

    def close(self):
        for f in self.handles:
            f.close()

    @property
    def _has_complex_date_col(self):
        return (isinstance(self.parse_dates, dict) or
                (isinstance(self.parse_dates, list) and
                 len(self.parse_dates) > 0 and
                 isinstance(self.parse_dates[0], list)))

    def _should_parse_dates(self, i):
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = self.index_col[i]

            if is_scalar(self.parse_dates):
                return ((j == self.parse_dates) or
                        (name is not None and name == self.parse_dates))
            else:
                return ((j in self.parse_dates) or
                        (name is not None and name in self.parse_dates))

    def _extract_multi_indexer_columns(self, header, index_names, col_names,
                                       passed_names=False):
        """ extract and return the names, index_names, col_names
            header is a list-of-lists returned from the parsers """
        if len(header) < 2:
            return header[0], index_names, col_names, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, names, index_col = _clean_index_names(index_names,
                                                           self.index_col,
                                                           self.unnamed_cols)

        # extract the columns
        field_count = len(header[0])

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = lzip(*[extract(r) for r in header])
        names = ic + columns

        # If we find unnamed columns all in a single
        # level, then our header was too long.
        for n in range(len(columns[0])):
            if all(compat.to_str(c[n]) in self.unnamed_cols for c in columns):
                raise ParserError(
                    "Passed header=[%s] are too many rows for this "
                    "multi_index of columns"
                    % ','.join(str(x) for x in self.header)
                )

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [r[0] if (len(r[0]) and
                                  r[0] not in self.unnamed_cols) else None
                         for r in header]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    def _maybe_dedup_names(self, names):
        # see gh-7160 and gh-9424: this helps to provide
        # immediate alleviation of the duplicate names
        # issue and appears to be satisfactory to users,
        # but ultimately, not needing to butcher the names
        # would be nice!
        if self.mangle_dupe_cols:
            names = list(names)  # so we can index
            counts = defaultdict(int)
            is_potential_mi = _is_potential_multi_index(names)

            for i, col in enumerate(names):
                cur_count = counts[col]

                while cur_count > 0:
                    counts[col] = cur_count + 1

                    if is_potential_mi:
                        col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
                    else:
                        col = '%s.%d' % (col, cur_count)
                    cur_count = counts[col]

                names[i] = col
                counts[col] = cur_count + 1

        return names
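
    # Illustrative sketch (not part of the upstream file): how duplicate
    # column labels are mangled by _maybe_dedup_names when
    # mangle_dupe_cols is True.
    #
    #   ['X', 'X', 'X']           ->  ['X', 'X.1', 'X.2']
    #   [('a', 'x'), ('a', 'x')]  ->  [('a', 'x'), ('a', 'x.1')]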
  1271. def _maybe_make_multi_index_columns(self, columns, col_names=None):
  1272. # possibly create a column mi here
  1273. if _is_potential_multi_index(columns):
  1274. columns = MultiIndex.from_tuples(columns, names=col_names)
  1275. return columns
  1276. def _make_index(self, data, alldata, columns, indexnamerow=False):
  1277. if not _is_index_col(self.index_col) or not self.index_col:
  1278. index = None
  1279. elif not self._has_complex_date_col:
  1280. index = self._get_simple_index(alldata, columns)
  1281. index = self._agg_index(index)
  1282. elif self._has_complex_date_col:
  1283. if not self._name_processed:
  1284. (self.index_names, _,
  1285. self.index_col) = _clean_index_names(list(columns),
  1286. self.index_col,
  1287. self.unnamed_cols)
  1288. self._name_processed = True
  1289. index = self._get_complex_date_index(data, columns)
  1290. index = self._agg_index(index, try_parse_dates=False)
  1291. # add names for the index
  1292. if indexnamerow:
  1293. coffset = len(indexnamerow) - len(columns)
  1294. index = index.set_names(indexnamerow[:coffset])
  1295. # maybe create a mi on the columns
  1296. columns = self._maybe_make_multi_index_columns(columns, self.col_names)
  1297. return index, columns
  1298. _implicit_index = False
  1299. def _get_simple_index(self, data, columns):
  1300. def ix(col):
  1301. if not isinstance(col, compat.string_types):
  1302. return col
  1303. raise ValueError('Index %s invalid' % col)
  1304. to_remove = []
  1305. index = []
  1306. for idx in self.index_col:
  1307. i = ix(idx)
  1308. to_remove.append(i)
  1309. index.append(data[i])
  1310. # remove index items from content and columns, don't pop in
  1311. # loop
  1312. for i in reversed(sorted(to_remove)):
  1313. data.pop(i)
  1314. if not self._implicit_index:
  1315. columns.pop(i)
  1316. return index

    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, compat.string_types):
                return icol

            if col_names is None:
                raise ValueError(('Must supply column order to use %s as '
                                  'index') % str(icol))

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in reversed(sorted(to_remove)):
            data.pop(c)
            col_names.remove(c)

        return index

    def _agg_index(self, index, try_parse_dates=True):
        arrays = []

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues,
                        self.keep_default_na)

            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                             converters=None, dtypes=None):
        result = {}
        for c, values in compat.iteritems(dct):
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na)
            else:
                col_na_values, col_na_fvalues = set(), set()

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(("Both a converter and dtype were specified "
                                   "for column {0} - only the converter will "
                                   "be used").format(c), ParserWarning,
                                  stacklevel=7)

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(
                        values, list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues,
                    try_num_bool=False)
            else:
                is_str_or_ea_dtype = (is_string_dtype(cast_type)
                                      or is_extension_array_dtype(cast_type))
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues,
                    try_num_bool)

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type)
                                  or is_extension_array_dtype(cast_type)):
                    try:
                        if (is_bool_dtype(cast_type) and
                                not is_categorical_dtype(cast_type)
                                and na_count > 0):
                            raise ValueError("Bool column has NA values in "
                                             "column {column}"
                                             .format(column=c))
                    except (AttributeError, TypeError):
                        # invalid input to is_bool_dtype
                        pass
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print('Filled %d NA values in column %s' % (na_count, str(c)))
        return result

    def _infer_types(self, values, na_values, try_num_bool=True):
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            mask = algorithms.isin(values, list(na_values))
            na_count = mask.sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        if try_num_bool:
            try:
                result = lib.maybe_convert_numeric(values, na_values, False)
                na_count = isna(result).sum()
            except Exception:
                result = values
                if values.dtype == np.object_:
                    na_count = parsers.sanitize_objects(result,
                                                        na_values, False)
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values, False)

        if result.dtype == np.object_ and try_num_bool:
            result = libops.maybe_convert_bool(np.asarray(values),
                                               true_values=self.true_values,
                                               false_values=self.false_values)

        return result, na_count
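
    # For example, _infer_types on the object array ['1', '2', 'N/A'] with
    # na_values={'N/A'} yields (array([1.0, 2.0, nan]), 1): the strings are
    # converted to float64 and the single NA sentinel is counted.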

    def _cast_types(self, values, cast_type, column):
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray
        cast_type : string or np.dtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray
        """

        if is_categorical_dtype(cast_type):
            known_cats = (isinstance(cast_type, CategoricalDtype) and
                          cast_type.categories is not None)

            if not is_object_dtype(values) and not known_cats:
                # XXX this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = astype_nansafe(values, str)

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type,
                true_values=self.true_values)

        # use the EA's implementation of casting
        elif is_extension_array_dtype(cast_type):
            # ensure cast_type is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                return array_type._from_sequence_of_strings(values,
                                                            dtype=cast_type)
            except NotImplementedError:
                raise NotImplementedError(
                    "Extension Array: {ea} must implement "
                    "_from_sequence_of_strings in order "
                    "to be used in parser methods".format(ea=array_type))

        else:
            try:
                values = astype_nansafe(values, cast_type,
                                        copy=True, skipna=True)
            except ValueError:
                raise ValueError("Unable to convert column %s to "
                                 "type %s" % (column, cast_type))
        return values

    def _do_date_conversions(self, names, data):
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data, self._date_conv, self.parse_dates, self.index_col,
                self.index_names, names, keep_date_col=self.keep_date_col)

        return names, data


class CParserWrapper(ParserBase):
    """
    Wrapper around the C parsing engine (pandas._libs.parsers.TextReader).
    """

    def __init__(self, src, **kwds):
        self.kwds = kwds
        kwds = kwds.copy()

        ParserBase.__init__(self, kwds)

        if (kwds.get('compression') is None
                and 'utf-16' in (kwds.get('encoding') or '')):
            # if source is utf-16 plain text, convert source to utf-8
            if isinstance(src, compat.string_types):
                src = open(src, 'rb')
                self.handles.append(src)
            src = UTF8Recoder(src, kwds['encoding'])
            kwds['encoding'] = 'utf-8'

        # #2442
        kwds['allow_leading_cols'] = self.index_col is not False

        # GH20529, validate usecol arg before TextReader
        self.usecols, self.usecols_dtype = _validate_usecols_arg(
            kwds['usecols'])
        kwds['usecols'] = self.usecols

        self._reader = parsers.TextReader(src, **kwds)
        self.unnamed_cols = self._reader.unnamed_cols

        passed_names = self.names is None

        if self._reader.header is None:
            self.names = None
        else:
            if len(self._reader.header) > 1:
                # we have a multi index in the columns
                self.names, self.index_names, self.col_names, passed_names = (
                    self._extract_multi_indexer_columns(
                        self._reader.header, self.index_names, self.col_names,
                        passed_names
                    )
                )
            else:
                self.names = list(self._reader.header[0])

        if self.names is None:
            if self.prefix:
                self.names = ['%s%d' % (self.prefix, i)
                              for i in range(self._reader.table_width)]
            else:
                self.names = lrange(self._reader.table_width)

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        self.orig_names = self.names[:]

        if self.usecols:
            usecols = _evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            if (self.usecols_dtype == 'string' and
                    not set(usecols).issubset(self.orig_names)):
                _validate_usecols_names(usecols, self.orig_names)

            if len(self.names) > len(usecols):
                self.names = [n for i, n in enumerate(self.names)
                              if (i in usecols or n in usecols)]

            if len(self.names) < len(usecols):
                _validate_usecols_names(usecols, self.names)

        self._set_noconvert_columns()

        self.orig_names = self.names

        if not self._has_complex_date_col:
            if (self._reader.leading_cols == 0 and
                    _is_index_col(self.index_col)):

                self._name_processed = True
                (index_names, self.names,
                 self.index_col) = _clean_index_names(self.names,
                                                      self.index_col,
                                                      self.unnamed_cols)

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0

    def close(self):
        for f in self.handles:
            f.close()

        # close additional handles opened by C parser (for compression)
        try:
            self._reader.close()
        except ValueError:
            pass

    def _set_noconvert_columns(self):
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions.
        """
        names = self.orig_names
        if self.usecols_dtype == 'integer':
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = list(self.usecols)
            usecols.sort()
        elif (callable(self.usecols) or
                self.usecols_dtype not in ('empty', None)):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = self.names[:]
        else:
            # Usecols is empty.
            usecols = None

        def _set(x):
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = names.index(x)

            self._reader.set_noconvert(x)

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    _set(k)
            elif self.index_col is not None:
                _set(self.index_col)

    def set_error_bad_lines(self, status):
        self._reader.set_error_bad_lines(int(status))

    def read(self, nrows=None):
        try:
            data = self._reader.read(nrows)
        except StopIteration:
            if self._first_chunk:
                self._first_chunk = False
                names = self._maybe_dedup_names(self.orig_names)
                index, columns, col_dict = _get_empty_meta(
                    names, self.index_col, self.index_names,
                    dtype=self.kwds.get('dtype'))
                columns = self._maybe_make_multi_index_columns(
                    columns, self.col_names)

                if self.usecols is not None:
                    columns = self._filter_usecols(columns)

                col_dict = dict(filter(lambda item: item[0] in columns,
                                       col_dict.items()))

                return index, columns, col_dict

            else:
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        names = self.names

        if self._reader.leading_cols:
            if self._has_complex_date_col:
                raise NotImplementedError('file structure not yet supported')

            # implicit index, no index names
            arrays = []

            for i in range(self._reader.leading_cols):
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                values = self._maybe_parse_dates(values, i,
                                                 try_parse_dates=True)
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            names = self._maybe_dedup_names(names)

            # rename dict keys
            data = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data)}

            names, data = self._do_date_conversions(names, data)

        else:
            # rename dict keys
            data = sorted(data.items())

            # ugh, mutation
            names = list(self.orig_names)
            names = self._maybe_dedup_names(names)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            # columns as list
            alldata = [x[1] for x in data]

            data = {k: v for k, (i, v) in zip(names, data)}

            names, data = self._do_date_conversions(names, data)
            index, names = self._make_index(data, alldata, names)

        # maybe create a mi on the columns
        names = self._maybe_make_multi_index_columns(names, self.col_names)

        return index, names, data

    def _filter_usecols(self, names):
        # hackish
        usecols = _evaluate_usecols(self.usecols, names)
        if usecols is not None and len(names) != len(usecols):
            names = [name for i, name in enumerate(names)
                     if i in usecols or name in usecols]
        return names

    def _get_index_names(self):
        names = list(self._reader.header[0])
        idx_names = None

        if self._reader.leading_cols == 0 and self.index_col is not None:
            (idx_names, names,
             self.index_col) = _clean_index_names(names, self.index_col,
                                                  self.unnamed_cols)

        return names, idx_names

    def _maybe_parse_dates(self, values, index, try_parse_dates=True):
        if try_parse_dates and self._should_parse_dates(index):
            values = self._date_conv(values)
        return values


def TextParser(*args, **kwds):
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, optional
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at the bottom of the file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take
        one input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    squeeze : bool, default False
        returns Series if only one column.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is True for a column, try to infer the
        datetime format based on the first datetime string. If the format
        can be inferred, there often will be a large parsing speed-up.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are None for the ordinary converter,
        'high' for the high-precision converter, and 'round_trip' for the
        round-trip converter.
    """
    kwds['engine'] = 'python'
    return TextFileReader(*args, **kwds)
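
# For example (illustrative usage), TextParser can consume rows that have
# already been split into lists:
#
#     parser = TextParser([['a', 'b'], ['1', '2'], ['3', '4']], header=0)
#     df = parser.read()  # DataFrame with columns 'a' and 'b'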


def count_empty_vals(vals):
    return sum(1 for v in vals if v == '' or v is None)


class PythonParser(ParserBase):

    def __init__(self, f, **kwds):
        """
        Workhorse function for processing nested list into DataFrame

        Should be replaced by np.genfromtxt eventually?
        """
        ParserBase.__init__(self, kwds)

        self.data = None
        self.buf = []
        self.pos = 0
        self.line_pos = 0

        self.encoding = kwds['encoding']
        self.compression = kwds['compression']
        self.memory_map = kwds['memory_map']
        self.skiprows = kwds['skiprows']

        if callable(self.skiprows):
            self.skipfunc = self.skiprows
        else:
            self.skipfunc = lambda x: x in self.skiprows

        self.skipfooter = _validate_skipfooter_arg(kwds['skipfooter'])
        self.delimiter = kwds['delimiter']

        self.quotechar = kwds['quotechar']
        if isinstance(self.quotechar, compat.text_type):
            self.quotechar = str(self.quotechar)

        self.escapechar = kwds['escapechar']
        self.doublequote = kwds['doublequote']
        self.skipinitialspace = kwds['skipinitialspace']
        self.lineterminator = kwds['lineterminator']
        self.quoting = kwds['quoting']
        self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
        self.skip_blank_lines = kwds['skip_blank_lines']

        self.warn_bad_lines = kwds['warn_bad_lines']
        self.error_bad_lines = kwds['error_bad_lines']

        self.names_passed = kwds['names'] or None

        self.has_index_names = False
        if 'has_index_names' in kwds:
            self.has_index_names = kwds['has_index_names']

        self.verbose = kwds['verbose']
        self.converters = kwds['converters']

        self.dtype = kwds['dtype']
        self.thousands = kwds['thousands']
        self.decimal = kwds['decimal']

        self.comment = kwds['comment']
        self._comment_lines = []

        mode = 'r' if PY3 else 'rb'
        f, handles = _get_handle(f, mode, encoding=self.encoding,
                                 compression=self.compression,
                                 memory_map=self.memory_map)
        self.handles.extend(handles)

        # Set self.data to something that can read lines.
        if hasattr(f, 'readline'):
            self._make_reader(f)
        else:
            self.data = f

        # Get columns in two steps: infer from data, then
        # infer column indices from self.usecols if it is specified.
        self._col_indices = None
        (self.columns, self.num_original_columns,
         self.unnamed_cols) = self._infer_columns()

        # Now self.columns has the set of columns that we will process.
        # The original set is stored in self.original_columns.
        if len(self.columns) > 1:
            # we are processing a multi index column
            self.columns, self.index_names, self.col_names, _ = (
                self._extract_multi_indexer_columns(
                    self.columns, self.index_names, self.col_names
                )
            )
            # Update list of original names to include all indices.
            self.num_original_columns = len(self.columns)
        else:
            self.columns = self.columns[0]

        # get popped off for index
        self.orig_names = list(self.columns)

        # needs to be cleaned/refactored
        # multiple date column thing turning into a real spaghetti factory
        if not self._has_complex_date_col:
            (index_names, self.orig_names, self.columns) = (
                self._get_index_name(self.columns))
            self._name_processed = True
            if self.index_names is None:
                self.index_names = index_names

        if self.parse_dates:
            self._no_thousands_columns = self._set_no_thousands_columns()
        else:
            self._no_thousands_columns = None

        if len(self.decimal) != 1:
            raise ValueError('Only length-1 decimal markers supported')

        if self.thousands is None:
            self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
        else:
            self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
                                                          self.decimal))

    def _set_no_thousands_columns(self):
        # Create a set of column ids that are not to be stripped of
        # thousands separators.
        noconvert_columns = set()

        def _set(x):
            if is_integer(x):
                noconvert_columns.add(x)
            else:
                noconvert_columns.add(self.columns.index(x))

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    _set(k)
            elif self.index_col is not None:
                _set(self.index_col)

        return noconvert_columns

    def _make_reader(self, f):
        sep = self.delimiter

        if sep is None or len(sep) == 1:
            if self.lineterminator:
                raise ValueError('Custom line terminators not supported in '
                                 'python parser (yet)')

            class MyDialect(csv.Dialect):
                delimiter = self.delimiter
                quotechar = self.quotechar
                escapechar = self.escapechar
                doublequote = self.doublequote
                skipinitialspace = self.skipinitialspace
                quoting = self.quoting
                lineterminator = '\n'

            dia = MyDialect

            sniff_sep = True

            if sep is not None:
                sniff_sep = False
                dia.delimiter = sep
            # attempt to sniff the delimiter
            if sniff_sep:
                line = f.readline()
                while self.skipfunc(self.pos):
                    self.pos += 1
                    line = f.readline()

                line = self._check_comments([line])[0]

                self.pos += 1
                self.line_pos += 1
                sniffed = csv.Sniffer().sniff(line)
                dia.delimiter = sniffed.delimiter
                if self.encoding is not None:
                    self.buf.extend(list(
                        UnicodeReader(StringIO(line),
                                      dialect=dia,
                                      encoding=self.encoding)))
                else:
                    self.buf.extend(list(csv.reader(StringIO(line),
                                                    dialect=dia)))

            if self.encoding is not None:
                reader = UnicodeReader(f, dialect=dia,
                                       encoding=self.encoding,
                                       strict=True)
            else:
                reader = csv.reader(f, dialect=dia,
                                    strict=True)

        else:
            def _read():
                line = f.readline()
                if compat.PY2 and self.encoding:
                    line = line.decode(self.encoding)

                pat = re.compile(sep)
                yield pat.split(line.strip())
                for line in f:
                    yield pat.split(line.strip())
            reader = _read()

        self.data = reader

    def read(self, rows=None):
        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        columns = list(self.orig_names)
        if not len(content):  # pragma: no cover
            # DataFrame with the right metadata, even though it's length 0
            names = self._maybe_dedup_names(self.orig_names)
            index, columns, col_dict = _get_empty_meta(
                names, self.index_col, self.index_names, self.dtype)
            columns = self._maybe_make_multi_index_columns(
                columns, self.col_names)
            return index, columns, col_dict

        # handle new style for names in index
        count_empty_content_vals = count_empty_vals(content[0])
        indexnamerow = None
        if self.has_index_names and count_empty_content_vals == len(columns):
            indexnamerow = content[0]
            content = content[1:]

        alldata = self._rows_to_cols(content)
        data = self._exclude_implicit_index(alldata)

        columns = self._maybe_dedup_names(self.columns)
        columns, data = self._do_date_conversions(columns, data)

        data = self._convert_data(data)
        index, columns = self._make_index(data, alldata, columns, indexnamerow)

        return index, columns, data

    def _exclude_implicit_index(self, alldata):
        names = self._maybe_dedup_names(self.orig_names)

        if self._implicit_index:
            excl_indices = self.index_col

            data = {}
            offset = 0
            for i, col in enumerate(names):
                while i + offset in excl_indices:
                    offset += 1
                data[col] = alldata[i + offset]
        else:
            data = {k: v for k, v in zip(names, alldata)}

        return data

    # legacy
    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        return self.read(rows=size)

    def _convert_data(self, data):
        # apply converters
        def _clean_mapping(mapping):
            "converts col numbers to names"
            clean = {}
            for col, v in compat.iteritems(mapping):
                if isinstance(col, int) and col not in self.orig_names:
                    col = self.orig_names[col]
                clean[col] = v
            return clean

        clean_conv = _clean_mapping(self.converters)
        if not isinstance(self.dtype, dict):
            # handles single dtype applied to all columns
            clean_dtypes = self.dtype
        else:
            clean_dtypes = _clean_mapping(self.dtype)

        # Apply NA values.
        clean_na_values = {}
        clean_na_fvalues = {}

        if isinstance(self.na_values, dict):
            for col in self.na_values:
                na_value = self.na_values[col]
                na_fvalue = self.na_fvalues[col]

                if isinstance(col, int) and col not in self.orig_names:
                    col = self.orig_names[col]

                clean_na_values[col] = na_value
                clean_na_fvalues[col] = na_fvalue
        else:
            clean_na_values = self.na_values
            clean_na_fvalues = self.na_fvalues

        return self._convert_to_ndarrays(data, clean_na_values,
                                         clean_na_fvalues, self.verbose,
                                         clean_conv, clean_dtypes)

    def _infer_columns(self):
        names = self.names
        num_original_columns = 0
        clear_buffer = True
        unnamed_cols = set()

        if self.header is not None:
            header = self.header

            if isinstance(header, (list, tuple, np.ndarray)):
                have_mi_columns = len(header) > 1
                # we have a mi columns, so read an extra line
                if have_mi_columns:
                    header = list(header) + [header[-1] + 1]
            else:
                have_mi_columns = False
                header = [header]

            columns = []
            for level, hr in enumerate(header):
                try:
                    line = self._buffered_line()

                    while self.line_pos <= hr:
                        line = self._next_line()

                except StopIteration:
                    if self.line_pos < hr:
                        raise ValueError(
                            'Passed header=%s but only %d lines in file'
                            % (hr, self.line_pos + 1))

                    # We have an empty file, so check
                    # if columns are provided. That will
                    # serve as the 'line' for parsing
                    if have_mi_columns and hr > 0:
                        if clear_buffer:
                            self._clear_buffer()
                        columns.append([None] * len(columns[-1]))
                        return columns, num_original_columns, unnamed_cols

                    if not self.names:
                        raise EmptyDataError(
                            "No columns to parse from file")

                    line = self.names[:]

                this_columns = []
                this_unnamed_cols = []

                for i, c in enumerate(line):
                    if c == '':
                        if have_mi_columns:
                            col_name = ("Unnamed: {i}_level_{level}"
                                        .format(i=i, level=level))
                        else:
                            col_name = "Unnamed: {i}".format(i=i)

                        this_unnamed_cols.append(i)
                        this_columns.append(col_name)
                    else:
                        this_columns.append(c)

                if not have_mi_columns and self.mangle_dupe_cols:
                    counts = defaultdict(int)

                    for i, col in enumerate(this_columns):
                        cur_count = counts[col]

                        while cur_count > 0:
                            counts[col] = cur_count + 1
                            col = "%s.%d" % (col, cur_count)
                            cur_count = counts[col]

                        this_columns[i] = col
                        counts[col] = cur_count + 1

                elif have_mi_columns:

                    # If we have grabbed an extra line, but it's not in our
                    # format, save it in the buffer and create a blank extra
                    # line for the rest of the parsing code.
                    if hr == header[-1]:
                        lc = len(this_columns)
                        ic = (len(self.index_col)
                              if self.index_col is not None else 0)
                        unnamed_count = len(this_unnamed_cols)

                        if lc != unnamed_count and lc - ic > unnamed_count:
                            clear_buffer = False
                            this_columns = [None] * lc
                            self.buf = [self.buf[-1]]

                columns.append(this_columns)
                unnamed_cols.update({this_columns[i]
                                     for i in this_unnamed_cols})

                if len(columns) == 1:
                    num_original_columns = len(this_columns)

            if clear_buffer:
                self._clear_buffer()

            if names is not None:
                if ((self.usecols is not None and
                        len(names) != len(self.usecols)) or
                        (self.usecols is None and
                         len(names) != len(columns[0]))):
                    raise ValueError('Number of passed names did not match '
                                     'number of header fields in the file')
                if len(columns) > 1:
                    raise TypeError('Cannot pass names with multi-index '
                                    'columns')

                if self.usecols is not None:
                    # Set _use_cols. We don't store columns because they are
                    # overwritten.
                    self._handle_usecols(columns, names)
                else:
                    self._col_indices = None
                    num_original_columns = len(names)
                columns = [names]
            else:
                columns = self._handle_usecols(columns, columns[0])
        else:
            try:
                line = self._buffered_line()

            except StopIteration:
                if not names:
                    raise EmptyDataError(
                        "No columns to parse from file")

                line = names[:]

            ncols = len(line)
            num_original_columns = ncols

            if not names:
                if self.prefix:
                    columns = [['%s%d' % (self.prefix, i)
                                for i in range(ncols)]]
                else:
                    columns = [lrange(ncols)]
                columns = self._handle_usecols(columns, columns[0])
            else:
                if self.usecols is None or len(names) >= num_original_columns:
                    columns = self._handle_usecols([names], names)
                    num_original_columns = len(names)
                else:
                    if (not callable(self.usecols) and
                            len(names) != len(self.usecols)):
                        raise ValueError(
                            'Number of passed names did not match number of '
                            'header fields in the file'
                        )
                    # Ignore output but set used columns.
                    self._handle_usecols([names], names)
                    columns = [names]
                    num_original_columns = ncols

        return columns, num_original_columns, unnamed_cols

    def _handle_usecols(self, columns, usecols_key):
        """
        Sets self._col_indices

        usecols_key is used if there are string usecols.
        """
        if self.usecols is not None:
            if callable(self.usecols):
                col_indices = _evaluate_usecols(self.usecols, usecols_key)
            elif any(isinstance(u, string_types) for u in self.usecols):
                if len(columns) > 1:
                    raise ValueError("If using multiple headers, usecols must "
                                     "be integers.")
                col_indices = []

                for col in self.usecols:
                    if isinstance(col, string_types):
                        try:
                            col_indices.append(usecols_key.index(col))
                        except ValueError:
                            _validate_usecols_names(self.usecols, usecols_key)
                    else:
                        col_indices.append(col)
            else:
                col_indices = self.usecols

            columns = [[n for i, n in enumerate(column) if i in col_indices]
                       for column in columns]
            self._col_indices = col_indices
        return columns
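
    # For example, given columns=[['a', 'b', 'c', 'd']] and string usecols
    # {'b', 'd'}, _handle_usecols resolves the names to the indices {1, 3}
    # and returns [['b', 'd']], preserving the original column order.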

    def _buffered_line(self):
        """
        Return a line from buffer, filling buffer if required.
        """
        if len(self.buf) > 0:
            return self.buf[0]
        else:
            return self._next_line()

    def _check_for_bom(self, first_row):
        """
        Checks whether the file begins with the BOM character.
        If it does, remove it. In addition, if there is quoting
        in the field subsequent to the BOM, remove it as well
        because it technically takes place at the beginning of
        the name, not the middle of it.
        """
        # first_row will be a list, so we need to check
        # that the list is not empty before proceeding.
        if not first_row:
            return first_row

        # The first element of this row is the one that could have the
        # BOM that we want to remove. Check that the first element is a
        # string before proceeding.
        if not isinstance(first_row[0], compat.string_types):
            return first_row

        # Check that the string is not empty, as that would
        # obviously not have a BOM at the start of it.
        if not first_row[0]:
            return first_row

        # Since the string is non-empty, check that it does
        # in fact begin with a BOM.
        first_elt = first_row[0][0]

        # This is to avoid warnings we get in Python 2.x if
        # we find ourselves comparing with non-Unicode
        if compat.PY2 and not isinstance(first_elt, unicode):  # noqa
            try:
                first_elt = u(first_elt)
            except UnicodeDecodeError:
                return first_row

        if first_elt != _BOM:
            return first_row

        first_row = first_row[0]

        if len(first_row) > 1 and first_row[1] == self.quotechar:
            start = 2
            quote = first_row[1]
            end = first_row[2:].index(quote) + 2

            # Extract the data between the quotation marks
            new_row = first_row[start:end]

            # Extract any remaining data after the second
            # quotation mark.
            if len(first_row) > end + 1:
                new_row += first_row[end + 1:]
            return [new_row]
        elif len(first_row) > 1:
            return [first_row[1:]]
        else:
            # First row is just the BOM, so we
            # return an empty string.
            return [""]
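
    # For example, if the first field of the first row is '\ufeffcol1',
    # _check_for_bom strips the BOM and returns ['col1']; a quoted field
    # such as '\ufeff"col1"' likewise comes back as ['col1'].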

    def _is_line_empty(self, line):
        """
        Check if a line is empty or not.

        Parameters
        ----------
        line : str, array-like
            The line of data to check.

        Returns
        -------
        boolean : Whether or not the line is empty.
        """
        return not line or all(not x for x in line)
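
    # For example, _is_line_empty([]) and _is_line_empty(['', '']) are both
    # True, while _is_line_empty(['', 'x']) is False.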

    def _next_line(self):
        if isinstance(self.data, list):
            while self.skipfunc(self.pos):
                self.pos += 1

            while True:
                try:
                    line = self._check_comments([self.data[self.pos]])[0]
                    self.pos += 1
                    # either uncommented or blank to begin with
                    if (not self.skip_blank_lines and
                            (self._is_line_empty(
                                self.data[self.pos - 1]) or line)):
                        break
                    elif self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])
                        if ret:
                            line = ret[0]
                            break
                except IndexError:
                    raise StopIteration
        else:
            while self.skipfunc(self.pos):
                self.pos += 1
                next(self.data)

            while True:
                orig_line = self._next_iter_line(row_num=self.pos + 1)
                self.pos += 1

                if orig_line is not None:
                    line = self._check_comments([orig_line])[0]

                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])

                        if ret:
                            line = ret[0]
                            break
                    elif self._is_line_empty(orig_line) or line:
                        break

        # This was the first line of the file,
        # which could contain the BOM at the
        # beginning of it.
        if self.pos == 1:
            line = self._check_for_bom(line)

        self.line_pos += 1
        self.buf.append(line)
        return line

    def _alert_malformed(self, msg, row_num):
        """
        Alert a user about a malformed row.

        If `self.error_bad_lines` is True, the alert will be `ParserError`.
        If `self.warn_bad_lines` is True, the alert will be printed out.

        Parameters
        ----------
        msg : The error message to display.
        row_num : The row number where the parsing error occurred.
                  Because this row number is displayed, we 1-index,
                  even though we 0-index internally.
        """
        if self.error_bad_lines:
            raise ParserError(msg)
        elif self.warn_bad_lines:
            base = 'Skipping line {row_num}: '.format(row_num=row_num)
            sys.stderr.write(base + msg + '\n')

    def _next_iter_line(self, row_num):
        """
        Wrapper around iterating through `self.data` (CSV source).

        When a CSV error is raised, we check for specific
        error messages that allow us to customize the
        error message displayed to the user.

        Parameters
        ----------
        row_num : The row number of the line being parsed.
        """
        try:
            return next(self.data)
        except csv.Error as e:
            if self.warn_bad_lines or self.error_bad_lines:
                msg = str(e)

                if 'NULL byte' in msg:
                    msg = ('NULL byte detected. This byte '
                           'cannot be processed in Python\'s '
                           'native csv library at the moment, '
                           'so please pass in engine=\'c\' instead')

                if self.skipfooter > 0:
                    reason = ('Error could possibly be due to '
                              'parsing errors in the skipped footer rows '
                              '(the skipfooter keyword is only applied '
                              'after Python\'s csv library has parsed '
                              'all rows).')
                    msg += '. ' + reason

                self._alert_malformed(msg, row_num)
            return None

    def _check_comments(self, lines):
        if self.comment is None:
            return lines
        ret = []
        for l in lines:
            rl = []
            for x in l:
                if (not isinstance(x, compat.string_types) or
                        self.comment not in x):
                    rl.append(x)
                else:
                    x = x[:x.find(self.comment)]
                    if len(x) > 0:
                        rl.append(x)
                    break
            ret.append(rl)
        return ret

    def _remove_empty_lines(self, lines):
        """
        Iterate through the lines and remove any that are
        either empty or contain only one whitespace value

        Parameters
        ----------
        lines : array-like
            The array of lines that we are to filter.

        Returns
        -------
        filtered_lines : array-like
            The same array of lines with the "empty" ones removed.
        """
        ret = []
        for l in lines:
            # Remove empty lines and lines with only one whitespace value
            if (len(l) > 1 or len(l) == 1 and
                    (not isinstance(l[0], compat.string_types) or
                     l[0].strip())):
                ret.append(l)
        return ret
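
    # For example, _remove_empty_lines([['a'], [' '], []]) returns [['a']]:
    # the single-whitespace row and the empty row are both dropped.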

    def _check_thousands(self, lines):
        if self.thousands is None:
            return lines

        return self._search_replace_num_columns(lines=lines,
                                                search=self.thousands,
                                                replace='')

    def _search_replace_num_columns(self, lines, search, replace):
        ret = []
        for l in lines:
            rl = []
            for i, x in enumerate(l):
                if (not isinstance(x, compat.string_types) or
                        search not in x or
                        (self._no_thousands_columns and
                         i in self._no_thousands_columns) or
                        self.nonnum.search(x.strip())):
                    rl.append(x)
                else:
                    rl.append(x.replace(search, replace))
            ret.append(rl)
        return ret

    def _check_decimal(self, lines):
        if self.decimal == _parser_defaults['decimal']:
            return lines

        return self._search_replace_num_columns(lines=lines,
                                                search=self.decimal,
                                                replace='.')

    def _clear_buffer(self):
        self.buf = []

    _implicit_index = False

    def _get_index_name(self, columns):
        """
        Try several cases to get lines:

        0) There are headers on row 0 and row 1 and their
           total summed lengths equal the length of the next line.
           Treat row 0 as columns and row 1 as indices.
        1) Look for implicit index: there are more columns
           on row 1 than row 0. If this is true, assume that row
           1 lists index columns and row 0 lists normal columns.
        2) Get index from the columns if it was listed.
        """
        orig_names = list(columns)
        columns = list(columns)

        try:
            line = self._next_line()
        except StopIteration:
            line = None

        try:
            next_line = self._next_line()
        except StopIteration:
            next_line = None

        # implicitly index_col=0 b/c 1 fewer column names
        implicit_first_cols = 0
        if line is not None:
            # leave it 0, #2442
            # Case 1
            if self.index_col is not False:
                implicit_first_cols = len(line) - self.num_original_columns

            # Case 0
            if next_line is not None:
                if len(next_line) == len(line) + self.num_original_columns:
                    # column and index names on diff rows
                    self.index_col = lrange(len(line))
                    self.buf = self.buf[1:]

                    for c in reversed(line):
                        columns.insert(0, c)

                    # Update list of original names to include all indices.
                    orig_names = list(columns)
                    self.num_original_columns = len(columns)
                    return line, orig_names, columns

        if implicit_first_cols > 0:
            # Case 1
            self._implicit_index = True
            if self.index_col is None:
                self.index_col = lrange(implicit_first_cols)

            index_name = None

        else:
            # Case 2
            (index_name, columns_,
             self.index_col) = _clean_index_names(columns, self.index_col,
                                                  self.unnamed_cols)

        return index_name, orig_names, columns

    def _rows_to_cols(self, content):
        col_len = self.num_original_columns

        if self._implicit_index:
            col_len += len(self.index_col)

        max_len = max(len(row) for row in content)

        # Check that there are no rows with too many
        # elements in their row (rows with too few
        # elements are padded with NaN).
        if (max_len > col_len and
                self.index_col is not False and
                self.usecols is None):

            footers = self.skipfooter if self.skipfooter else 0
            bad_lines = []

            iter_content = enumerate(content)
            content_len = len(content)
            content = []

            for (i, l) in iter_content:
                actual_len = len(l)

                if actual_len > col_len:
                    if self.error_bad_lines or self.warn_bad_lines:
                        row_num = self.pos - (content_len - i + footers)
                        bad_lines.append((row_num, actual_len))

                        if self.error_bad_lines:
                            break
                else:
                    content.append(l)

            for row_num, actual_len in bad_lines:
                msg = ('Expected %d fields in line %d, saw %d' %
                       (col_len, row_num + 1, actual_len))
                if (self.delimiter and
                        len(self.delimiter) > 1 and
                        self.quoting != csv.QUOTE_NONE):
                    # see gh-13374
                    reason = ('Error could possibly be due to quotes being '
                              'ignored when a multi-char delimiter is used.')
                    msg += '. ' + reason
                self._alert_malformed(msg, row_num + 1)

        # see gh-13320
        zipped_content = list(lib.to_object_array(
            content, min_width=col_len).T)

        if self.usecols:
            if self._implicit_index:
                zipped_content = [
                    a for i, a in enumerate(zipped_content)
                    if (i < len(self.index_col) or
                        i - len(self.index_col) in self._col_indices)]
            else:
                zipped_content = [a for i, a in enumerate(zipped_content)
                                  if i in self._col_indices]
        return zipped_content

    def _get_lines(self, rows=None):
        lines = self.buf
        new_rows = None

        # already fetched some number
        if rows is not None:
            # we already have the lines in the buffer
            if len(self.buf) >= rows:
                new_rows, self.buf = self.buf[:rows], self.buf[rows:]

            # need some lines
            else:
                rows -= len(self.buf)

        if new_rows is None:
            if isinstance(self.data, list):
                if self.pos > len(self.data):
                    raise StopIteration
                if rows is None:
                    new_rows = self.data[self.pos:]
                    new_pos = len(self.data)
                else:
                    new_rows = self.data[self.pos:self.pos + rows]
                    new_pos = self.pos + rows

                # Check for stop rows. n.b.: self.skiprows is a set.
                if self.skiprows:
                    new_rows = [row for i, row in enumerate(new_rows)
                                if not self.skipfunc(i + self.pos)]

                lines.extend(new_rows)
                self.pos = new_pos

            else:
                new_rows = []
                try:
                    if rows is not None:
                        for _ in range(rows):
                            new_rows.append(next(self.data))
                        lines.extend(new_rows)
                    else:
                        rows = 0

                        while True:
                            new_row = self._next_iter_line(
                                row_num=self.pos + rows + 1)
                            rows += 1

                            if new_row is not None:
                                new_rows.append(new_row)

                except StopIteration:
                    if self.skiprows:
                        new_rows = [row for i, row in enumerate(new_rows)
                                    if not self.skipfunc(i + self.pos)]
                    lines.extend(new_rows)
                    if len(lines) == 0:
                        raise
                self.pos += len(new_rows)

            self.buf = []
        else:
            lines = new_rows

        if self.skipfooter:
            lines = lines[:-self.skipfooter]

        lines = self._check_comments(lines)
        if self.skip_blank_lines:
            lines = self._remove_empty_lines(lines)
        lines = self._check_thousands(lines)
        return self._check_decimal(lines)


def _make_date_converter(date_parser=None, dayfirst=False,
                         infer_datetime_format=False):
    def converter(*date_cols):
        if date_parser is None:
            strs = _concat_date_cols(date_cols)

            try:
                return tools.to_datetime(
                    ensure_object(strs),
                    utc=None,
                    box=False,
                    dayfirst=dayfirst,
                    errors='ignore',
                    infer_datetime_format=infer_datetime_format
                )
            except ValueError:
                return tools.to_datetime(
                    parsing.try_parse_dates(strs, dayfirst=dayfirst))
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*date_cols), errors='ignore')
                if isinstance(result, datetime.datetime):
                    raise Exception('scalar parser')
                return result
            except Exception:
                try:
                    return tools.to_datetime(
                        parsing.try_parse_dates(_concat_date_cols(date_cols),
                                                parser=date_parser,
                                                dayfirst=dayfirst),
                        errors='ignore')
                except Exception:
                    return generic_parser(date_parser, *date_cols)

    return converter


def _process_date_conversion(data_dict, converter, parse_spec,
                             index_col, index_names, columns,
                             keep_date_col=False):
    def _isindex(colspec):
        return ((isinstance(index_col, list) and
                 colspec in index_col) or
                (isinstance(index_names, list) and
                 colspec in index_names))

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                data_dict[colspec] = converter(data_dict[colspec])
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names)
                if new_name in data_dict:
                    raise ValueError('New date column already in dict %s' %
                                     new_name)
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in compat.iteritems(parse_spec):
            if new_name in data_dict:
                raise ValueError('Date column %s already in dict' %
                                 new_name)

            _, col, old_names = _try_convert_dates(converter, colspec,
                                                   data_dict, orig_names)

            new_data[new_name] = col
            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols


def _try_convert_dates(parser, colspec, data_dict, columns):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name = '_'.join(str(x) for x in colnames)
    to_parse = [data_dict[c] for c in colnames if c in data_dict]

    new_col = parser(*to_parse)
    return new_name, new_col, colnames
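
# For example, with colspec=['year', 'month', 'day'], _try_convert_dates
# builds the combined column name 'year_month_day' and feeds the three
# source columns to the supplied parser.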


def _clean_na_values(na_values, keep_default_na=True):

    if na_values is None:
        if keep_default_na:
            na_values = _NA_VALUES
        else:
            na_values = set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
        for k, v in compat.iteritems(old_na_values):
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | _NA_VALUES

            na_values[k] = v
        na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | _NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues
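
# For example, _clean_na_values(['-999']) registers the variants '-999',
# '-999.0' and -999 (plus the default NA strings when keep_default_na=True),
# and the float-valued set na_fvalues comes out as {-999.0}.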


def _clean_index_names(columns, index_col, unnamed_cols):
    if not _is_index_col(index_col):
        return None, columns, index_col

    columns = list(columns)

    cp_cols = list(columns)
    index_names = []

    # don't mutate
    index_col = list(index_col)

    for i, c in enumerate(index_col):
        if isinstance(c, compat.string_types):
            index_names.append(c)
            for j, name in enumerate(cp_cols):
                if name == c:
                    index_col[i] = j
                    columns.remove(name)
                    break
        else:
            name = cp_cols[c]
            columns.remove(name)
            index_names.append(name)

    # Only clean index names that were placeholders.
    for i, name in enumerate(index_names):
        if isinstance(name, compat.string_types) and name in unnamed_cols:
            index_names[i] = None

    return index_names, columns, index_col


def _get_empty_meta(columns, index_col, index_names, dtype=None):
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not isinstance(dtype, dict):
        # if dtype == None, default will be np.object.
        default_dtype = dtype or np.object
        dtype = defaultdict(lambda: default_dtype)
    else:
        # Save a copy of the dictionary.
        _dtype = dtype.copy()
        dtype = defaultdict(lambda: np.object)

        # Convert column indexes to column names.
        for k, v in compat.iteritems(_dtype):
            col = columns[k] if is_integer(k) else k
            dtype[col] = v

    # Even though we have no data, the "index" of the empty DataFrame
    # could for example still be an empty MultiIndex. Thus, we need to
    # check whether we have any index columns specified, via either:
    #
    # 1) index_col (column indices)
    # 2) index_names (column names)
    #
    # Both must be non-null to ensure a successful construction. Otherwise,
    # we have to create a generic empty Index.
    if (index_col is None or index_col is False) or index_names is None:
        index = Index([])
    else:
        data = [Series([], dtype=dtype[name]) for name in index_names]
        index = ensure_index_from_sequences(data, names=index_names)
        index_col.sort()

        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = {col_name: Series([], dtype=dtype[col_name])
                for col_name in columns}

    return index, columns, col_dict
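
# For example, _get_empty_meta(['a', 'b', 'c'], [0], ['a']) returns an empty
# index named 'a', the remaining columns ['b', 'c'], and a dict of empty
# object-dtype Series for those columns.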


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values):
    """ return stringified and numeric versions of these values """
    result = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # for integral floats (e.g. 999.0), also register
            # the "999.0" rendering and the int form
            if v == int(v):
                v = int(v)
                result.append("%s.0" % v)
                result.append(str(v))

            result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            result.append(int(x))
        except (TypeError, ValueError, OverflowError):
            pass
    return set(result)
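
# For example, _stringify_na_values(['999']) returns the set
# {'999', '999.0', 999}, so both the string and numeric renderings of the
# value are matched during parsing.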


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return _NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues
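
# Illustrative sketch (hypothetical inputs): with a dict of per-column
# NA values, a mapped column returns its own entries, while an unmapped
# column falls back to the defaults (or to empty sets when
# keep_default_na is False).
#
#   >>> _get_na_values("a", {"a": ["-999"]}, {"a": {-999.0}}, True)
#   (['-999'], {-999.0})
#   >>> _get_na_values("b", {"a": ["-999"]}, {"a": {-999.0}}, False)
#   (set(), set())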


def _get_col_names(colspec, columns):
    colset = set(columns)
    colnames = []
    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int):
            colnames.append(columns[c])
    return colnames
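
# Illustrative sketch (hypothetical inputs): entries in colspec may be
# column labels or positional indices, and both resolve to labels.
#
#   >>> _get_col_names(["a", 2], ["a", "b", "c"])
#   ['a', 'c']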


def _concat_date_cols(date_cols):
    if len(date_cols) == 1:
        if compat.PY3:
            return np.array([compat.text_type(x) for x in date_cols[0]],
                            dtype=object)
        else:
            return np.array([
                str(x) if not isinstance(x, compat.string_types) else x
                for x in date_cols[0]
            ], dtype=object)

    rs = np.array([' '.join(compat.text_type(y) for y in x)
                   for x in zip(*date_cols)], dtype=object)
    return rs
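
# Illustrative sketch (hypothetical inputs): multiple date columns are
# zipped row-wise into single space-joined strings for the date parser.
#
#   >>> _concat_date_cols(([2019, 2019], [1, 2]))
#   array(['2019 1', '2019 2'], dtype=object)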


class FixedWidthReader(BaseIterator):
    """
    A reader of fixed-width lines.
    """

    def __init__(self, f, colspecs, delimiter, comment, skiprows=None,
                 infer_nrows=100):
        self.f = f
        self.buffer = None
        self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
        self.comment = comment
        if colspecs == 'infer':
            self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows,
                                                 skiprows=skiprows)
        else:
            self.colspecs = colspecs

        if not isinstance(self.colspecs, (tuple, list)):
            raise TypeError("column specifications must be a list or tuple, "
                            "input was a %r" % type(colspecs).__name__)

        for colspec in self.colspecs:
            if not (isinstance(colspec, (tuple, list)) and
                    len(colspec) == 2 and
                    isinstance(colspec[0], (int, np.integer, type(None))) and
                    isinstance(colspec[1], (int, np.integer, type(None)))):
                raise TypeError('Each column specification must be '
                                'a 2-element tuple or list of integers')

    def get_rows(self, infer_nrows, skiprows=None):
        """
        Read rows from self.f, skipping as specified.

        We distinguish buffer_rows (the first <= infer_nrows lines,
        including skipped ones) from the detect_rows returned to
        detect_colspecs: buffering every line read here lets the rest
        of the parser apply its own skiprows logic unchanged, as if no
        rows had been consumed yet.

        Parameters
        ----------
        infer_nrows : int
            Number of rows to read from self.f, not counting
            rows that are skipped.
        skiprows : set, optional
            Indices of rows to skip.

        Returns
        -------
        detect_rows : list of str
            A list containing the rows to read.

        """
        if skiprows is None:
            skiprows = set()
        buffer_rows = []
        detect_rows = []
        for i, row in enumerate(self.f):
            if i not in skiprows:
                detect_rows.append(row)
            buffer_rows.append(row)
            if len(detect_rows) >= infer_nrows:
                break
        self.buffer = iter(buffer_rows)
        return detect_rows
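
    # Illustrative sketch of the buffering contract (hypothetical input):
    # reading ["#skip\n", "a\n", "b\n"] with skiprows={0} and
    # infer_nrows=2 returns ["a\n", "b\n"] for width detection, while
    # self.buffer replays all three lines, so downstream skiprows
    # handling still sees the file from the top.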

    def detect_colspecs(self, infer_nrows=100, skiprows=None):
        # Regex escape the delimiters
        delimiters = ''.join(r'\%s' % x for x in self.delimiter)
        pattern = re.compile('([^%s]+)' % delimiters)
        rows = self.get_rows(infer_nrows, skiprows)
        if not rows:
            raise EmptyDataError("No rows from which to infer column width")
        max_len = max(map(len, rows))
        mask = np.zeros(max_len + 1, dtype=int)
        if self.comment is not None:
            rows = [row.partition(self.comment)[0] for row in rows]
        # Mark every character position covered by a non-delimiter run
        # in any of the sampled rows.
        for row in rows:
            for m in pattern.finditer(row):
                mask[m.start():m.end()] = 1
        # XOR-ing the mask with a copy shifted right by one flags the
        # positions where runs of 1s start and end; pairing consecutive
        # edges yields the half-open column intervals.
        shifted = np.roll(mask, 1)
        shifted[0] = 0
        edges = np.where((mask ^ shifted) == 1)[0]
        edge_pairs = list(zip(edges[::2], edges[1::2]))
        return edge_pairs
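
    # Illustrative sketch (hypothetical rows): for the two sampled lines
    # "aa  bbb" and " a   bb", the union of non-delimiter runs covers
    # positions {0, 1} and {4, 5, 6}, so the inferred colspecs are
    # [(0, 2), (4, 7)].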

    def __next__(self):
        if self.buffer is not None:
            try:
                line = next(self.buffer)
            except StopIteration:
                self.buffer = None
                line = next(self.f)
        else:
            line = next(self.f)
        # Note: 'colspecs' is a sequence of half-open intervals.
        return [line[fromm:to].strip(self.delimiter)
                for (fromm, to) in self.colspecs]
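
# Illustrative usage sketch (hypothetical data; FixedWidthReader is
# normally constructed by FixedWidthFieldParser below): each iteration
# slices one line into stripped fields.
#
#   >>> reader = FixedWidthReader(StringIO("123 abc\n456 def\n"),
#   ...                           colspecs=[(0, 3), (4, 7)],
#   ...                           delimiter=None, comment=None)
#   >>> next(reader)
#   ['123', 'abc']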


class FixedWidthFieldParser(PythonParser):
    """
    Specialization that converts fixed-width fields into
    DataFrames. See PythonParser for details.
    """

    def __init__(self, f, **kwds):
        # Support iterators, convert to a list.
        self.colspecs = kwds.pop('colspecs')
        self.infer_nrows = kwds.pop('infer_nrows')
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f):
        self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
                                     self.comment, self.skiprows,
                                     self.infer_nrows)