
/pandas/io/parsers.py

http://github.com/pydata/pandas
  1. """
  2. Module contains tools for processing files into DataFrames or other objects
  3. """
  4. from collections import abc, defaultdict
  5. import csv
  6. import datetime
  7. from io import StringIO, TextIOWrapper
  8. import itertools
  9. import re
  10. import sys
  11. from textwrap import fill
  12. from typing import Any, Dict, Iterable, List, Set
  13. import warnings
  14. import numpy as np
  15. import pandas._libs.lib as lib
  16. import pandas._libs.ops as libops
  17. import pandas._libs.parsers as parsers
  18. from pandas._libs.parsers import STR_NA_VALUES
  19. from pandas._libs.tslibs import parsing
  20. from pandas._typing import FilePathOrBuffer
  21. from pandas.errors import (
  22. AbstractMethodError,
  23. EmptyDataError,
  24. ParserError,
  25. ParserWarning,
  26. )
  27. from pandas.util._decorators import Appender
  28. from pandas.core.dtypes.cast import astype_nansafe
  29. from pandas.core.dtypes.common import (
  30. ensure_object,
  31. ensure_str,
  32. is_bool_dtype,
  33. is_categorical_dtype,
  34. is_dict_like,
  35. is_dtype_equal,
  36. is_extension_array_dtype,
  37. is_file_like,
  38. is_float,
  39. is_integer,
  40. is_integer_dtype,
  41. is_list_like,
  42. is_object_dtype,
  43. is_scalar,
  44. is_string_dtype,
  45. pandas_dtype,
  46. )
  47. from pandas.core.dtypes.dtypes import CategoricalDtype
  48. from pandas.core.dtypes.missing import isna
  49. from pandas.core import algorithms
  50. from pandas.core.arrays import Categorical
  51. from pandas.core.frame import DataFrame
  52. from pandas.core.indexes.api import (
  53. Index,
  54. MultiIndex,
  55. RangeIndex,
  56. ensure_index_from_sequences,
  57. )
  58. from pandas.core.series import Series
  59. from pandas.core.tools import datetimes as tools
  60. from pandas.io.common import (
  61. get_filepath_or_buffer,
  62. get_handle,
  63. infer_compression,
  64. validate_header_arg,
  65. )
  66. from pandas.io.date_converters import generic_parser
  67. # BOM character (byte order mark)
  68. # This exists at the beginning of a file to indicate endianness
  69. # of a file (stream). Unfortunately, this marker screws up parsing,
  70. # so we need to remove it if we see it.
  71. _BOM = "\ufeff"
  72. _doc_read_csv_and_table = (
  73. r"""
  74. {summary}
  75. Also supports optionally iterating or breaking of the file
  76. into chunks.
  77. Additional help can be found in the online docs for
  78. `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
  79. Parameters
  80. ----------
  81. filepath_or_buffer : str, path object or file-like object
  82. Any valid string path is acceptable. The string could be a URL. Valid
  83. URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
  84. expected. A local file could be: file://localhost/path/to/table.csv.
  85. If you want to pass in a path object, pandas accepts any ``os.PathLike``.
  86. By file-like object, we refer to objects with a ``read()`` method, such as
  87. a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
  88. sep : str, default {_default_sep}
  89. Delimiter to use. If sep is None, the C engine cannot automatically detect
  90. the separator, but the Python parsing engine can, meaning the latter will
  91. be used and automatically detect the separator by Python's builtin sniffer
  92. tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
  93. different from ``'\s+'`` will be interpreted as regular expressions and
  94. will also force the use of the Python parsing engine. Note that regex
  95. delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
  96. delimiter : str, default ``None``
  97. Alias for sep.
  98. header : int, list of int, default 'infer'
  99. Row number(s) to use as the column names, and the start of the
  100. data. Default behavior is to infer the column names: if no names
  101. are passed the behavior is identical to ``header=0`` and column
  102. names are inferred from the first line of the file, if column
  103. names are passed explicitly then the behavior is identical to
  104. ``header=None``. Explicitly pass ``header=0`` to be able to
  105. replace existing names. The header can be a list of integers that
  106. specify row locations for a multi-index on the columns
  107. e.g. [0,1,3]. Intervening rows that are not specified will be
  108. skipped (e.g. 2 in this example is skipped). Note that this
  109. parameter ignores commented lines and empty lines if
  110. ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
  111. data rather than the first line of the file.
  112. names : array-like, optional
  113. List of column names to use. If the file contains a header row,
  114. then you should explicitly pass ``header=0`` to override the column names.
  115. Duplicates in this list are not allowed.
  116. index_col : int, str, sequence of int / str, or False, default ``None``
  117. Column(s) to use as the row labels of the ``DataFrame``, either given as
  118. string name or column index. If a sequence of int / str is given, a
  119. MultiIndex is used.
  120. Note: ``index_col=False`` can be used to force pandas to *not* use the first
  121. column as the index, e.g. when you have a malformed file with delimiters at
  122. the end of each line.
  123. usecols : list-like or callable, optional
  124. Return a subset of the columns. If list-like, all elements must either
  125. be positional (i.e. integer indices into the document columns) or strings
  126. that correspond to column names provided either by the user in `names` or
  127. inferred from the document header row(s). For example, a valid list-like
  128. `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
  129. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
  130. To instantiate a DataFrame from ``data`` with element order preserved use
  131. ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
  132. in ``['foo', 'bar']`` order or
  133. ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
  134. for ``['bar', 'foo']`` order.
  135. If callable, the callable function will be evaluated against the column
  136. names, returning names where the callable function evaluates to True. An
  137. example of a valid callable argument would be ``lambda x: x.upper() in
  138. ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
  139. parsing time and lower memory usage.
  140. squeeze : bool, default False
  141. If the parsed data only contains one column then return a Series.
  142. prefix : str, optional
  143. Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
  144. mangle_dupe_cols : bool, default True
  145. Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
  146. 'X'...'X'. Passing in False will cause data to be overwritten if there
  147. are duplicate names in the columns.
  148. dtype : Type name or dict of column -> type, optional
  149. Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
  150. 'c': 'Int64'}}
  151. Use `str` or `object` together with suitable `na_values` settings
  152. to preserve and not interpret dtype.
  153. If converters are specified, they will be applied INSTEAD
  154. of dtype conversion.
  155. engine : {{'c', 'python'}}, optional
  156. Parser engine to use. The C engine is faster while the python engine is
  157. currently more feature-complete.
  158. converters : dict, optional
  159. Dict of functions for converting values in certain columns. Keys can either
  160. be integers or column labels.
  161. true_values : list, optional
  162. Values to consider as True.
  163. false_values : list, optional
  164. Values to consider as False.
  165. skipinitialspace : bool, default False
  166. Skip spaces after delimiter.
  167. skiprows : list-like, int or callable, optional
  168. Line numbers to skip (0-indexed) or number of lines to skip (int)
  169. at the start of the file.
  170. If callable, the callable function will be evaluated against the row
  171. indices, returning True if the row should be skipped and False otherwise.
  172. An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
  173. skipfooter : int, default 0
  174. Number of lines at bottom of file to skip (Unsupported with engine='c').
  175. nrows : int, optional
  176. Number of rows of file to read. Useful for reading pieces of large files.
  177. na_values : scalar, str, list-like, or dict, optional
  178. Additional strings to recognize as NA/NaN. If dict passed, specific
  179. per-column NA values. By default the following values are interpreted as
  180. NaN: '"""
  181. + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
  182. + """'.
  183. keep_default_na : bool, default True
  184. Whether or not to include the default NaN values when parsing the data.
  185. Depending on whether `na_values` is passed in, the behavior is as follows:
  186. * If `keep_default_na` is True, and `na_values` are specified, `na_values`
  187. is appended to the default NaN values used for parsing.
  188. * If `keep_default_na` is True, and `na_values` are not specified, only
  189. the default NaN values are used for parsing.
  190. * If `keep_default_na` is False, and `na_values` are specified, only
  191. the NaN values specified `na_values` are used for parsing.
  192. * If `keep_default_na` is False, and `na_values` are not specified, no
  193. strings will be parsed as NaN.
  194. Note that if `na_filter` is passed in as False, the `keep_default_na` and
  195. `na_values` parameters will be ignored.
  196. na_filter : bool, default True
  197. Detect missing value markers (empty strings and the value of na_values). In
  198. data without any NAs, passing na_filter=False can improve the performance
  199. of reading a large file.
  200. verbose : bool, default False
  201. Indicate number of NA values placed in non-numeric columns.
  202. skip_blank_lines : bool, default True
  203. If True, skip over blank lines rather than interpreting as NaN values.
  204. parse_dates : bool or list of int or names or list of lists or dict, \
  205. default False
  206. The behavior is as follows:
  207. * boolean. If True -> try parsing the index.
  208. * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
  209. each as a separate date column.
  210. * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
  211. a single date column.
  212. * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
  213. result 'foo'
  214. If a column or index cannot be represented as an array of datetimes,
  215. say because of an unparseable value or a mixture of timezones, the column
  216. or index will be returned unaltered as an object data type. For
  217. non-standard datetime parsing, use ``pd.to_datetime`` after
  218. ``pd.read_csv``. To parse an index or column with a mixture of timezones,
  219. specify ``date_parser`` to be a partially-applied
  220. :func:`pandas.to_datetime` with ``utc=True``. See
  221. :ref:`io.csv.mixed_timezones` for more.
  222. Note: A fast-path exists for iso8601-formatted dates.
  223. infer_datetime_format : bool, default False
  224. If True and `parse_dates` is enabled, pandas will attempt to infer the
  225. format of the datetime strings in the columns, and if it can be inferred,
  226. switch to a faster method of parsing them. In some cases this can increase
  227. the parsing speed by 5-10x.
  228. keep_date_col : bool, default False
  229. If True and `parse_dates` specifies combining multiple columns then
  230. keep the original columns.
  231. date_parser : function, optional
  232. Function to use for converting a sequence of string columns to an array of
  233. datetime instances. The default uses ``dateutil.parser.parser`` to do the
  234. conversion. Pandas will try to call `date_parser` in three different ways,
  235. advancing to the next if an exception occurs: 1) Pass one or more arrays
  236. (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
  237. string values from the columns defined by `parse_dates` into a single array
  238. and pass that; and 3) call `date_parser` once for each row using one or
  239. more strings (corresponding to the columns defined by `parse_dates`) as
  240. arguments.
  241. dayfirst : bool, default False
  242. DD/MM format dates, international and European format.
  243. cache_dates : bool, default True
  244. If True, use a cache of unique, converted dates to apply the datetime
  245. conversion. May produce significant speed-up when parsing duplicate
  246. date strings, especially ones with timezone offsets.
  247. .. versionadded:: 0.25.0
  248. iterator : bool, default False
  249. Return TextFileReader object for iteration or getting chunks with
  250. ``get_chunk()``.
  251. chunksize : int, optional
  252. Return TextFileReader object for iteration.
  253. See the `IO Tools docs
  254. <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
  255. for more information on ``iterator`` and ``chunksize``.
  256. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
  257. For on-the-fly decompression of on-disk data. If 'infer' and
  258. `filepath_or_buffer` is path-like, then detect compression from the
  259. following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
  260. decompression). If using 'zip', the ZIP file must contain only one data
  261. file to be read in. Set to None for no decompression.
  262. thousands : str, optional
  263. Thousands separator.
  264. decimal : str, default '.'
  265. Character to recognize as decimal point (e.g. use ',' for European data).
  266. lineterminator : str (length 1), optional
  267. Character to break file into lines. Only valid with C parser.
  268. quotechar : str (length 1), optional
  269. The character used to denote the start and end of a quoted item. Quoted
  270. items can include the delimiter and it will be ignored.
  271. quoting : int or csv.QUOTE_* instance, default 0
  272. Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
  273. QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
  274. doublequote : bool, default ``True``
  275. When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
  276. whether or not to interpret two consecutive quotechar elements INSIDE a
  277. field as a single ``quotechar`` element.
  278. escapechar : str (length 1), optional
  279. One-character string used to escape other characters.
  280. comment : str, optional
  281. Indicates remainder of line should not be parsed. If found at the beginning
  282. of a line, the line will be ignored altogether. This parameter must be a
  283. single character. Like empty lines (as long as ``skip_blank_lines=True``),
  284. fully commented lines are ignored by the parameter `header` but not by
  285. `skiprows`. For example, if ``comment='#'``, parsing
  286. ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
  287. treated as the header.
  288. encoding : str, optional
  289. Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
  290. standard encodings
  291. <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
  292. dialect : str or csv.Dialect, optional
  293. If provided, this parameter will override values (default or not) for the
  294. following parameters: `delimiter`, `doublequote`, `escapechar`,
  295. `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
  296. override values, a ParserWarning will be issued. See csv.Dialect
  297. documentation for more details.
  298. error_bad_lines : bool, default True
  299. Lines with too many fields (e.g. a csv line with too many commas) will by
  300. default cause an exception to be raised, and no DataFrame will be returned.
301. If False, then these "bad lines" will be dropped from the DataFrame that is
  302. returned.
  303. warn_bad_lines : bool, default True
  304. If error_bad_lines is False, and warn_bad_lines is True, a warning for each
  305. "bad line" will be output.
  306. delim_whitespace : bool, default False
  307. Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
  308. used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
  309. is set to True, nothing should be passed in for the ``delimiter``
  310. parameter.
  311. low_memory : bool, default True
  312. Internally process the file in chunks, resulting in lower memory use
  313. while parsing, but possibly mixed type inference. To ensure no mixed
  314. types either set False, or specify the type with the `dtype` parameter.
  315. Note that the entire file is read into a single DataFrame regardless,
  316. use the `chunksize` or `iterator` parameter to return the data in chunks.
  317. (Only valid with C parser).
  318. memory_map : bool, default False
  319. If a filepath is provided for `filepath_or_buffer`, map the file object
  320. directly onto memory and access the data directly from there. Using this
  321. option can improve performance because there is no longer any I/O overhead.
  322. float_precision : str, optional
  323. Specifies which converter the C engine should use for floating-point
  324. values. The options are `None` for the ordinary converter,
  325. `high` for the high-precision converter, and `round_trip` for the
  326. round-trip converter.
  327. Returns
  328. -------
  329. DataFrame or TextParser
330. A comma-separated values (csv) file is returned as a two-dimensional
  331. data structure with labeled axes.
  332. See Also
  333. --------
  334. DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
  335. read_csv : Read a comma-separated values (csv) file into DataFrame.
  336. read_fwf : Read a table of fixed-width formatted lines into DataFrame.
  337. Examples
  338. --------
  339. >>> pd.{func_name}('data.csv') # doctest: +SKIP
  340. """
  341. )
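# The template above is attached (via ``Appender``) to both ``read_csv`` and
# ``read_table`` further below. A brief, illustrative sketch of a few of the
# documented parameters; 'data.csv' is a hypothetical file, not part of this module:
#
#   >>> import pandas as pd
#   >>> df = pd.read_csv(
#   ...     "data.csv",
#   ...     usecols=["foo", "bar"],   # element order is ignored
#   ...     dtype={"foo": "Int64"},   # per-column dtype
#   ...     parse_dates=["bar"],      # parse column 'bar' as datetimes
#   ...     na_values=["n/a"],        # extra strings to treat as NaN
#   ... )  # doctest: +SKIP
#   >>> df = df[["foo", "bar"]]  # reselect to guarantee column order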
  342. def _validate_integer(name, val, min_val=0):
  343. """
  344. Checks whether the 'name' parameter for parsing is either
  345. an integer OR float that can SAFELY be cast to an integer
  346. without losing accuracy. Raises a ValueError if that is
  347. not the case.
  348. Parameters
  349. ----------
  350. name : string
  351. Parameter name (used for error reporting)
  352. val : int or float
  353. The value to check
  354. min_val : int
  355. Minimum allowed value (val < min_val will result in a ValueError)
  356. """
  357. msg = f"'{name:s}' must be an integer >={min_val:d}"
  358. if val is not None:
  359. if is_float(val):
  360. if int(val) != val:
  361. raise ValueError(msg)
  362. val = int(val)
  363. elif not (is_integer(val) and val >= min_val):
  364. raise ValueError(msg)
  365. return val
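# Illustrative behavior of ``_validate_integer`` (assumed values, not a doctest run):
#
#   >>> _validate_integer("nrows", 3.0)               # float that casts safely -> 3
#   >>> _validate_integer("nrows", 3.5)               # raises ValueError
#   >>> _validate_integer("chunksize", 0, min_val=1)  # raises ValueError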
  366. def _validate_names(names):
  367. """
  368. Raise ValueError if the `names` parameter contains duplicates.
  369. Parameters
  370. ----------
  371. names : array-like or None
  372. An array containing a list of the names used for the output DataFrame.
  373. Raises
  374. ------
  375. ValueError
  376. If names are not unique.
  377. """
  378. if names is not None:
  379. if len(names) != len(set(names)):
  380. raise ValueError("Duplicate names are not allowed.")
  381. def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
  382. """Generic reader of line files."""
  383. encoding = kwds.get("encoding", None)
  384. if encoding is not None:
  385. encoding = re.sub("_", "-", encoding).lower()
  386. kwds["encoding"] = encoding
  387. compression = kwds.get("compression", "infer")
  388. compression = infer_compression(filepath_or_buffer, compression)
  389. # TODO: get_filepath_or_buffer could return
  390. # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
  391. # though mypy handling of conditional imports is difficult.
  392. # See https://github.com/python/mypy/issues/1297
  393. fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
  394. filepath_or_buffer, encoding, compression
  395. )
  396. kwds["compression"] = compression
  397. if kwds.get("date_parser", None) is not None:
  398. if isinstance(kwds["parse_dates"], bool):
  399. kwds["parse_dates"] = True
  400. # Extract some of the arguments (pass chunksize on).
  401. iterator = kwds.get("iterator", False)
  402. chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1)
  403. nrows = kwds.get("nrows", None)
  404. # Check for duplicates in names.
  405. _validate_names(kwds.get("names", None))
  406. # Create the parser.
  407. parser = TextFileReader(fp_or_buf, **kwds)
  408. if chunksize or iterator:
  409. return parser
  410. try:
  411. data = parser.read(nrows)
  412. finally:
  413. parser.close()
  414. if should_close:
  415. try:
  416. fp_or_buf.close()
  417. except ValueError:
  418. pass
  419. return data
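# ``_read`` is the shared driver behind ``read_csv``, ``read_table`` and ``read_fwf``.
# Hedged sketch of its two return paths ('big.csv' is a hypothetical file):
#
#   >>> pd.read_csv("big.csv")                               # doctest: +SKIP
#   ... # parses everything and returns a DataFrame
#   >>> reader = pd.read_csv("big.csv", chunksize=10_000)    # doctest: +SKIP
#   ... # returns the TextFileReader itself; each iteration yields a DataFrame chunk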
  420. _parser_defaults = {
  421. "delimiter": None,
  422. "escapechar": None,
  423. "quotechar": '"',
  424. "quoting": csv.QUOTE_MINIMAL,
  425. "doublequote": True,
  426. "skipinitialspace": False,
  427. "lineterminator": None,
  428. "header": "infer",
  429. "index_col": None,
  430. "names": None,
  431. "prefix": None,
  432. "skiprows": None,
  433. "skipfooter": 0,
  434. "nrows": None,
  435. "na_values": None,
  436. "keep_default_na": True,
  437. "true_values": None,
  438. "false_values": None,
  439. "converters": None,
  440. "dtype": None,
  441. "cache_dates": True,
  442. "thousands": None,
  443. "comment": None,
  444. "decimal": ".",
  445. # 'engine': 'c',
  446. "parse_dates": False,
  447. "keep_date_col": False,
  448. "dayfirst": False,
  449. "date_parser": None,
  450. "usecols": None,
  451. # 'iterator': False,
  452. "chunksize": None,
  453. "verbose": False,
  454. "encoding": None,
  455. "squeeze": False,
  456. "compression": None,
  457. "mangle_dupe_cols": True,
  458. "infer_datetime_format": False,
  459. "skip_blank_lines": True,
  460. }
  461. _c_parser_defaults = {
  462. "delim_whitespace": False,
  463. "na_filter": True,
  464. "low_memory": True,
  465. "memory_map": False,
  466. "error_bad_lines": True,
  467. "warn_bad_lines": True,
  468. "float_precision": None,
  469. }
  470. _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
  471. _c_unsupported = {"skipfooter"}
  472. _python_unsupported = {"low_memory", "float_precision"}
  473. _deprecated_defaults: Dict[str, Any] = {}
  474. _deprecated_args: Set[str] = set()
  475. def _make_parser_function(name, default_sep=","):
  476. def parser_f(
  477. filepath_or_buffer: FilePathOrBuffer,
  478. sep=default_sep,
  479. delimiter=None,
  480. # Column and Index Locations and Names
  481. header="infer",
  482. names=None,
  483. index_col=None,
  484. usecols=None,
  485. squeeze=False,
  486. prefix=None,
  487. mangle_dupe_cols=True,
  488. # General Parsing Configuration
  489. dtype=None,
  490. engine=None,
  491. converters=None,
  492. true_values=None,
  493. false_values=None,
  494. skipinitialspace=False,
  495. skiprows=None,
  496. skipfooter=0,
  497. nrows=None,
  498. # NA and Missing Data Handling
  499. na_values=None,
  500. keep_default_na=True,
  501. na_filter=True,
  502. verbose=False,
  503. skip_blank_lines=True,
  504. # Datetime Handling
  505. parse_dates=False,
  506. infer_datetime_format=False,
  507. keep_date_col=False,
  508. date_parser=None,
  509. dayfirst=False,
  510. cache_dates=True,
  511. # Iteration
  512. iterator=False,
  513. chunksize=None,
  514. # Quoting, Compression, and File Format
  515. compression="infer",
  516. thousands=None,
  517. decimal: str = ".",
  518. lineterminator=None,
  519. quotechar='"',
  520. quoting=csv.QUOTE_MINIMAL,
  521. doublequote=True,
  522. escapechar=None,
  523. comment=None,
  524. encoding=None,
  525. dialect=None,
  526. # Error Handling
  527. error_bad_lines=True,
  528. warn_bad_lines=True,
  529. # Internal
  530. delim_whitespace=False,
  531. low_memory=_c_parser_defaults["low_memory"],
  532. memory_map=False,
  533. float_precision=None,
  534. ):
  535. # gh-23761
  536. #
  537. # When a dialect is passed, it overrides any of the overlapping
  538. # parameters passed in directly. We don't want to warn if the
  539. # default parameters were passed in (since it probably means
  540. # that the user didn't pass them in explicitly in the first place).
  541. #
  542. # "delimiter" is the annoying corner case because we alias it to
  543. # "sep" before doing comparison to the dialect values later on.
  544. # Thus, we need a flag to indicate that we need to "override"
  545. # the comparison to dialect values by checking if default values
  546. # for BOTH "delimiter" and "sep" were provided.
  547. if dialect is not None:
  548. sep_override = delimiter is None and sep == default_sep
  549. kwds = dict(sep_override=sep_override)
  550. else:
  551. kwds = dict()
  552. # Alias sep -> delimiter.
  553. if delimiter is None:
  554. delimiter = sep
  555. if delim_whitespace and delimiter != default_sep:
  556. raise ValueError(
  557. "Specified a delimiter with both sep and "
  558. "delim_whitespace=True; you can only specify one."
  559. )
  560. if engine is not None:
  561. engine_specified = True
  562. else:
  563. engine = "c"
  564. engine_specified = False
  565. kwds.update(
  566. delimiter=delimiter,
  567. engine=engine,
  568. dialect=dialect,
  569. compression=compression,
  570. engine_specified=engine_specified,
  571. doublequote=doublequote,
  572. escapechar=escapechar,
  573. quotechar=quotechar,
  574. quoting=quoting,
  575. skipinitialspace=skipinitialspace,
  576. lineterminator=lineterminator,
  577. header=header,
  578. index_col=index_col,
  579. names=names,
  580. prefix=prefix,
  581. skiprows=skiprows,
  582. skipfooter=skipfooter,
  583. na_values=na_values,
  584. true_values=true_values,
  585. false_values=false_values,
  586. keep_default_na=keep_default_na,
  587. thousands=thousands,
  588. comment=comment,
  589. decimal=decimal,
  590. parse_dates=parse_dates,
  591. keep_date_col=keep_date_col,
  592. dayfirst=dayfirst,
  593. date_parser=date_parser,
  594. cache_dates=cache_dates,
  595. nrows=nrows,
  596. iterator=iterator,
  597. chunksize=chunksize,
  598. converters=converters,
  599. dtype=dtype,
  600. usecols=usecols,
  601. verbose=verbose,
  602. encoding=encoding,
  603. squeeze=squeeze,
  604. memory_map=memory_map,
  605. float_precision=float_precision,
  606. na_filter=na_filter,
  607. delim_whitespace=delim_whitespace,
  608. warn_bad_lines=warn_bad_lines,
  609. error_bad_lines=error_bad_lines,
  610. low_memory=low_memory,
  611. mangle_dupe_cols=mangle_dupe_cols,
  612. infer_datetime_format=infer_datetime_format,
  613. skip_blank_lines=skip_blank_lines,
  614. )
  615. return _read(filepath_or_buffer, kwds)
  616. parser_f.__name__ = name
  617. return parser_f
  618. read_csv = _make_parser_function("read_csv", default_sep=",")
  619. read_csv = Appender(
  620. _doc_read_csv_and_table.format(
  621. func_name="read_csv",
  622. summary="Read a comma-separated values (csv) file into DataFrame.",
  623. _default_sep="','",
  624. )
  625. )(read_csv)
  626. read_table = _make_parser_function("read_table", default_sep="\t")
  627. read_table = Appender(
  628. _doc_read_csv_and_table.format(
  629. func_name="read_table",
  630. summary="Read general delimited file into DataFrame.",
  631. _default_sep=r"'\\t' (tab-stop)",
  632. )
  633. )(read_table)
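# ``read_csv`` and ``read_table`` are built from the same factory and differ only
# in their default separator. Illustrative equivalence ('data.tsv' is hypothetical):
#
#   >>> pd.read_table("data.tsv")            # doctest: +SKIP
#   >>> pd.read_csv("data.tsv", sep="\t")    # doctest: +SKIP  (same result)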
  634. def read_fwf(
  635. filepath_or_buffer: FilePathOrBuffer,
  636. colspecs="infer",
  637. widths=None,
  638. infer_nrows=100,
  639. **kwds,
  640. ):
  641. r"""
  642. Read a table of fixed-width formatted lines into DataFrame.
  643. Also supports optionally iterating or breaking of the file
  644. into chunks.
  645. Additional help can be found in the `online docs for IO Tools
  646. <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
  647. Parameters
  648. ----------
  649. filepath_or_buffer : str, path object or file-like object
  650. Any valid string path is acceptable. The string could be a URL. Valid
  651. URL schemes include http, ftp, s3, and file. For file URLs, a host is
  652. expected. A local file could be:
  653. ``file://localhost/path/to/table.csv``.
  654. If you want to pass in a path object, pandas accepts any
  655. ``os.PathLike``.
  656. By file-like object, we refer to objects with a ``read()`` method,
  657. such as a file handler (e.g. via builtin ``open`` function)
  658. or ``StringIO``.
659. colspecs : list of tuple (int, int) or 'infer', optional
  660. A list of tuples giving the extents of the fixed-width
  661. fields of each line as half-open intervals (i.e., [from, to[ ).
  662. String value 'infer' can be used to instruct the parser to try
  663. detecting the column specifications from the first 100 rows of
  664. the data which are not being skipped via skiprows (default='infer').
  665. widths : list of int, optional
  666. A list of field widths which can be used instead of 'colspecs' if
  667. the intervals are contiguous.
  668. infer_nrows : int, default 100
  669. The number of rows to consider when letting the parser determine the
  670. `colspecs`.
  671. .. versionadded:: 0.24.0
  672. **kwds : optional
  673. Optional keyword arguments can be passed to ``TextFileReader``.
  674. Returns
  675. -------
  676. DataFrame or TextParser
677. A comma-separated values (csv) file is returned as a two-dimensional
  678. data structure with labeled axes.
  679. See Also
  680. --------
  681. DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
  682. read_csv : Read a comma-separated values (csv) file into DataFrame.
  683. Examples
  684. --------
  685. >>> pd.read_fwf('data.csv') # doctest: +SKIP
  686. """
  687. # Check input arguments.
  688. if colspecs is None and widths is None:
  689. raise ValueError("Must specify either colspecs or widths")
  690. elif colspecs not in (None, "infer") and widths is not None:
  691. raise ValueError("You must specify only one of 'widths' and 'colspecs'")
  692. # Compute 'colspecs' from 'widths', if specified.
  693. if widths is not None:
  694. colspecs, col = [], 0
  695. for w in widths:
  696. colspecs.append((col, col + w))
  697. col += w
  698. kwds["colspecs"] = colspecs
  699. kwds["infer_nrows"] = infer_nrows
  700. kwds["engine"] = "python-fwf"
  701. return _read(filepath_or_buffer, kwds)
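# Illustrative use of ``read_fwf`` ('data.txt' is a hypothetical fixed-width file
# whose first 8 characters hold one column and the next 4 the second):
#
#   >>> pd.read_fwf("data.txt", colspecs=[(0, 8), (8, 12)])   # doctest: +SKIP
#   >>> pd.read_fwf("data.txt", widths=[8, 4])                # doctest: +SKIP  (equivalent)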
  702. class TextFileReader(abc.Iterator):
  703. """
  704. Passed dialect overrides any of the related parser options
  705. """
  706. def __init__(self, f, engine=None, **kwds):
  707. self.f = f
  708. if engine is not None:
  709. engine_specified = True
  710. else:
  711. engine = "python"
  712. engine_specified = False
  713. self._engine_specified = kwds.get("engine_specified", engine_specified)
  714. if kwds.get("dialect") is not None:
  715. dialect = kwds["dialect"]
  716. if dialect in csv.list_dialects():
  717. dialect = csv.get_dialect(dialect)
  718. # Any valid dialect should have these attributes.
  719. # If any are missing, we will raise automatically.
  720. for param in (
  721. "delimiter",
  722. "doublequote",
  723. "escapechar",
  724. "skipinitialspace",
  725. "quotechar",
  726. "quoting",
  727. ):
  728. try:
  729. dialect_val = getattr(dialect, param)
  730. except AttributeError as err:
  731. raise ValueError(
  732. f"Invalid dialect {kwds['dialect']} provided"
  733. ) from err
  734. parser_default = _parser_defaults[param]
  735. provided = kwds.get(param, parser_default)
  736. # Messages for conflicting values between the dialect
  737. # instance and the actual parameters provided.
  738. conflict_msgs = []
  739. # Don't warn if the default parameter was passed in,
  740. # even if it conflicts with the dialect (gh-23761).
  741. if provided != parser_default and provided != dialect_val:
  742. msg = (
  743. f"Conflicting values for '{param}': '{provided}' was "
  744. f"provided, but the dialect specifies '{dialect_val}'. "
  745. "Using the dialect-specified value."
  746. )
  747. # Annoying corner case for not warning about
  748. # conflicts between dialect and delimiter parameter.
  749. # Refer to the outer "_read_" function for more info.
  750. if not (param == "delimiter" and kwds.pop("sep_override", False)):
  751. conflict_msgs.append(msg)
  752. if conflict_msgs:
  753. warnings.warn(
  754. "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
  755. )
  756. kwds[param] = dialect_val
  757. if kwds.get("skipfooter"):
  758. if kwds.get("iterator") or kwds.get("chunksize"):
  759. raise ValueError("'skipfooter' not supported for 'iteration'")
  760. if kwds.get("nrows"):
  761. raise ValueError("'skipfooter' not supported with 'nrows'")
  762. if kwds.get("header", "infer") == "infer":
  763. kwds["header"] = 0 if kwds.get("names") is None else None
  764. self.orig_options = kwds
  765. # miscellanea
  766. self.engine = engine
  767. self._engine = None
  768. self._currow = 0
  769. options = self._get_options_with_defaults(engine)
  770. self.chunksize = options.pop("chunksize", None)
  771. self.nrows = options.pop("nrows", None)
  772. self.squeeze = options.pop("squeeze", False)
  773. # might mutate self.engine
  774. self.engine = self._check_file_or_buffer(f, engine)
  775. self.options, self.engine = self._clean_options(options, engine)
  776. if "has_index_names" in kwds:
  777. self.options["has_index_names"] = kwds["has_index_names"]
  778. self._make_engine(self.engine)
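# Sketch of the dialect handling above (gh-23761): values explicitly passed
# alongside ``dialect`` trigger a ParserWarning and are replaced by the dialect's
# values, while untouched defaults are overridden silently. Hypothetical example:
#
#   >>> import csv
#   >>> class SemiDialect(csv.Dialect):
#   ...     delimiter = ";"
#   ...     quotechar = '"'
#   ...     doublequote = True
#   ...     skipinitialspace = False
#   ...     lineterminator = "\n"
#   ...     quoting = csv.QUOTE_MINIMAL
#   >>> pd.read_csv("data.csv", dialect=SemiDialect())                  # doctest: +SKIP
#   >>> pd.read_csv("data.csv", dialect=SemiDialect(), quotechar="'")   # doctest: +SKIP
#   ... # the second call warns and still uses the dialect's quotechar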
  779. def close(self):
  780. self._engine.close()
  781. def _get_options_with_defaults(self, engine):
  782. kwds = self.orig_options
  783. options = {}
  784. for argname, default in _parser_defaults.items():
  785. value = kwds.get(argname, default)
  786. # see gh-12935
  787. if argname == "mangle_dupe_cols" and not value:
  788. raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
  789. else:
  790. options[argname] = value
  791. for argname, default in _c_parser_defaults.items():
  792. if argname in kwds:
  793. value = kwds[argname]
  794. if engine != "c" and value != default:
  795. if "python" in engine and argname not in _python_unsupported:
  796. pass
  797. elif value == _deprecated_defaults.get(argname, default):
  798. pass
  799. else:
  800. raise ValueError(
  801. f"The {repr(argname)} option is not supported with the "
  802. f"{repr(engine)} engine"
  803. )
  804. else:
  805. value = _deprecated_defaults.get(argname, default)
  806. options[argname] = value
  807. if engine == "python-fwf":
  808. for argname, default in _fwf_defaults.items():
  809. options[argname] = kwds.get(argname, default)
  810. return options
  811. def _check_file_or_buffer(self, f, engine):
  812. # see gh-16530
  813. if is_file_like(f):
  814. next_attr = "__next__"
  815. # The C engine doesn't need the file-like to have the "next" or
  816. # "__next__" attribute. However, the Python engine explicitly calls
  817. # "next(...)" when iterating through such an object, meaning it
  818. # needs to have that attribute ("next" for Python 2.x, "__next__"
  819. # for Python 3.x)
  820. if engine != "c" and not hasattr(f, next_attr):
  821. msg = "The 'python' engine cannot iterate through this file buffer."
  822. raise ValueError(msg)
  823. return engine
  824. def _clean_options(self, options, engine):
  825. result = options.copy()
  826. engine_specified = self._engine_specified
  827. fallback_reason = None
  828. sep = options["delimiter"]
  829. delim_whitespace = options["delim_whitespace"]
  830. # C engine not supported yet
  831. if engine == "c":
  832. if options["skipfooter"] > 0:
  833. fallback_reason = "the 'c' engine does not support skipfooter"
  834. engine = "python"
  835. encoding = sys.getfilesystemencoding() or "utf-8"
  836. if sep is None and not delim_whitespace:
  837. if engine == "c":
  838. fallback_reason = (
  839. "the 'c' engine does not support "
  840. "sep=None with delim_whitespace=False"
  841. )
  842. engine = "python"
  843. elif sep is not None and len(sep) > 1:
  844. if engine == "c" and sep == r"\s+":
  845. result["delim_whitespace"] = True
  846. del result["delimiter"]
  847. elif engine not in ("python", "python-fwf"):
  848. # wait until regex engine integrated
  849. fallback_reason = (
  850. "the 'c' engine does not support "
  851. "regex separators (separators > 1 char and "
  852. r"different from '\s+' are interpreted as regex)"
  853. )
  854. engine = "python"
  855. elif delim_whitespace:
  856. if "python" in engine:
  857. result["delimiter"] = r"\s+"
  858. elif sep is not None:
  859. encodeable = True
  860. try:
  861. if len(sep.encode(encoding)) > 1:
  862. encodeable = False
  863. except UnicodeDecodeError:
  864. encodeable = False
  865. if not encodeable and engine not in ("python", "python-fwf"):
  866. fallback_reason = (
  867. f"the separator encoded in {encoding} "
  868. "is > 1 char long, and the 'c' engine "
  869. "does not support such separators"
  870. )
  871. engine = "python"
  872. quotechar = options["quotechar"]
  873. if quotechar is not None and isinstance(quotechar, (str, bytes)):
  874. if (
  875. len(quotechar) == 1
  876. and ord(quotechar) > 127
  877. and engine not in ("python", "python-fwf")
  878. ):
  879. fallback_reason = (
  880. "ord(quotechar) > 127, meaning the "
  881. "quotechar is larger than one byte, "
  882. "and the 'c' engine does not support such quotechars"
  883. )
  884. engine = "python"
  885. if fallback_reason and engine_specified:
  886. raise ValueError(fallback_reason)
  887. if engine == "c":
  888. for arg in _c_unsupported:
  889. del result[arg]
  890. if "python" in engine:
  891. for arg in _python_unsupported:
  892. if fallback_reason and result[arg] != _c_parser_defaults[arg]:
  893. raise ValueError(
  894. "Falling back to the 'python' engine because "
  895. f"{fallback_reason}, but this causes {repr(arg)} to be "
  896. "ignored as it is not supported by the 'python' engine."
  897. )
  898. del result[arg]
  899. if fallback_reason:
  900. warnings.warn(
  901. (
  902. "Falling back to the 'python' engine because "
  903. f"{fallback_reason}; you can avoid this warning by specifying "
  904. "engine='python'."
  905. ),
  906. ParserWarning,
  907. stacklevel=5,
  908. )
  909. index_col = options["index_col"]
  910. names = options["names"]
  911. converters = options["converters"]
  912. na_values = options["na_values"]
  913. skiprows = options["skiprows"]
  914. validate_header_arg(options["header"])
  915. depr_warning = ""
  916. for arg in _deprecated_args:
  917. parser_default = _c_parser_defaults[arg]
  918. depr_default = _deprecated_defaults[arg]
  919. msg = (
  920. f"The {repr(arg)} argument has been deprecated and will be "
  921. "removed in a future version."
  922. )
  923. if result.get(arg, depr_default) != depr_default:
  924. depr_warning += msg + "\n\n"
  925. else:
  926. result[arg] = parser_default
  927. if depr_warning != "":
  928. warnings.warn(depr_warning, FutureWarning, stacklevel=2)
  929. if index_col is True:
  930. raise ValueError("The value of index_col couldn't be 'True'")
  931. if _is_index_col(index_col):
  932. if not isinstance(index_col, (list, tuple, np.ndarray)):
  933. index_col = [index_col]
  934. result["index_col"] = index_col
  935. names = list(names) if names is not None else names
  936. # type conversion-related
  937. if converters is not None:
  938. if not isinstance(converters, dict):
  939. raise TypeError(
  940. "Type converters must be a dict or subclass, "
  941. f"input was a {type(converters).__name__}"
  942. )
  943. else:
  944. converters = {}
  945. # Converting values to NA
  946. keep_default_na = options["keep_default_na"]
  947. na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
  948. # handle skiprows; this is internally handled by the
  949. # c-engine, so only need for python parsers
  950. if engine != "c":
  951. if is_integer(skiprows):
  952. skiprows = list(range(skiprows))
  953. if skiprows is None:
  954. skiprows = set()
  955. elif not callable(skiprows):
  956. skiprows = set(skiprows)
  957. # put stuff back
  958. result["names"] = names
  959. result["converters"] = converters
  960. result["na_values"] = na_values
  961. result["na_fvalues"] = na_fvalues
  962. result["skiprows"] = skiprows
  963. return result, engine
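# Sketch of the engine fallback decided above: a multi-character / regex separator
# silently falls back to the python engine with a ParserWarning, unless engine='c'
# was requested explicitly, in which case a ValueError is raised instead.
#
#   >>> pd.read_csv("data.csv", sep=r"\s*;\s*")               # doctest: +SKIP  (warns, python engine)
#   >>> pd.read_csv("data.csv", sep=r"\s*;\s*", engine="c")   # doctest: +SKIP  (ValueError)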
  964. def __next__(self):
  965. try:
  966. return self.get_chunk()
  967. except StopIteration:
  968. self.close()
  969. raise
  970. def _make_engine(self, engine="c"):
  971. if engine == "c":
  972. self._engine = CParserWrapper(self.f, **self.options)
  973. else:
  974. if engine == "python":
  975. klass = PythonParser
  976. elif engine == "python-fwf":
  977. klass = FixedWidthFieldParser
  978. else:
  979. raise ValueError(
  980. f"Unknown engine: {engine} (valid options "
  981. 'are "c", "python", or "python-fwf")'
  982. )
  983. self._engine = klass(self.f, **self.options)
  984. def _failover_to_python(self):
  985. raise AbstractMethodError(self)
  986. def read(self, nrows=None):
  987. nrows = _validate_integer("nrows", nrows)
  988. ret = self._engine.read(nrows)
  989. # May alter columns / col_dict
  990. index, columns, col_dict = self._create_index(ret)
  991. if index is None:
  992. if col_dict:
  993. # Any column is actually fine:
  994. new_rows = len(next(iter(col_dict.values())))
  995. index = RangeIndex(self._currow, self._currow + new_rows)
  996. else:
  997. new_rows = 0
  998. else:
  999. new_rows = len(index)
  1000. df = DataFrame(col_dict, columns=columns, index=index)
  1001. self._currow += new_rows
  1002. if self.squeeze and len(df.columns) == 1:
  1003. return df[df.columns[0]].copy()
  1004. return df
  1005. def _create_index(self, ret):
  1006. index, columns, col_dict = ret
  1007. return index, columns, col_dict
  1008. def get_chunk(self, size=None):
  1009. if size is None:
  1010. size = self.chunksize
  1011. if self.nrows is not None:
  1012. if self._currow >= self.nrows:
  1013. raise StopIteration
  1014. size = min(size, self.nrows - self._currow)
  1015. return self.read(nrows=size)
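# Illustrative chunked access through a TextFileReader ('big.csv' is hypothetical):
#
#   >>> reader = pd.read_csv("big.csv", chunksize=1000)   # doctest: +SKIP
#   >>> first = reader.get_chunk(5)    # next 5 rows as a DataFrame
#   >>> rest = reader.read()           # remaining rows
#   >>> reader.close()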
  1016. def _is_index_col(col):
  1017. return col is not None and col is not False
  1018. def _is_potential_multi_index(columns):
  1019. """
  1020. Check whether or not the `columns` parameter
  1021. could be converted into a MultiIndex.
  1022. Parameters
  1023. ----------
  1024. columns : array-like
  1025. Object which may or may not be convertible into a MultiIndex
  1026. Returns
  1027. -------
  1028. boolean : Whether or not columns could become a MultiIndex
  1029. """
  1030. return (
  1031. len(columns)
  1032. and not isinstance(columns, MultiIndex)
  1033. and all(isinstance(c, tuple) for c in columns)
  1034. )
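# Illustrative behavior of ``_is_potential_multi_index``:
#
#   >>> _is_potential_multi_index([("a", "b"), ("a", "c")])   # all tuples -> truthy
#   >>> _is_potential_multi_index(["a", "b"])                 # plain labels -> False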
  1035. def _evaluate_usecols(usecols, names):
  1036. """
  1037. Check whether or not the 'usecols' parameter
  1038. is a callable. If so, enumerates the 'names'
  1039. parameter and returns a set of indices for
  1040. each entry in 'names' that evaluates to True.
  1041. If not a callable, returns 'usecols'.
  1042. """
  1043. if callable(usecols):
  1044. return {i for i, name in enumerate(names) if usecols(name)}
  1045. return usecols
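# Illustrative behavior of ``_evaluate_usecols`` with a callable:
#
#   >>> _evaluate_usecols(lambda name: name.startswith("f"), ["foo", "bar", "faz"])
#   ... # -> {0, 2}
#   >>> _evaluate_usecols([0, 2], ["foo", "bar", "faz"])
#   ... # -> [0, 2] (non-callables pass through unchanged)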
  1046. def _validate_usecols_names(usecols, names):
  1047. """
  1048. Validates that all usecols are present in a given
  1049. list of names. If not, raise a ValueError that
  1050. shows what usecols are missing.
  1051. Parameters
  1052. ----------
  1053. usecols : iterable of usecols
  1054. The columns to validate are present in names.
  1055. names : iterable of names
  1056. The column names to check against.
  1057. Returns
  1058. -------
  1059. usecols : iterable of usecols
  1060. The `usecols` parameter if the validation succeeds.
  1061. Raises
  1062. ------
  1063. ValueError : Columns were missing. Error message will list them.
  1064. """
  1065. missing = [c for c in usecols if c not in names]
  1066. if len(missing) > 0:
  1067. raise ValueError(
  1068. f"Usecols do not match columns, columns expected but not found: {missing}"
  1069. )
  1070. return usecols
  1071. def _validate_skipfooter_arg(skipfooter):
  1072. """
  1073. Validate the 'skipfooter' parameter.
  1074. Checks whether 'skipfooter' is a non-negative integer.
  1075. Raises a ValueError if that is not the case.
  1076. Parameters
  1077. ----------
  1078. skipfooter : non-negative integer
  1079. The number of rows to skip at the end of the file.
  1080. Returns
  1081. -------
  1082. validated_skipfooter : non-negative integer
  1083. The original input if the validation succeeds.
  1084. Raises
  1085. ------
  1086. ValueError : 'skipfooter' was not a non-negative integer.
  1087. """
  1088. if not is_integer(skipfooter):
  1089. raise ValueError("skipfooter must be an integer")
  1090. if skipfooter < 0:
  1091. raise ValueError("skipfooter cannot be negative")
  1092. return skipfooter
  1093. def _validate_usecols_arg(usecols):
  1094. """
  1095. Validate the 'usecols' parameter.
  1096. Checks whether or not the 'usecols' parameter contains all integers
  1097. (column selection by index), strings (column by name) or is a callable.
  1098. Raises a ValueError if that is not the case.
  1099. Parameters
  1100. ----------
  1101. usecols : list-like, callable, or None
  1102. List of columns to use when parsing or a callable that can be used
  1103. to filter a list of table columns.
  1104. Returns
  1105. -------
  1106. usecols_tuple : tuple
  1107. A tuple of (verified_usecols, usecols_dtype).
  1108. 'verified_usecols' is either a set if an array-like is passed in or
  1109. 'usecols' if a callable or None is passed in.
  1110. 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
  1111. is passed in or None if a callable or None is passed in.
  1112. """
  1113. msg = (
  1114. "'usecols' must either be list-like of all strings, all unicode, "
  1115. "all integers or a callable."
  1116. )
  1117. if usecols is not None:
  1118. if callable(usecols):
  1119. return usecols, None
  1120. if not is_list_like(usecols):
  1121. # see gh-20529
  1122. #
  1123. # Ensure it is iterable container but not string.
  1124. raise ValueError(msg)
  1125. usecols_dtype = lib.infer_dtype(usecols, skipna=False)
  1126. if usecols_dtype not in ("empty", "integer", "string"):
  1127. raise ValueError(msg)
  1128. usecols = set(usecols)
  1129. return usecols, usecols_dtype
  1130. return usecols, None
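# Illustrative behavior of ``_validate_usecols_arg``:
#
#   >>> _validate_usecols_arg(["foo", "bar"])   # -> ({'foo', 'bar'}, 'string')
#   >>> _validate_usecols_arg([0, 1, 2])        # -> ({0, 1, 2}, 'integer')
#   >>> _validate_usecols_arg(["foo", 1])       # raises ValueError (mixed types)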
  1131. def _validate_parse_dates_arg(parse_dates):
  1132. """
  1133. Check whether or not the 'parse_dates' parameter
  1134. is a non-boolean scalar. Raises a ValueError if
  1135. that is the case.
  1136. """
  1137. msg = (
  1138. "Only booleans, lists, and dictionaries are accepted "
  1139. "for the 'parse_dates' parameter"
  1140. )
  1141. if parse_dates is not None:
  1142. if is_scalar(parse_dates):
  1143. if not lib.is_bool(parse_dates):
  1144. raise TypeError(msg)
  1145. elif not isinstance(parse_dates, (list, dict)):
  1146. raise TypeError(msg)
  1147. return parse_dates
  1148. class ParserBase:
  1149. def __init__(self, kwds):
  1150. self.names = kwds.get("names")
  1151. self.orig_names = None
  1152. self.prefix = kwds.pop("prefix", None)
  1153. self.index_col = kwds.get("index_col", None)
  1154. self.unnamed_cols = set()
  1155. self.index_names = None
  1156. self.col_names = None
  1157. self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
  1158. self.date_parser = kwds.pop("date_parser", None)
  1159. self.dayfirst = kwds.pop("dayfirst", False)
  1160. self.keep_date_col = kwds.pop("keep_date_col", False)
  1161. self.na_values = kwds.get("na_values")
  1162. self.na_fvalues = kwds.get("na_fvalues")
  1163. self.na_filter = kwds.get("na_filter", False)
  1164. self.keep_default_na = kwds.get("keep_default_na", True)
  1165. self.true_values = kwds.get("true_values")
  1166. self.false_values = kwds.get("false_values")
  1167. self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
  1168. self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
  1169. self.cache_dates = kwds.pop("cache_dates", True)
  1170. self._date_conv = _make_date_converter(
  1171. date_parser=self.date_parser,
  1172. dayfirst=self.dayfirst,
  1173. infer_datetime_format=self.infer_datetime_format,
  1174. cache_dates=self.cache_dates,
  1175. )
  1176. # validate header options for mi
  1177. self.header = kwds.get("header")
  1178. if isinstance(self.header, (list, tuple, np.ndarray)):
  1179. if not all(map(is_integer, self.header)):
  1180. raise ValueError("header must be integer or list of integers")
  1181. if any(i < 0 for i in self.header):
  1182. raise ValueError(
  1183. "cannot specify multi-index header with negative integers"
  1184. )
  1185. if kwds.get("usecols"):
  1186. raise ValueError(
  1187. "cannot specify usecols when specifying a multi-index header"
  1188. )
  1189. if kwds.get("names"):
  1190. raise ValueError(
  1191. "cannot specify names when specifying a multi-index header"
  1192. )
  1193. # validate index_col that only contains integers
  1194. if self.index_col is not None:
  1195. is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
  1196. if not (
  1197. is_sequence
  1198. and all(map(is_integer, self.index_col))
  1199. or is_integer(self.index_col)
  1200. ):
  1201. raise ValueError(
  1202. "index_col must only contain row numbers "
  1203. "when specifying a multi-index header"
  1204. )
  1205. elif self.header is not None:
  1206. # GH 27394
  1207. if self.prefix is not None:
  1208. raise ValueError(
  1209. "Argument prefix must be None if argument header is not None"
  1210. )
  1211. # GH 16338
  1212. elif not is_integer(self.header):
  1213. raise ValueError("header must be integer or list of integers")
  1214. # GH 27779
  1215. elif self.header < 0:
  1216. raise ValueError(
  1217. "Passing negative integer to header is invalid. "
  1218. "For no header, use header=None instead"
  1219. )
  1220. self._name_processed = False
  1221. self._first_chunk = True
  1222. # GH 13932
  1223. # keep references to file handles opened by the parser itself
  1224. self.handles = []
  1225. def _validate_parse_dates_presence(self, columns: List[str]) -> None:
  1226. """
  1227. Check if parse_dates are in columns.
  1228. If user has provided names for parse_dates, check if those columns
  1229. are available.
  1230. Parameters
  1231. ----------
  1232. columns : list
  1233. List of names of the dataframe.
  1234. Raises
  1235. ------
  1236. ValueError
  1237. If column to parse_date is not in dataframe.
  1238. """
  1239. cols_needed: Iterable
  1240. if is_dict_like(self.parse_dates):
  1241. cols_needed = itertools.chain(*self.parse_dates.values())
  1242. elif is_list_like(self.parse_dates):
  1243. # a column in parse_dates could be represented
  1244. # ColReference = Union[int, str]
  1245. # DateGroups = List[ColReference]
  1246. # ParseDates = Union[DateGroups, List[DateGroups],
  1247. # Dict[ColReference, DateGroups]]
  1248. cols_needed = itertools.chain.from_iterable(
  1249. col if is_list_like(col) else [col] for col in self.parse_dates
  1250. )
  1251. else:
  1252. cols_needed = []
  1253. # get only columns that are references using names (str), not by index
  1254. missing_cols = ", ".join(
  1255. sorted(
  1256. {
  1257. col
  1258. for col in cols_needed
  1259. if isinstance(col, str) and col not in columns
  1260. }
  1261. )
  1262. )
  1263. if missing_cols:
  1264. raise ValueError(
  1265. f"Missing column provided to 'parse_dates': '{missing_cols}'"
  1266. )
  1267. def close(self):
  1268. for f in self.handles:
  1269. f.close()
  1270. @property
  1271. def _has_complex_date_col(self):
  1272. return isinstance(self.parse_dates, dict) or (
  1273. isinstance(self.parse_dates, list)
  1274. and len(self.parse_dates) > 0
  1275. and isinstance(self.parse_dates[0], list)
  1276. )
  1277. def _should_parse_dates(self, i):
  1278. if isinstance(self.parse_dates, bool):
  1279. return self.parse_dates
  1280. else:
  1281. if self.index_names is not None:
  1282. name = self.index_names[i]
  1283. else:
  1284. name = None
  1285. j = self.index_col[i]
  1286. if is_scalar(self.parse_dates):
  1287. return (j == self.parse_dates) or (
  1288. name is not None and name == self.parse_dates
  1289. )
  1290. else:
  1291. return (j in self.parse_dates) or (
  1292. name is not None and name in self.parse_dates
  1293. )
  1294. def _extract_multi_indexer_columns(
  1295. self, header, index_names, col_names, passed_names=False
  1296. ):
  1297. """
  1298. extract and return the names, index_names, col_names
  1299. header is a list-of-lists returned from the parsers
  1300. """
  1301. if len(header) < 2:
  1302. return header[0], index_names, col_names, passed_names
  1303. # the names are the tuples of the header that are not the index cols
  1304. # 0 is the name of the index, assuming index_col is a list of column
  1305. # numbers
  1306. ic = self.index_col
  1307. if ic is None:
  1308. ic = []
  1309. if not isinstance(ic, (list, tuple, np.ndarray)):
  1310. ic = [ic]
  1311. sic = set(ic)
  1312. # clean the index_names
  1313. index_names = header.pop(-1)
  1314. index_names, names, index_col = _clean_index_names(
  1315. index_names, self.index_col, self.unnamed_cols
  1316. )
  1317. # extract the columns
  1318. field_count = len(header[0])
  1319. def extract(r):
  1320. return tuple(r[i] for i in range(field_count) if i not in sic)
  1321. columns = list(zip(*(extract(r) for r in header)))
  1322. names = ic + columns
  1323. # If we find unnamed columns all in a single
  1324. # level, then our header was too long.
  1325. for n in range(len(columns[0])):
  1326. if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
  1327. header = ",".join(str(x) for x in self.header)
  1328. raise ParserError(
  1329. f"Passed header=[{header}] are too many rows "
  1330. "for this multi_index of columns"
  1331. )
  1332. # Clean the column names (if we have an index_col).
  1333. if len(ic):
  1334. col_names = [
  1335. r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None
  1336. for r in header
  1337. ]
  1338. else:
  1339. col_names = [None] * len(header)
  1340. passed_names = True
  1341. return names, index_names, col_names, passed_names
  1342. def _maybe_dedup_names(self, names):
  1343. # see gh-7160 and gh-9424: this helps to provide
  1344. # immediate alleviation of the duplicate names
  1345. # issue and appears to be satisfactory to users,
  1346. # but ultimately, not needing to butcher the names
  1347. # would be nice!
  1348. if self.mangle_dupe_cols:
  1349. names = list(names) # so we can index
  1350. counts = defaultdict(int)
  1351. is_potential_mi = _is_potential_multi_index(names)
  1352. for i, col in enumerate(names):
  1353. cur_count = counts[col]
  1354. while cur_count > 0:
  1355. counts[col] = cur_count + 1
  1356. if is_potential_mi:
  1357. col = col[:-1] + (f"{col[-1]}.{cur_count}",)
  1358. else:
  1359. col = f"{col}.{cur_count}"
  1360. cur_count = counts[col]
  1361. names[i] = col
  1362. counts[col] = cur_count + 1
  1363. return names
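# Illustrative deduplication performed by ``_maybe_dedup_names`` when
# ``mangle_dupe_cols=True`` (the default):
#
#   ['X', 'X', 'X']           -> ['X', 'X.1', 'X.2']
#   [('a', 'x'), ('a', 'x')]  -> [('a', 'x'), ('a', 'x.1')]  (tuples mangled on the last level)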
  1364. def _maybe_make_multi_index_columns(self, columns, col_names=None):
  1365. # possibly create a column mi here
  1366. if _is_potential_multi_index(columns):
  1367. columns = MultiIndex.from_tuples(columns, names=col_names)
  1368. return columns
  1369. def _make_index(self, data, alldata, columns, indexnamerow=False):
  1370. if not _is_index_col(self.index_col) or not self.index_col:
  1371. index = None
  1372. elif not self._has_complex_date_col:
  1373. index = self._get_simple_index(alldata, columns)
  1374. index = self._agg_index(index)
  1375. elif self._has_complex_date_col:
  1376. if not self._name_processed:
  1377. (self.index_names, _, self.index_col) = _clean_index_names(
  1378. list(columns), self.index_col, self.unnamed_cols
  1379. )
  1380. self._name_processed = True
  1381. index = self._get_complex_date_index(data, columns)
  1382. index = self._agg_index(index, try_parse_dates=False)
  1383. # add names for the index
  1384. if indexnamerow:
  1385. coffset = len(indexnamerow) - len(columns)
  1386. index = index.set_names(indexnamerow[:coffset])
  1387. # maybe create a mi on the columns
  1388. columns = self._maybe_make_multi_index_columns(columns, self.col_names)
  1389. return index, columns
  1390. _implicit_index = False
  1391. def _get_simple_index(self, data, columns):
  1392. def ix(col):
  1393. if not isinstance(col, str):
  1394. return col
  1395. raise ValueError(f"Index {col} invalid")
  1396. to_remove = []
  1397. index = []
  1398. for idx in self.index_col:
  1399. i = ix(idx)
  1400. to_remove.append(i)
  1401. index.append(data[i])
  1402. # remove index items from content and columns, don't pop in
  1403. # loop
  1404. for i in sorted(to_remove, reverse=True):
  1405. data.pop(i)
  1406. if not self._implicit_index:
  1407. columns.pop(i)
  1408. return index
  1409. def _get_complex_date_index(self, data, col_names):
  1410. def _get_name(icol):
  1411. if isinstance(icol, str):
  1412. return icol
  1413. if col_names is None:
  1414. raise ValueError(f"Must supply column order to use {icol!s} as index")
  1415. for i, c in enumerate(col_names):
  1416. if i == icol:
  1417. return c
  1418. to_remove = []
  1419. index = []
  1420. for idx in self.index_col:
  1421. name = _get_name(idx)
  1422. to_remove.append(name)
  1423. index.append(data[name])
  1424. # remove index items from content and columns, don't pop in
  1425. # loop
  1426. for c in sorted(to_remove, reverse=True):
  1427. data.pop(c)
  1428. col_names.remove(c)
  1429. return index
  1430. def _agg_index(self, index, try_parse_dates=True):
  1431. arrays = []
  1432. for i, arr in enumerate(index):
  1433. if try_parse_dates and self._should_parse_dates(i):
  1434. arr = self._date_conv(arr)
  1435. if self.na_filter:
  1436. col_na_values = self.na_values
  1437. col_na_fvalues = self.na_fvalues
  1438. else:
  1439. col_na_values = set()
  1440. col_na_fvalues = set()
  1441. if isinstance(self.na_values, dict):
  1442. col_name = self.index_names[i]
  1443. if col_name is not None:
  1444. col_na_values, col_na_fvalues = _get_na_values(
  1445. col_name, self.na_values, self.na_fvalues, self.keep_default_na
  1446. )
  1447. arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
  1448. arrays.append(arr)
  1449. names = self.index_names
  1450. index = ensure_index_from_sequences(arrays, names)
  1451. return index
  1452. def _convert_to_ndarrays(
  1453. self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
  1454. ):
  1455. result = {}
  1456. for c, values in dct.items():
  1457. conv_f = None if converters is None else converters.get(c, None)
  1458. if isinstance(dtypes, dict):
  1459. cast_type = dtypes.get(c, None)
  1460. else:
  1461. # single dtype or None
  1462. cast_type = dtypes
  1463. if self.na_filter:
  1464. col_na_values, col_na_fvalues = _get_na_values(
  1465. c, na_values, na_fvalues, self.keep_default_na
  1466. )
  1467. else:
  1468. col_na_values, col_na_fvalues = set(), set()
  1469. if conv_f is not None:
  1470. # conv_f applied to data before inference
  1471. if cast_type is not None:
  1472. warnings.warn(
  1473. (
  1474. "Both a converter and dtype were specified "
  1475. f"for column {c} - only the converter will be used"
  1476. ),
  1477. ParserWarning,
  1478. stacklevel=7,
  1479. )
  1480. try:
  1481. values = lib.map_infer(values, conv_f)
  1482. except ValueError:
  1483. mask = algorithms.isin(values, list(na_values)).view(np.uint8)
  1484. values = lib.map_infer_mask(values, conv_f, mask)
  1485. cvals, na_count = self._infer_types(
  1486. values, set(col_na_values) | col_na_fvalues, try_num_bool=False
  1487. )
  1488. else:
  1489. is_str_or_ea_dtype = is_string_dtype(
  1490. cast_type
  1491. ) or is_extension_array_dtype(cast_type)
  1492. # skip inference if specified dtype is object
  1493. # or casting to an EA
  1494. try_num_bool = not (cast_type and is_str_or_ea_dtype)
  1495. # general type inference and conversion
  1496. cvals, na_count = self._infer_types(
  1497. values, set(col_na_values) | col_na_fvalues, try_num_bool
  1498. )
  1499. # type specified in dtype param or cast_type is an EA
  1500. if cast_type and (
  1501. not is_dtype_equal(cvals, cast_type)
  1502. or is_extension_array_dtype(cast_type)
  1503. ):
  1504. try:
  1505. if (
  1506. is_bool_dtype(cast_type)
  1507. and not is_categorical_dtype(cast_type)
  1508. and na_count > 0
  1509. ):
  1510. raise ValueError(f"Bool column has NA values in column {c}")
  1511. except (AttributeError, TypeError):
  1512. # invalid input to is_bool_dtype
  1513. pass
  1514. cvals = self._cast_types(cvals, cast_type, c)
  1515. result[c] = cvals
  1516. if verbose and na_count:
  1517. print(f"Filled {na_count} NA values in column {c!s}")
  1518. return result
  1519. def _infer_types(self, values, na_values, try_num_bool=True):
  1520. """
  1521. Infer types of values, possibly casting
  1522. Parameters
  1523. ----------
  1524. values : ndarray
  1525. na_values : set
1526. try_num_bool : bool, default True
  1527. try to cast values to numeric (first preference) or boolean
  1528. Returns
  1529. -------
  1530. converted : ndarray
  1531. na_count : int
  1532. """
  1533. na_count = 0
  1534. if issubclass(values.dtype.type, (np.number, np.bool_)):
  1535. mask = algorithms.isin(values, list(na_values))
  1536. na_count = mask.sum()
  1537. if na_count > 0:
  1538. if is_integer_dtype(values):
  1539. values = values.astype(np.float64)
  1540. np.putmask(values, mask, np.nan)
  1541. return values, na_count
  1542. if try_num_bool and is_object_dtype(values.dtype):
  1543. # exclude e.g DatetimeIndex here
  1544. try:
  1545. result = lib.maybe_convert_numeric(values, na_values, False)
  1546. except (ValueError, TypeError):
  1547. # e.g. encountering datetime string gets ValueError
  1548. # TypeError can be raised in floatify
  1549. result = values
  1550. na_count = parsers.sanitize_objects(result, na_values, False)
  1551. else:
  1552. na_count = isna(result).sum()
  1553. else:
  1554. result = values
  1555. if values.dtype == np.object_:
  1556. na_count = parsers.sanitize_objects(values, na_values, False)
  1557. if result.dtype == np.object_ and try_num_bool:
  1558. result = libops.maybe_convert_bool(
  1559. np.asarray(values),
  1560. true_values=self.true_values,
  1561. false_values=self.false_values,
  1562. )
  1563. return result, na_count
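# Rough illustration (hedged sketch): an object column ["1", "2", "N/A"] with
# na_values={"N/A"} would typically convert to array([1.0, 2.0, nan]) with
# na_count == 1; strings that cannot be converted stay as objects and, when
# try_num_bool is set, are offered to the boolean converter together with
# self.true_values / self.false_values.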
  1564. def _cast_types(self, values, cast_type, column):
  1565. """
  1566. Cast values to specified type
  1567. Parameters
  1568. ----------
  1569. values : ndarray
  1570. cast_type : string or np.dtype
  1571. dtype to cast values to
  1572. column : string
  1573. column name - used only for error reporting
  1574. Returns
  1575. -------
  1576. converted : ndarray
  1577. """
  1578. if is_categorical_dtype(cast_type):
  1579. known_cats = (
  1580. isinstance(cast_type, CategoricalDtype)
  1581. and cast_type.categories is not None
  1582. )
  1583. if not is_object_dtype(values) and not known_cats:
  1584. # TODO: this is for consistency with
  1585. # c-parser which parses all categories
  1586. # as strings
  1587. values = astype_nansafe(values, str)
  1588. cats = Index(values).unique().dropna()
  1589. values = Categorical._from_inferred_categories(
  1590. cats, cats.get_indexer(values), cast_type, true_values=self.true_values
  1591. )
  1592. # use the EA's implementation of casting
  1593. elif is_extension_array_dtype(cast_type):
  1594. # ensure cast_type is an actual dtype and not a string
  1595. cast_type = pandas_dtype(cast_type)
  1596. array_type = cast_type.construct_array_type()
  1597. try:
  1598. return array_type._from_sequence_of_strings(values, dtype=cast_type)
  1599. except NotImplementedError as err:
  1600. raise NotImplementedError(
  1601. f"Extension Array: {array_type} must implement "
  1602. "_from_sequence_of_strings in order to be used in parser methods"
  1603. ) from err
  1604. else:
  1605. try:
  1606. values = astype_nansafe(values, cast_type, copy=True, skipna=True)
  1607. except ValueError as err:
  1608. raise ValueError(
  1609. f"Unable to convert column {column} to type {cast_type}"
  1610. ) from err
  1611. return values
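# Rough illustration (hedged sketch): cast_type="category" builds a Categorical
# from the unique parsed strings, an extension dtype such as "Int64" is routed
# through that array type's _from_sequence_of_strings, and any other dtype
# falls back to astype_nansafe, raising a ValueError naming the column if the
# cast fails.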
  1612. def _do_date_conversions(self, names, data):
  1613. # returns data, columns
  1614. if self.parse_dates is not None:
  1615. data, names = _process_date_conversion(
  1616. data,
  1617. self._date_conv,
  1618. self.parse_dates,
  1619. self.index_col,
  1620. self.index_names,
  1621. names,
  1622. keep_date_col=self.keep_date_col,
  1623. )
  1624. return names, data
  1625. class CParserWrapper(ParserBase):
  1626. """
  1627. """
  1628. def __init__(self, src, **kwds):
  1629. self.kwds = kwds
  1630. kwds = kwds.copy()
  1631. ParserBase.__init__(self, kwds)
  1632. encoding = kwds.get("encoding")
  1633. if kwds.get("compression") is None and encoding:
  1634. if isinstance(src, str):
  1635. src = open(src, "rb")
  1636. self.handles.append(src)
  1637. # Handle the file object with universal line mode enabled.
  1638. # We will handle the newline character ourselves later on.
  1639. if hasattr(src, "read") and not hasattr(src, "encoding"):
  1640. src = TextIOWrapper(src, encoding=encoding, newline="")
  1641. kwds["encoding"] = "utf-8"
  1642. # #2442
  1643. kwds["allow_leading_cols"] = self.index_col is not False
  1644. # GH20529, validate usecol arg before TextReader
  1645. self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
  1646. kwds["usecols"] = self.usecols
  1647. self._reader = parsers.TextReader(src, **kwds)
  1648. self.unnamed_cols = self._reader.unnamed_cols
  1649. passed_names = self.names is None
  1650. if self._reader.header is None:
  1651. self.names = None
  1652. else:
  1653. if len(self._reader.header) > 1:
  1654. # we have a multi index in the columns
  1655. (
  1656. self.names,
  1657. self.index_names,
  1658. self.col_names,
  1659. passed_names,
  1660. ) = self._extract_multi_indexer_columns(
  1661. self._reader.header, self.index_names, self.col_names, passed_names
  1662. )
  1663. else:
  1664. self.names = list(self._reader.header[0])
  1665. if self.names is None:
  1666. if self.prefix:
  1667. self.names = [
  1668. f"{self.prefix}{i}" for i in range(self._reader.table_width)
  1669. ]
  1670. else:
  1671. self.names = list(range(self._reader.table_width))
  1672. # gh-9755
  1673. #
  1674. # need to set orig_names here first
  1675. # so that proper indexing can be done
  1676. # with _set_noconvert_columns
  1677. #
  1678. # once names has been filtered, we will
  1679. # then set orig_names again to names
  1680. self.orig_names = self.names[:]
  1681. if self.usecols:
  1682. usecols = _evaluate_usecols(self.usecols, self.orig_names)
  1683. # GH 14671
  1684. if self.usecols_dtype == "string" and not set(usecols).issubset(
  1685. self.orig_names
  1686. ):
  1687. _validate_usecols_names(usecols, self.orig_names)
  1688. if len(self.names) > len(usecols):
  1689. self.names = [
  1690. n
  1691. for i, n in enumerate(self.names)
  1692. if (i in usecols or n in usecols)
  1693. ]
  1694. if len(self.names) < len(usecols):
  1695. _validate_usecols_names(usecols, self.names)
  1696. self._validate_parse_dates_presence(self.names)
  1697. self._set_noconvert_columns()
  1698. self.orig_names = self.names
  1699. if not self._has_complex_date_col:
  1700. if self._reader.leading_cols == 0 and _is_index_col(self.index_col):
  1701. self._name_processed = True
  1702. (index_names, self.names, self.index_col) = _clean_index_names(
  1703. self.names, self.index_col, self.unnamed_cols
  1704. )
  1705. if self.index_names is None:
  1706. self.index_names = index_names
  1707. if self._reader.header is None and not passed_names:
  1708. self.index_names = [None] * len(self.index_names)
  1709. self._implicit_index = self._reader.leading_cols > 0
  1710. def close(self):
  1711. for f in self.handles:
  1712. f.close()
  1713. # close additional handles opened by C parser (for compression)
  1714. try:
  1715. self._reader.close()
  1716. except ValueError:
  1717. pass
  1718. def _set_noconvert_columns(self):
  1719. """
  1720. Set the columns that should not undergo dtype conversions.
  1721. Currently, any column that is involved with date parsing will not
  1722. undergo such conversions.
  1723. """
  1724. names = self.orig_names
  1725. if self.usecols_dtype == "integer":
  1726. # A set of integers will be converted to a list in
  1727. # the correct order every single time.
  1728. usecols = list(self.usecols)
  1729. usecols.sort()
  1730. elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
  1731. # The names attribute should have the correct columns
  1732. # in the proper order for indexing with parse_dates.
  1733. usecols = self.names[:]
  1734. else:
  1735. # Usecols is empty.
  1736. usecols = None
  1737. def _set(x):
  1738. if usecols is not None and is_integer(x):
  1739. x = usecols[x]
  1740. if not is_integer(x):
  1741. x = names.index(x)
  1742. self._reader.set_noconvert(x)
  1743. if isinstance(self.parse_dates, list):
  1744. for val in self.parse_dates:
  1745. if isinstance(val, list):
  1746. for k in val:
  1747. _set(k)
  1748. else:
  1749. _set(val)
  1750. elif isinstance(self.parse_dates, dict):
  1751. for val in self.parse_dates.values():
  1752. if isinstance(val, list):
  1753. for k in val:
  1754. _set(k)
  1755. else:
  1756. _set(val)
  1757. elif self.parse_dates:
  1758. if isinstance(self.index_col, list):
  1759. for k in self.index_col:
  1760. _set(k)
  1761. elif self.index_col is not None:
  1762. _set(self.index_col)
  1763. def set_error_bad_lines(self, status):
  1764. self._reader.set_error_bad_lines(int(status))
  1765. def read(self, nrows=None):
  1766. try:
  1767. data = self._reader.read(nrows)
  1768. except StopIteration:
  1769. if self._first_chunk:
  1770. self._first_chunk = False
  1771. names = self._maybe_dedup_names(self.orig_names)
  1772. index, columns, col_dict = _get_empty_meta(
  1773. names,
  1774. self.index_col,
  1775. self.index_names,
  1776. dtype=self.kwds.get("dtype"),
  1777. )
  1778. columns = self._maybe_make_multi_index_columns(columns, self.col_names)
  1779. if self.usecols is not None:
  1780. columns = self._filter_usecols(columns)
  1781. col_dict = dict(
  1782. filter(lambda item: item[0] in columns, col_dict.items())
  1783. )
  1784. return index, columns, col_dict
  1785. else:
  1786. raise
  1787. # Done with first read, next time raise StopIteration
  1788. self._first_chunk = False
  1789. names = self.names
  1790. if self._reader.leading_cols:
  1791. if self._has_complex_date_col:
  1792. raise NotImplementedError("file structure not yet supported")
  1793. # implicit index, no index names
  1794. arrays = []
  1795. for i in range(self._reader.leading_cols):
  1796. if self.index_col is None:
  1797. values = data.pop(i)
  1798. else:
  1799. values = data.pop(self.index_col[i])
  1800. values = self._maybe_parse_dates(values, i, try_parse_dates=True)
  1801. arrays.append(values)
  1802. index = ensure_index_from_sequences(arrays)
  1803. if self.usecols is not None:
  1804. names = self._filter_usecols(names)
  1805. names = self._maybe_dedup_names(names)
  1806. # rename dict keys
  1807. data = sorted(data.items())
  1808. data = {k: v for k, (i, v) in zip(names, data)}
  1809. names, data = self._do_date_conversions(names, data)
  1810. else:
  1811. # rename dict keys
  1812. data = sorted(data.items())
  1813. # ugh, mutation
  1814. names = list(self.orig_names)
  1815. names = self._maybe_dedup_names(names)
  1816. if self.usecols is not None:
  1817. names = self._filter_usecols(names)
  1818. # columns as list
  1819. alldata = [x[1] for x in data]
  1820. data = {k: v for k, (i, v) in zip(names, data)}
  1821. names, data = self._do_date_conversions(names, data)
  1822. index, names = self._make_index(data, alldata, names)
  1823. # maybe create a mi on the columns
  1824. names = self._maybe_make_multi_index_columns(names, self.col_names)
  1825. return index, names, data
  1826. def _filter_usecols(self, names):
  1827. # hackish
  1828. usecols = _evaluate_usecols(self.usecols, names)
  1829. if usecols is not None and len(names) != len(usecols):
  1830. names = [
  1831. name for i, name in enumerate(names) if i in usecols or name in usecols
  1832. ]
  1833. return names
  1834. def _get_index_names(self):
  1835. names = list(self._reader.header[0])
  1836. idx_names = None
  1837. if self._reader.leading_cols == 0 and self.index_col is not None:
  1838. (idx_names, names, self.index_col) = _clean_index_names(
  1839. names, self.index_col, self.unnamed_cols
  1840. )
  1841. return names, idx_names
  1842. def _maybe_parse_dates(self, values, index, try_parse_dates=True):
  1843. if try_parse_dates and self._should_parse_dates(index):
  1844. values = self._date_conv(values)
  1845. return values
  1846. def TextParser(*args, **kwds):
  1847. """
  1848. Converts lists of lists/tuples into DataFrames with proper type inference
  1849. and optional (e.g. string to datetime) conversion. Also enables iterating
  1850. lazily over chunks of large files
  1851. Parameters
  1852. ----------
  1853. data : file-like object or list
  1854. delimiter : separator character to use
  1855. dialect : str or csv.Dialect instance, optional
  1856. Ignored if delimiter is longer than 1 character
1857. names : sequence, default None
  1858. header : int, default 0
  1859. Row to use to parse column labels. Defaults to the first row. Prior
  1860. rows will be discarded
  1861. index_col : int or list, optional
  1862. Column or columns to use as the (possibly hierarchical) index
  1863. has_index_names: bool, default False
  1864. True if the cols defined in index_col have an index name and are
  1865. not in the header.
  1866. na_values : scalar, str, list-like, or dict, optional
  1867. Additional strings to recognize as NA/NaN.
  1868. keep_default_na : bool, default True
  1869. thousands : str, optional
  1870. Thousands separator
  1871. comment : str, optional
  1872. Comment out remainder of line
  1873. parse_dates : bool, default False
  1874. keep_date_col : bool, default False
  1875. date_parser : function, optional
  1876. skiprows : list of integers
  1877. Row numbers to skip
  1878. skipfooter : int
1879. Number of lines at the bottom of the file to skip
  1880. converters : dict, optional
  1881. Dict of functions for converting values in certain columns. Keys can
  1882. either be integers or column labels, values are functions that take one
  1883. input argument, the cell (not column) content, and return the
  1884. transformed content.
  1885. encoding : str, optional
  1886. Encoding to use for UTF when reading/writing (ex. 'utf-8')
  1887. squeeze : bool, default False
  1888. returns Series if only one column.
  1889. infer_datetime_format: bool, default False
  1890. If True and `parse_dates` is True for a column, try to infer the
  1891. datetime format based on the first datetime string. If the format
  1892. can be inferred, there often will be a large parsing speed-up.
  1893. float_precision : str, optional
  1894. Specifies which converter the C engine should use for floating-point
  1895. values. The options are None for the ordinary converter,
  1896. 'high' for the high-precision converter, and 'round_trip' for the
  1897. round-trip converter.
  1898. """
  1899. kwds["engine"] = "python"
  1900. return TextFileReader(*args, **kwds)
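# Usage sketch (hedged, illustrative values only):
#   data = [["a", "b"], ["1", "2"], ["3", "4"]]
#   reader = TextParser(data, header=0)
#   df = reader.read()   # DataFrame with columns "a", "b" and numeric values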
  1901. def count_empty_vals(vals):
  1902. return sum(1 for v in vals if v == "" or v is None)
  1903. class PythonParser(ParserBase):
  1904. def __init__(self, f, **kwds):
  1905. """
1906. Workhorse for processing a nested list into a DataFrame.
  1907. """
  1908. ParserBase.__init__(self, kwds)
  1909. self.data = None
  1910. self.buf = []
  1911. self.pos = 0
  1912. self.line_pos = 0
  1913. self.encoding = kwds["encoding"]
  1914. self.compression = kwds["compression"]
  1915. self.memory_map = kwds["memory_map"]
  1916. self.skiprows = kwds["skiprows"]
  1917. if callable(self.skiprows):
  1918. self.skipfunc = self.skiprows
  1919. else:
  1920. self.skipfunc = lambda x: x in self.skiprows
  1921. self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
  1922. self.delimiter = kwds["delimiter"]
  1923. self.quotechar = kwds["quotechar"]
  1924. if isinstance(self.quotechar, str):
  1925. self.quotechar = str(self.quotechar)
  1926. self.escapechar = kwds["escapechar"]
  1927. self.doublequote = kwds["doublequote"]
  1928. self.skipinitialspace = kwds["skipinitialspace"]
  1929. self.lineterminator = kwds["lineterminator"]
  1930. self.quoting = kwds["quoting"]
  1931. self.usecols, _ = _validate_usecols_arg(kwds["usecols"])
  1932. self.skip_blank_lines = kwds["skip_blank_lines"]
  1933. self.warn_bad_lines = kwds["warn_bad_lines"]
  1934. self.error_bad_lines = kwds["error_bad_lines"]
  1935. self.names_passed = kwds["names"] or None
  1936. self.has_index_names = False
  1937. if "has_index_names" in kwds:
  1938. self.has_index_names = kwds["has_index_names"]
  1939. self.verbose = kwds["verbose"]
  1940. self.converters = kwds["converters"]
  1941. self.dtype = kwds["dtype"]
  1942. self.thousands = kwds["thousands"]
  1943. self.decimal = kwds["decimal"]
  1944. self.comment = kwds["comment"]
  1945. self._comment_lines = []
  1946. f, handles = get_handle(
  1947. f,
  1948. "r",
  1949. encoding=self.encoding,
  1950. compression=self.compression,
  1951. memory_map=self.memory_map,
  1952. )
  1953. self.handles.extend(handles)
  1954. # Set self.data to something that can read lines.
  1955. if hasattr(f, "readline"):
  1956. self._make_reader(f)
  1957. else:
  1958. self.data = f
  1959. # Get columns in two steps: infer from data, then
  1960. # infer column indices from self.usecols if it is specified.
  1961. self._col_indices = None
  1962. try:
  1963. (
  1964. self.columns,
  1965. self.num_original_columns,
  1966. self.unnamed_cols,
  1967. ) = self._infer_columns()
  1968. except (TypeError, ValueError):
  1969. self.close()
  1970. raise
  1971. # Now self.columns has the set of columns that we will process.
  1972. # The original set is stored in self.original_columns.
  1973. if len(self.columns) > 1:
  1974. # we are processing a multi index column
  1975. (
  1976. self.columns,
  1977. self.index_names,
  1978. self.col_names,
  1979. _,
  1980. ) = self._extract_multi_indexer_columns(
  1981. self.columns, self.index_names, self.col_names
  1982. )
  1983. # Update list of original names to include all indices.
  1984. self.num_original_columns = len(self.columns)
  1985. else:
  1986. self.columns = self.columns[0]
  1987. # get popped off for index
  1988. self.orig_names = list(self.columns)
  1989. # needs to be cleaned/refactored
  1990. # multiple date column thing turning into a real spaghetti factory
  1991. if not self._has_complex_date_col:
  1992. (index_names, self.orig_names, self.columns) = self._get_index_name(
  1993. self.columns
  1994. )
  1995. self._name_processed = True
  1996. if self.index_names is None:
  1997. self.index_names = index_names
  1998. self._validate_parse_dates_presence(self.columns)
  1999. if self.parse_dates:
  2000. self._no_thousands_columns = self._set_no_thousands_columns()
  2001. else:
  2002. self._no_thousands_columns = None
  2003. if len(self.decimal) != 1:
  2004. raise ValueError("Only length-1 decimal markers supported")
  2005. if self.thousands is None:
  2006. self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+")
  2007. else:
  2008. self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+")
  2009. def _set_no_thousands_columns(self):
2010. # Create a set of column ids that are not to be stripped of thousands
2011. # separators.
  2012. noconvert_columns = set()
  2013. def _set(x):
  2014. if is_integer(x):
  2015. noconvert_columns.add(x)
  2016. else:
  2017. noconvert_columns.add(self.columns.index(x))
  2018. if isinstance(self.parse_dates, list):
  2019. for val in self.parse_dates:
  2020. if isinstance(val, list):
  2021. for k in val:
  2022. _set(k)
  2023. else:
  2024. _set(val)
  2025. elif isinstance(self.parse_dates, dict):
  2026. for val in self.parse_dates.values():
  2027. if isinstance(val, list):
  2028. for k in val:
  2029. _set(k)
  2030. else:
  2031. _set(val)
  2032. elif self.parse_dates:
  2033. if isinstance(self.index_col, list):
  2034. for k in self.index_col:
  2035. _set(k)
  2036. elif self.index_col is not None:
  2037. _set(self.index_col)
  2038. return noconvert_columns
  2039. def _make_reader(self, f):
  2040. sep = self.delimiter
  2041. if sep is None or len(sep) == 1:
  2042. if self.lineterminator:
  2043. raise ValueError(
  2044. "Custom line terminators not supported in python parser (yet)"
  2045. )
  2046. class MyDialect(csv.Dialect):
  2047. delimiter = self.delimiter
  2048. quotechar = self.quotechar
  2049. escapechar = self.escapechar
  2050. doublequote = self.doublequote
  2051. skipinitialspace = self.skipinitialspace
  2052. quoting = self.quoting
  2053. lineterminator = "\n"
  2054. dia = MyDialect
  2055. if sep is not None:
  2056. dia.delimiter = sep
  2057. else:
  2058. # attempt to sniff the delimiter from the first valid line,
  2059. # i.e. no comment line and not in skiprows
  2060. line = f.readline()
  2061. lines = self._check_comments([[line]])[0]
  2062. while self.skipfunc(self.pos) or not lines:
  2063. self.pos += 1
  2064. line = f.readline()
  2065. lines = self._check_comments([[line]])[0]
  2066. # since `line` was a string, lines will be a list containing
  2067. # only a single string
  2068. line = lines[0]
  2069. self.pos += 1
  2070. self.line_pos += 1
  2071. sniffed = csv.Sniffer().sniff(line)
  2072. dia.delimiter = sniffed.delimiter
  2073. # Note: self.encoding is irrelevant here
  2074. line_rdr = csv.reader(StringIO(line), dialect=dia)
  2075. self.buf.extend(list(line_rdr))
  2076. # Note: self.encoding is irrelevant here
  2077. reader = csv.reader(f, dialect=dia, strict=True)
  2078. else:
  2079. def _read():
  2080. line = f.readline()
  2081. pat = re.compile(sep)
  2082. yield pat.split(line.strip())
  2083. for line in f:
  2084. yield pat.split(line.strip())
  2085. reader = _read()
  2086. self.data = reader
  2087. def read(self, rows=None):
  2088. try:
  2089. content = self._get_lines(rows)
  2090. except StopIteration:
  2091. if self._first_chunk:
  2092. content = []
  2093. else:
  2094. raise
  2095. # done with first read, next time raise StopIteration
  2096. self._first_chunk = False
  2097. columns = list(self.orig_names)
  2098. if not len(content): # pragma: no cover
  2099. # DataFrame with the right metadata, even though it's length 0
  2100. names = self._maybe_dedup_names(self.orig_names)
  2101. index, columns, col_dict = _get_empty_meta(
  2102. names, self.index_col, self.index_names, self.dtype
  2103. )
  2104. columns = self._maybe_make_multi_index_columns(columns, self.col_names)
  2105. return index, columns, col_dict
  2106. # handle new style for names in index
  2107. count_empty_content_vals = count_empty_vals(content[0])
  2108. indexnamerow = None
  2109. if self.has_index_names and count_empty_content_vals == len(columns):
  2110. indexnamerow = content[0]
  2111. content = content[1:]
  2112. alldata = self._rows_to_cols(content)
  2113. data = self._exclude_implicit_index(alldata)
  2114. columns = self._maybe_dedup_names(self.columns)
  2115. columns, data = self._do_date_conversions(columns, data)
  2116. data = self._convert_data(data)
  2117. index, columns = self._make_index(data, alldata, columns, indexnamerow)
  2118. return index, columns, data
  2119. def _exclude_implicit_index(self, alldata):
  2120. names = self._maybe_dedup_names(self.orig_names)
  2121. if self._implicit_index:
  2122. excl_indices = self.index_col
  2123. data = {}
  2124. offset = 0
  2125. for i, col in enumerate(names):
  2126. while i + offset in excl_indices:
  2127. offset += 1
  2128. data[col] = alldata[i + offset]
  2129. else:
  2130. data = {k: v for k, v in zip(names, alldata)}
  2131. return data
  2132. # legacy
  2133. def get_chunk(self, size=None):
  2134. if size is None:
  2135. size = self.chunksize
  2136. return self.read(rows=size)
  2137. def _convert_data(self, data):
  2138. # apply converters
  2139. def _clean_mapping(mapping):
  2140. """converts col numbers to names"""
  2141. clean = {}
  2142. for col, v in mapping.items():
  2143. if isinstance(col, int) and col not in self.orig_names:
  2144. col = self.orig_names[col]
  2145. clean[col] = v
  2146. return clean
  2147. clean_conv = _clean_mapping(self.converters)
  2148. if not isinstance(self.dtype, dict):
  2149. # handles single dtype applied to all columns
  2150. clean_dtypes = self.dtype
  2151. else:
  2152. clean_dtypes = _clean_mapping(self.dtype)
  2153. # Apply NA values.
  2154. clean_na_values = {}
  2155. clean_na_fvalues = {}
  2156. if isinstance(self.na_values, dict):
  2157. for col in self.na_values:
  2158. na_value = self.na_values[col]
  2159. na_fvalue = self.na_fvalues[col]
  2160. if isinstance(col, int) and col not in self.orig_names:
  2161. col = self.orig_names[col]
  2162. clean_na_values[col] = na_value
  2163. clean_na_fvalues[col] = na_fvalue
  2164. else:
  2165. clean_na_values = self.na_values
  2166. clean_na_fvalues = self.na_fvalues
  2167. return self._convert_to_ndarrays(
  2168. data,
  2169. clean_na_values,
  2170. clean_na_fvalues,
  2171. self.verbose,
  2172. clean_conv,
  2173. clean_dtypes,
  2174. )
  2175. def _infer_columns(self):
  2176. names = self.names
  2177. num_original_columns = 0
  2178. clear_buffer = True
  2179. unnamed_cols = set()
  2180. if self.header is not None:
  2181. header = self.header
  2182. if isinstance(header, (list, tuple, np.ndarray)):
  2183. have_mi_columns = len(header) > 1
  2184. # we have a mi columns, so read an extra line
  2185. if have_mi_columns:
  2186. header = list(header) + [header[-1] + 1]
  2187. else:
  2188. have_mi_columns = False
  2189. header = [header]
  2190. columns = []
  2191. for level, hr in enumerate(header):
  2192. try:
  2193. line = self._buffered_line()
  2194. while self.line_pos <= hr:
  2195. line = self._next_line()
  2196. except StopIteration as err:
  2197. if self.line_pos < hr:
  2198. raise ValueError(
  2199. f"Passed header={hr} but only {self.line_pos + 1} lines in "
  2200. "file"
  2201. ) from err
  2202. # We have an empty file, so check
  2203. # if columns are provided. That will
  2204. # serve as the 'line' for parsing
  2205. if have_mi_columns and hr > 0:
  2206. if clear_buffer:
  2207. self._clear_buffer()
  2208. columns.append([None] * len(columns[-1]))
  2209. return columns, num_original_columns, unnamed_cols
  2210. if not self.names:
  2211. raise EmptyDataError("No columns to parse from file") from err
  2212. line = self.names[:]
  2213. this_columns = []
  2214. this_unnamed_cols = []
  2215. for i, c in enumerate(line):
  2216. if c == "":
  2217. if have_mi_columns:
  2218. col_name = f"Unnamed: {i}_level_{level}"
  2219. else:
  2220. col_name = f"Unnamed: {i}"
  2221. this_unnamed_cols.append(i)
  2222. this_columns.append(col_name)
  2223. else:
  2224. this_columns.append(c)
  2225. if not have_mi_columns and self.mangle_dupe_cols:
  2226. counts = defaultdict(int)
  2227. for i, col in enumerate(this_columns):
  2228. cur_count = counts[col]
  2229. while cur_count > 0:
  2230. counts[col] = cur_count + 1
  2231. col = f"{col}.{cur_count}"
  2232. cur_count = counts[col]
  2233. this_columns[i] = col
  2234. counts[col] = cur_count + 1
  2235. elif have_mi_columns:
2236. # if we have grabbed an extra line, but it's not in our
2237. # format, save it in the buffer and create a blank extra
2238. # line for the rest of the parsing code
  2239. if hr == header[-1]:
  2240. lc = len(this_columns)
  2241. ic = len(self.index_col) if self.index_col is not None else 0
  2242. unnamed_count = len(this_unnamed_cols)
  2243. if lc != unnamed_count and lc - ic > unnamed_count:
  2244. clear_buffer = False
  2245. this_columns = [None] * lc
  2246. self.buf = [self.buf[-1]]
  2247. columns.append(this_columns)
  2248. unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})
  2249. if len(columns) == 1:
  2250. num_original_columns = len(this_columns)
  2251. if clear_buffer:
  2252. self._clear_buffer()
  2253. if names is not None:
  2254. if (self.usecols is not None and len(names) != len(self.usecols)) or (
  2255. self.usecols is None and len(names) != len(columns[0])
  2256. ):
  2257. raise ValueError(
  2258. "Number of passed names did not match "
  2259. "number of header fields in the file"
  2260. )
  2261. if len(columns) > 1:
  2262. raise TypeError("Cannot pass names with multi-index columns")
  2263. if self.usecols is not None:
  2264. # Set _use_cols. We don't store columns because they are
  2265. # overwritten.
  2266. self._handle_usecols(columns, names)
  2267. else:
  2268. self._col_indices = None
  2269. num_original_columns = len(names)
  2270. columns = [names]
  2271. else:
  2272. columns = self._handle_usecols(columns, columns[0])
  2273. else:
  2274. try:
  2275. line = self._buffered_line()
  2276. except StopIteration as err:
  2277. if not names:
  2278. raise EmptyDataError("No columns to parse from file") from err
  2279. line = names[:]
  2280. ncols = len(line)
  2281. num_original_columns = ncols
  2282. if not names:
  2283. if self.prefix:
  2284. columns = [[f"{self.prefix}{i}" for i in range(ncols)]]
  2285. else:
  2286. columns = [list(range(ncols))]
  2287. columns = self._handle_usecols(columns, columns[0])
  2288. else:
  2289. if self.usecols is None or len(names) >= num_original_columns:
  2290. columns = self._handle_usecols([names], names)
  2291. num_original_columns = len(names)
  2292. else:
  2293. if not callable(self.usecols) and len(names) != len(self.usecols):
  2294. raise ValueError(
  2295. "Number of passed names did not match number of "
  2296. "header fields in the file"
  2297. )
  2298. # Ignore output but set used columns.
  2299. self._handle_usecols([names], names)
  2300. columns = [names]
  2301. num_original_columns = ncols
  2302. return columns, num_original_columns, unnamed_cols
  2303. def _handle_usecols(self, columns, usecols_key):
  2304. """
  2305. Sets self._col_indices
  2306. usecols_key is used if there are string usecols.
  2307. """
  2308. if self.usecols is not None:
  2309. if callable(self.usecols):
  2310. col_indices = _evaluate_usecols(self.usecols, usecols_key)
  2311. elif any(isinstance(u, str) for u in self.usecols):
  2312. if len(columns) > 1:
  2313. raise ValueError(
  2314. "If using multiple headers, usecols must be integers."
  2315. )
  2316. col_indices = []
  2317. for col in self.usecols:
  2318. if isinstance(col, str):
  2319. try:
  2320. col_indices.append(usecols_key.index(col))
  2321. except ValueError:
  2322. _validate_usecols_names(self.usecols, usecols_key)
  2323. else:
  2324. col_indices.append(col)
  2325. else:
  2326. col_indices = self.usecols
  2327. columns = [
  2328. [n for i, n in enumerate(column) if i in col_indices]
  2329. for column in columns
  2330. ]
  2331. self._col_indices = col_indices
  2332. return columns
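# Rough illustration (hedged sketch): with usecols=["b"] and a single header
# row ["a", "b", "c"], the string is looked up in usecols_key, giving
# self._col_indices == [1] and the columns filtered down to [["b"]].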
  2333. def _buffered_line(self):
  2334. """
  2335. Return a line from buffer, filling buffer if required.
  2336. """
  2337. if len(self.buf) > 0:
  2338. return self.buf[0]
  2339. else:
  2340. return self._next_line()
  2341. def _check_for_bom(self, first_row):
  2342. """
  2343. Checks whether the file begins with the BOM character.
  2344. If it does, remove it. In addition, if there is quoting
  2345. in the field subsequent to the BOM, remove it as well
  2346. because it technically takes place at the beginning of
  2347. the name, not the middle of it.
  2348. """
  2349. # first_row will be a list, so we need to check
  2350. # that that list is not empty before proceeding.
  2351. if not first_row:
  2352. return first_row
  2353. # The first element of this row is the one that could have the
  2354. # BOM that we want to remove. Check that the first element is a
  2355. # string before proceeding.
  2356. if not isinstance(first_row[0], str):
  2357. return first_row
  2358. # Check that the string is not empty, as that would
  2359. # obviously not have a BOM at the start of it.
  2360. if not first_row[0]:
  2361. return first_row
  2362. # Since the string is non-empty, check that it does
  2363. # in fact begin with a BOM.
  2364. first_elt = first_row[0][0]
  2365. if first_elt != _BOM:
  2366. return first_row
  2367. first_row_bom = first_row[0]
  2368. if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
  2369. start = 2
  2370. quote = first_row_bom[1]
  2371. end = first_row_bom[2:].index(quote) + 2
  2372. # Extract the data between the quotation marks
  2373. new_row = first_row_bom[start:end]
  2374. # Extract any remaining data after the second
  2375. # quotation mark.
  2376. if len(first_row_bom) > end + 1:
  2377. new_row += first_row_bom[end + 1 :]
  2378. return [new_row] + first_row[1:]
  2379. elif len(first_row_bom) > 1:
  2380. return [first_row_bom[1:]]
  2381. else:
  2382. # First row is just the BOM, so we
  2383. # return an empty string.
  2384. return [""]
  2385. def _is_line_empty(self, line):
  2386. """
  2387. Check if a line is empty or not.
  2388. Parameters
  2389. ----------
  2390. line : str, array-like
  2391. The line of data to check.
  2392. Returns
  2393. -------
  2394. boolean : Whether or not the line is empty.
  2395. """
  2396. return not line or all(not x for x in line)
  2397. def _next_line(self):
  2398. if isinstance(self.data, list):
  2399. while self.skipfunc(self.pos):
  2400. self.pos += 1
  2401. while True:
  2402. try:
  2403. line = self._check_comments([self.data[self.pos]])[0]
  2404. self.pos += 1
  2405. # either uncommented or blank to begin with
  2406. if not self.skip_blank_lines and (
  2407. self._is_line_empty(self.data[self.pos - 1]) or line
  2408. ):
  2409. break
  2410. elif self.skip_blank_lines:
  2411. ret = self._remove_empty_lines([line])
  2412. if ret:
  2413. line = ret[0]
  2414. break
  2415. except IndexError:
  2416. raise StopIteration
  2417. else:
  2418. while self.skipfunc(self.pos):
  2419. self.pos += 1
  2420. next(self.data)
  2421. while True:
  2422. orig_line = self._next_iter_line(row_num=self.pos + 1)
  2423. self.pos += 1
  2424. if orig_line is not None:
  2425. line = self._check_comments([orig_line])[0]
  2426. if self.skip_blank_lines:
  2427. ret = self._remove_empty_lines([line])
  2428. if ret:
  2429. line = ret[0]
  2430. break
  2431. elif self._is_line_empty(orig_line) or line:
  2432. break
  2433. # This was the first line of the file,
  2434. # which could contain the BOM at the
  2435. # beginning of it.
  2436. if self.pos == 1:
  2437. line = self._check_for_bom(line)
  2438. self.line_pos += 1
  2439. self.buf.append(line)
  2440. return line
  2441. def _alert_malformed(self, msg, row_num):
  2442. """
  2443. Alert a user about a malformed row.
  2444. If `self.error_bad_lines` is True, the alert will be `ParserError`.
  2445. If `self.warn_bad_lines` is True, the alert will be printed out.
  2446. Parameters
  2447. ----------
  2448. msg : The error message to display.
  2449. row_num : The row number where the parsing error occurred.
  2450. Because this row number is displayed, we 1-index,
  2451. even though we 0-index internally.
  2452. """
  2453. if self.error_bad_lines:
  2454. raise ParserError(msg)
  2455. elif self.warn_bad_lines:
  2456. base = f"Skipping line {row_num}: "
  2457. sys.stderr.write(base + msg + "\n")
  2458. def _next_iter_line(self, row_num):
  2459. """
  2460. Wrapper around iterating through `self.data` (CSV source).
  2461. When a CSV error is raised, we check for specific
  2462. error messages that allow us to customize the
  2463. error message displayed to the user.
  2464. Parameters
  2465. ----------
  2466. row_num : The row number of the line being parsed.
  2467. """
  2468. try:
  2469. return next(self.data)
  2470. except csv.Error as e:
  2471. if self.warn_bad_lines or self.error_bad_lines:
  2472. msg = str(e)
  2473. if "NULL byte" in msg or "line contains NUL" in msg:
  2474. msg = (
  2475. "NULL byte detected. This byte "
  2476. "cannot be processed in Python's "
  2477. "native csv library at the moment, "
  2478. "so please pass in engine='c' instead"
  2479. )
  2480. if self.skipfooter > 0:
  2481. reason = (
  2482. "Error could possibly be due to "
  2483. "parsing errors in the skipped footer rows "
  2484. "(the skipfooter keyword is only applied "
  2485. "after Python's csv library has parsed "
  2486. "all rows)."
  2487. )
  2488. msg += ". " + reason
  2489. self._alert_malformed(msg, row_num)
  2490. return None
  2491. def _check_comments(self, lines):
  2492. if self.comment is None:
  2493. return lines
  2494. ret = []
  2495. for l in lines:
  2496. rl = []
  2497. for x in l:
  2498. if not isinstance(x, str) or self.comment not in x:
  2499. rl.append(x)
  2500. else:
  2501. x = x[: x.find(self.comment)]
  2502. if len(x) > 0:
  2503. rl.append(x)
  2504. break
  2505. ret.append(rl)
  2506. return ret
  2507. def _remove_empty_lines(self, lines):
  2508. """
  2509. Iterate through the lines and remove any that are
  2510. either empty or contain only one whitespace value
  2511. Parameters
  2512. ----------
  2513. lines : array-like
  2514. The array of lines that we are to filter.
  2515. Returns
  2516. -------
  2517. filtered_lines : array-like
  2518. The same array of lines with the "empty" ones removed.
  2519. """
  2520. ret = []
  2521. for l in lines:
  2522. # Remove empty lines and lines with only one whitespace value
  2523. if (
  2524. len(l) > 1
  2525. or len(l) == 1
  2526. and (not isinstance(l[0], str) or l[0].strip())
  2527. ):
  2528. ret.append(l)
  2529. return ret
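# Rough illustration (hedged sketch): [["a", "b"], [""], ["  "], [0]] filters
# down to [["a", "b"], [0]]; a single-element row survives only if its element
# is a non-blank string or not a string at all.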
  2530. def _check_thousands(self, lines):
  2531. if self.thousands is None:
  2532. return lines
  2533. return self._search_replace_num_columns(
  2534. lines=lines, search=self.thousands, replace=""
  2535. )
  2536. def _search_replace_num_columns(self, lines, search, replace):
  2537. ret = []
  2538. for l in lines:
  2539. rl = []
  2540. for i, x in enumerate(l):
  2541. if (
  2542. not isinstance(x, str)
  2543. or search not in x
  2544. or (self._no_thousands_columns and i in self._no_thousands_columns)
  2545. or self.nonnum.search(x.strip())
  2546. ):
  2547. rl.append(x)
  2548. else:
  2549. rl.append(x.replace(search, replace))
  2550. ret.append(rl)
  2551. return ret
  2552. def _check_decimal(self, lines):
  2553. if self.decimal == _parser_defaults["decimal"]:
  2554. return lines
  2555. return self._search_replace_num_columns(
  2556. lines=lines, search=self.decimal, replace="."
  2557. )
  2558. def _clear_buffer(self):
  2559. self.buf = []
  2560. _implicit_index = False
  2561. def _get_index_name(self, columns):
  2562. """
2563. Try several cases to determine the index name(s):
  2564. 0) There are headers on row 0 and row 1 and their
  2565. total summed lengths equals the length of the next line.
  2566. Treat row 0 as columns and row 1 as indices
  2567. 1) Look for implicit index: there are more columns
  2568. on row 1 than row 0. If this is true, assume that row
  2569. 1 lists index columns and row 0 lists normal columns.
  2570. 2) Get index from the columns if it was listed.
  2571. """
  2572. orig_names = list(columns)
  2573. columns = list(columns)
  2574. try:
  2575. line = self._next_line()
  2576. except StopIteration:
  2577. line = None
  2578. try:
  2579. next_line = self._next_line()
  2580. except StopIteration:
  2581. next_line = None
  2582. # implicitly index_col=0 b/c 1 fewer column names
  2583. implicit_first_cols = 0
  2584. if line is not None:
  2585. # leave it 0, #2442
  2586. # Case 1
  2587. if self.index_col is not False:
  2588. implicit_first_cols = len(line) - self.num_original_columns
  2589. # Case 0
  2590. if next_line is not None:
  2591. if len(next_line) == len(line) + self.num_original_columns:
  2592. # column and index names on diff rows
  2593. self.index_col = list(range(len(line)))
  2594. self.buf = self.buf[1:]
  2595. for c in reversed(line):
  2596. columns.insert(0, c)
  2597. # Update list of original names to include all indices.
  2598. orig_names = list(columns)
  2599. self.num_original_columns = len(columns)
  2600. return line, orig_names, columns
  2601. if implicit_first_cols > 0:
  2602. # Case 1
  2603. self._implicit_index = True
  2604. if self.index_col is None:
  2605. self.index_col = list(range(implicit_first_cols))
  2606. index_name = None
  2607. else:
  2608. # Case 2
  2609. (index_name, columns_, self.index_col) = _clean_index_names(
  2610. columns, self.index_col, self.unnamed_cols
  2611. )
  2612. return index_name, orig_names, columns
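# Rough illustration (hedged sketch): a header row with two names followed by
# data rows of three fields gives implicit_first_cols == 1 (case 1), so the
# extra leading field becomes the index and index_col is set to [0]; case 0
# instead consumes the row after the header as index names when its length
# plus the column count equals the width of the following line.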
  2613. def _rows_to_cols(self, content):
  2614. col_len = self.num_original_columns
  2615. if self._implicit_index:
  2616. col_len += len(self.index_col)
  2617. max_len = max(len(row) for row in content)
  2618. # Check that there are no rows with too many
  2619. # elements in their row (rows with too few
  2620. # elements are padded with NaN).
  2621. if max_len > col_len and self.index_col is not False and self.usecols is None:
  2622. footers = self.skipfooter if self.skipfooter else 0
  2623. bad_lines = []
  2624. iter_content = enumerate(content)
  2625. content_len = len(content)
  2626. content = []
  2627. for (i, l) in iter_content:
  2628. actual_len = len(l)
  2629. if actual_len > col_len:
  2630. if self.error_bad_lines or self.warn_bad_lines:
  2631. row_num = self.pos - (content_len - i + footers)
  2632. bad_lines.append((row_num, actual_len))
  2633. if self.error_bad_lines:
  2634. break
  2635. else:
  2636. content.append(l)
  2637. for row_num, actual_len in bad_lines:
  2638. msg = (
  2639. f"Expected {col_len} fields in line {row_num + 1}, saw "
  2640. f"{actual_len}"
  2641. )
  2642. if (
  2643. self.delimiter
  2644. and len(self.delimiter) > 1
  2645. and self.quoting != csv.QUOTE_NONE
  2646. ):
  2647. # see gh-13374
  2648. reason = (
  2649. "Error could possibly be due to quotes being "
  2650. "ignored when a multi-char delimiter is used."
  2651. )
  2652. msg += ". " + reason
  2653. self._alert_malformed(msg, row_num + 1)
  2654. # see gh-13320
  2655. zipped_content = list(lib.to_object_array(content, min_width=col_len).T)
  2656. if self.usecols:
  2657. if self._implicit_index:
  2658. zipped_content = [
  2659. a
  2660. for i, a in enumerate(zipped_content)
  2661. if (
  2662. i < len(self.index_col)
  2663. or i - len(self.index_col) in self._col_indices
  2664. )
  2665. ]
  2666. else:
  2667. zipped_content = [
  2668. a for i, a in enumerate(zipped_content) if i in self._col_indices
  2669. ]
  2670. return zipped_content
  2671. def _get_lines(self, rows=None):
  2672. lines = self.buf
  2673. new_rows = None
  2674. # already fetched some number
  2675. if rows is not None:
  2676. # we already have the lines in the buffer
  2677. if len(self.buf) >= rows:
  2678. new_rows, self.buf = self.buf[:rows], self.buf[rows:]
  2679. # need some lines
  2680. else:
  2681. rows -= len(self.buf)
  2682. if new_rows is None:
  2683. if isinstance(self.data, list):
  2684. if self.pos > len(self.data):
  2685. raise StopIteration
  2686. if rows is None:
  2687. new_rows = self.data[self.pos :]
  2688. new_pos = len(self.data)
  2689. else:
  2690. new_rows = self.data[self.pos : self.pos + rows]
  2691. new_pos = self.pos + rows
  2692. # Check for stop rows. n.b.: self.skiprows is a set.
  2693. if self.skiprows:
  2694. new_rows = [
  2695. row
  2696. for i, row in enumerate(new_rows)
  2697. if not self.skipfunc(i + self.pos)
  2698. ]
  2699. lines.extend(new_rows)
  2700. self.pos = new_pos
  2701. else:
  2702. new_rows = []
  2703. try:
  2704. if rows is not None:
  2705. for _ in range(rows):
  2706. new_rows.append(next(self.data))
  2707. lines.extend(new_rows)
  2708. else:
  2709. rows = 0
  2710. while True:
  2711. new_row = self._next_iter_line(row_num=self.pos + rows + 1)
  2712. rows += 1
  2713. if new_row is not None:
  2714. new_rows.append(new_row)
  2715. except StopIteration:
  2716. if self.skiprows:
  2717. new_rows = [
  2718. row
  2719. for i, row in enumerate(new_rows)
  2720. if not self.skipfunc(i + self.pos)
  2721. ]
  2722. lines.extend(new_rows)
  2723. if len(lines) == 0:
  2724. raise
  2725. self.pos += len(new_rows)
  2726. self.buf = []
  2727. else:
  2728. lines = new_rows
  2729. if self.skipfooter:
  2730. lines = lines[: -self.skipfooter]
  2731. lines = self._check_comments(lines)
  2732. if self.skip_blank_lines:
  2733. lines = self._remove_empty_lines(lines)
  2734. lines = self._check_thousands(lines)
  2735. return self._check_decimal(lines)
  2736. def _make_date_converter(
  2737. date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
  2738. ):
  2739. def converter(*date_cols):
  2740. if date_parser is None:
  2741. strs = parsing.concat_date_cols(date_cols)
  2742. try:
  2743. return tools.to_datetime(
  2744. ensure_object(strs),
  2745. utc=None,
  2746. dayfirst=dayfirst,
  2747. errors="ignore",
  2748. infer_datetime_format=infer_datetime_format,
  2749. cache=cache_dates,
  2750. ).to_numpy()
  2751. except ValueError:
  2752. return tools.to_datetime(
  2753. parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
  2754. )
  2755. else:
  2756. try:
  2757. result = tools.to_datetime(
  2758. date_parser(*date_cols), errors="ignore", cache=cache_dates
  2759. )
  2760. if isinstance(result, datetime.datetime):
  2761. raise Exception("scalar parser")
  2762. return result
  2763. except Exception:
  2764. try:
  2765. return tools.to_datetime(
  2766. parsing.try_parse_dates(
  2767. parsing.concat_date_cols(date_cols),
  2768. parser=date_parser,
  2769. dayfirst=dayfirst,
  2770. ),
  2771. errors="ignore",
  2772. )
  2773. except Exception:
  2774. return generic_parser(date_parser, *date_cols)
  2775. return converter
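# Rough illustration (hedged sketch): the returned converter joins its column
# arguments into date strings before calling to_datetime, so arrays
# ["2020-01-01", "2020-01-02"] and ["00:00", "12:30"] are parsed as
# "2020-01-01 00:00" and "2020-01-02 12:30"; a user-supplied date_parser is
# tried first, falling back to element-wise parsing via generic_parser.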
  2776. def _process_date_conversion(
  2777. data_dict,
  2778. converter,
  2779. parse_spec,
  2780. index_col,
  2781. index_names,
  2782. columns,
  2783. keep_date_col=False,
  2784. ):
  2785. def _isindex(colspec):
  2786. return (isinstance(index_col, list) and colspec in index_col) or (
  2787. isinstance(index_names, list) and colspec in index_names
  2788. )
  2789. new_cols = []
  2790. new_data = {}
  2791. orig_names = columns
  2792. columns = list(columns)
  2793. date_cols = set()
  2794. if parse_spec is None or isinstance(parse_spec, bool):
  2795. return data_dict, columns
  2796. if isinstance(parse_spec, list):
  2797. # list of column lists
  2798. for colspec in parse_spec:
  2799. if is_scalar(colspec):
  2800. if isinstance(colspec, int) and colspec not in data_dict:
  2801. colspec = orig_names[colspec]
  2802. if _isindex(colspec):
  2803. continue
  2804. data_dict[colspec] = converter(data_dict[colspec])
  2805. else:
  2806. new_name, col, old_names = _try_convert_dates(
  2807. converter, colspec, data_dict, orig_names
  2808. )
  2809. if new_name in data_dict:
  2810. raise ValueError(f"New date column already in dict {new_name}")
  2811. new_data[new_name] = col
  2812. new_cols.append(new_name)
  2813. date_cols.update(old_names)
  2814. elif isinstance(parse_spec, dict):
  2815. # dict of new name to column list
  2816. for new_name, colspec in parse_spec.items():
  2817. if new_name in data_dict:
  2818. raise ValueError(f"Date column {new_name} already in dict")
  2819. _, col, old_names = _try_convert_dates(
  2820. converter, colspec, data_dict, orig_names
  2821. )
  2822. new_data[new_name] = col
  2823. new_cols.append(new_name)
  2824. date_cols.update(old_names)
  2825. data_dict.update(new_data)
  2826. new_cols.extend(columns)
  2827. if not keep_date_col:
  2828. for c in list(date_cols):
  2829. data_dict.pop(c)
  2830. new_cols.remove(c)
  2831. return data_dict, new_cols
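# Rough illustration (hedged sketch): parse_spec={"dt": ["date", "time"]}
# combines the "date" and "time" columns through the converter into a new
# leading "dt" column and drops the originals unless keep_date_col is True;
# a plain list entry such as "date" converts that column in place.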
  2832. def _try_convert_dates(parser, colspec, data_dict, columns):
  2833. colset = set(columns)
  2834. colnames = []
  2835. for c in colspec:
  2836. if c in colset:
  2837. colnames.append(c)
  2838. elif isinstance(c, int) and c not in columns:
  2839. colnames.append(columns[c])
  2840. else:
  2841. colnames.append(c)
  2842. new_name = "_".join(str(x) for x in colnames)
  2843. to_parse = [data_dict[c] for c in colnames if c in data_dict]
  2844. new_col = parser(*to_parse)
  2845. return new_name, new_col, colnames
  2846. def _clean_na_values(na_values, keep_default_na=True):
  2847. if na_values is None:
  2848. if keep_default_na:
  2849. na_values = STR_NA_VALUES
  2850. else:
  2851. na_values = set()
  2852. na_fvalues = set()
  2853. elif isinstance(na_values, dict):
  2854. old_na_values = na_values.copy()
  2855. na_values = {} # Prevent aliasing.
  2856. # Convert the values in the na_values dictionary
  2857. # into array-likes for further use. This is also
  2858. # where we append the default NaN values, provided
  2859. # that `keep_default_na=True`.
  2860. for k, v in old_na_values.items():
  2861. if not is_list_like(v):
  2862. v = [v]
  2863. if keep_default_na:
  2864. v = set(v) | STR_NA_VALUES
  2865. na_values[k] = v
  2866. na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
  2867. else:
  2868. if not is_list_like(na_values):
  2869. na_values = [na_values]
  2870. na_values = _stringify_na_values(na_values)
  2871. if keep_default_na:
  2872. na_values = na_values | STR_NA_VALUES
  2873. na_fvalues = _floatify_na_values(na_values)
  2874. return na_values, na_fvalues
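# Rough illustration (hedged sketch): na_values=["-999", "foo"] with
# keep_default_na=True produces a string set containing "-999", "-999.0",
# -999 and "foo" in addition to STR_NA_VALUES, while na_fvalues picks up the
# float -999.0; a dict input is cleaned per column in the same way.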
  2875. def _clean_index_names(columns, index_col, unnamed_cols):
  2876. if not _is_index_col(index_col):
  2877. return None, columns, index_col
  2878. columns = list(columns)
  2879. cp_cols = list(columns)
  2880. index_names = []
  2881. # don't mutate
  2882. index_col = list(index_col)
  2883. for i, c in enumerate(index_col):
  2884. if isinstance(c, str):
  2885. index_names.append(c)
  2886. for j, name in enumerate(cp_cols):
  2887. if name == c:
  2888. index_col[i] = j
  2889. columns.remove(name)
  2890. break
  2891. else:
  2892. name = cp_cols[c]
  2893. columns.remove(name)
  2894. index_names.append(name)
  2895. # Only clean index names that were placeholders.
  2896. for i, name in enumerate(index_names):
  2897. if isinstance(name, str) and name in unnamed_cols:
  2898. index_names[i] = None
  2899. return index_names, columns, index_col
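# Rough illustration (hedged sketch):
#   _clean_index_names(["idx", "a", "b"], ["idx"], unnamed_cols=set())
# returns (["idx"], ["a", "b"], [0]): the string entry is resolved to position
# 0, that column is removed from the remaining columns, and any "Unnamed"
# placeholder index names are reset to None.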


def _get_empty_meta(columns, index_col, index_names, dtype=None):
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not isinstance(dtype, dict):
        # if dtype == None, default will be np.object.
        default_dtype = dtype or np.object
        dtype = defaultdict(lambda: default_dtype)
    else:
        # Save a copy of the dictionary.
        _dtype = dtype.copy()
        dtype = defaultdict(lambda: np.object)

        # Convert column indexes to column names.
        for k, v in _dtype.items():
            col = columns[k] if is_integer(k) else k
            dtype[col] = v

    # Even though we have no data, the "index" of the empty DataFrame
    # could for example still be an empty MultiIndex. Thus, we need to
    # check whether we have any index columns specified, via either:
    #
    # 1) index_col (column indices)
    # 2) index_names (column names)
    #
    # Both must be non-null to ensure a successful construction. Otherwise,
    # we have to create a generic empty Index.
    if (index_col is None or index_col is False) or index_names is None:
        index = Index([])
    else:
        data = [Series([], dtype=dtype[name]) for name in index_names]
        index = ensure_index_from_sequences(data, names=index_names)
        index_col.sort()

        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns}

    return index, columns, col_dict
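
# Editor's sketch (illustrative, not part of the original module): for an empty
# parse with an index column, _get_empty_meta returns an empty index plus an
# empty Series per remaining column (hypothetical arguments below).
#
#   >>> index, columns, col_dict = _get_empty_meta(
#   ...     ["a", "b"], index_col=[0], index_names=["a"], dtype={"b": "int64"}
#   ... )
#   >>> list(index), columns, col_dict["b"].dtype
#   ([], ['b'], dtype('int64'))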


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values):
    """Return stringified and numeric versions of these values."""
    result = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # If the value looks like an integer (e.g. 999), also record
            # its "999.0" and "999" spellings.
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            result.append(int(x))
        except (TypeError, ValueError, OverflowError):
            pass
    return set(result)
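
# Editor's sketch (illustrative, not part of the original module): the two
# helpers above expand a single numeric sentinel into the spellings a raw file
# might contain (the example value 999 is made up).
#
#   >>> _stringify_na_values([999]) == {999, "999", "999.0"}
#   True
#   >>> _floatify_na_values([999, "999"])
#   {999.0}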


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues
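
# Editor's sketch (illustrative, not part of the original module): when
# na_values is a dict, a column that is not mapped falls back to the default
# sentinels or to nothing, depending on keep_default_na (names are made up).
#
#   >>> _get_na_values("b", {"a": {"x"}}, {"a": set()}, keep_default_na=False)
#   (set(), set())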


def _get_col_names(colspec, columns):
    colset = set(columns)
    colnames = []
    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int):
            colnames.append(columns[c])
    return colnames
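
# Editor's sketch (illustrative, not part of the original module): _get_col_names
# accepts a mix of column labels and positional indices (made-up values).
#
#   >>> _get_col_names(["a", 2], ["a", "b", "c"])
#   ['a', 'c']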


class FixedWidthReader(abc.Iterator):
    """
    A reader of fixed-width lines.
    """

    def __init__(
        self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100
    ):
        self.f = f
        self.buffer = None
        self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
        self.comment = comment
        if colspecs == "infer":
            self.colspecs = self.detect_colspecs(
                infer_nrows=infer_nrows, skiprows=skiprows
            )
        else:
            self.colspecs = colspecs

        if not isinstance(self.colspecs, (tuple, list)):
            raise TypeError(
                "column specifications must be a list or tuple, "
                f"input was a {type(colspecs).__name__}"
            )

        for colspec in self.colspecs:
            if not (
                isinstance(colspec, (tuple, list))
                and len(colspec) == 2
                and isinstance(colspec[0], (int, np.integer, type(None)))
                and isinstance(colspec[1], (int, np.integer, type(None)))
            ):
                raise TypeError(
                    "Each column specification must be "
                    "2 element tuple or list of integers"
                )
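
    # Editor's note (illustrative, not part of the original module): a valid
    # explicit colspecs value is a list of half-open (start, stop) pairs such
    # as [(0, 6), (8, 20)]; None for either bound means "from the start of the
    # line" or "to the end of the line", respectively.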

    def get_rows(self, infer_nrows, skiprows=None):
        """
        Read rows from self.f, skipping as specified.

        We distinguish buffer_rows (the first <= infer_nrows
        lines) from the rows returned to detect_colspecs
        because it's simpler to leave the other locations
        with skiprows logic alone than to modify them to
        deal with the fact we skipped some rows here as
        well.

        Parameters
        ----------
        infer_nrows : int
            Number of rows to read from self.f, not counting
            rows that are skipped.
        skiprows : set, optional
            Indices of rows to skip.

        Returns
        -------
        detect_rows : list of str
            A list containing the rows to read.
        """
        if skiprows is None:
            skiprows = set()
        buffer_rows = []
        detect_rows = []
        for i, row in enumerate(self.f):
            if i not in skiprows:
                detect_rows.append(row)
            buffer_rows.append(row)
            if len(detect_rows) >= infer_nrows:
                break
        self.buffer = iter(buffer_rows)
        return detect_rows

    def detect_colspecs(self, infer_nrows=100, skiprows=None):
        # Regex escape the delimiters
        delimiters = "".join(fr"\{x}" for x in self.delimiter)
        pattern = re.compile(f"([^{delimiters}]+)")
        rows = self.get_rows(infer_nrows, skiprows)
        if not rows:
            raise EmptyDataError("No rows from which to infer column width")
        max_len = max(map(len, rows))
        mask = np.zeros(max_len + 1, dtype=int)
        if self.comment is not None:
            rows = [row.partition(self.comment)[0] for row in rows]
        for row in rows:
            for m in pattern.finditer(row):
                mask[m.start() : m.end()] = 1
        shifted = np.roll(mask, 1)
        shifted[0] = 0
        edges = np.where((mask ^ shifted) == 1)[0]
        edge_pairs = list(zip(edges[::2], edges[1::2]))
        return edge_pairs
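
    # Editor's sketch (illustrative, not part of the original module): for the
    # rows "id  name" and "1   bob", the mask marks the non-delimiter runs
    # with 1s; XOR-ing the mask with a right-shifted copy exposes the 0->1 and
    # 1->0 transitions, so the inferred specs are [(0, 2), (4, 8)].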

    def __next__(self):
        if self.buffer is not None:
            try:
                line = next(self.buffer)
            except StopIteration:
                self.buffer = None
                line = next(self.f)
        else:
            line = next(self.f)
        # Note: 'colspecs' is a sequence of half-open intervals.
        return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
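
# Editor's sketch (illustrative, not part of the original module): driving the
# reader directly with inferred colspecs (the input text is made up).
#
#   >>> from io import StringIO
#   >>> reader = FixedWidthReader(
#   ...     StringIO("id  name\n1   bob\n2   alice\n"),
#   ...     colspecs="infer", delimiter=None, comment=None,
#   ... )
#   >>> next(reader)
#   ['id', 'name']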


class FixedWidthFieldParser(PythonParser):
    """
    Specialization that converts fixed-width fields into DataFrames.

    See PythonParser for details.
    """

    def __init__(self, f, **kwds):
        # Support iterators, convert to a list.
        self.colspecs = kwds.pop("colspecs")
        self.infer_nrows = kwds.pop("infer_nrows")
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f):
        self.data = FixedWidthReader(
            f,
            self.colspecs,
            self.delimiter,
            self.comment,
            self.skiprows,
            self.infer_nrows,
        )
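
# Editor's sketch (illustrative, not part of the original module): these
# classes back the public pandas.read_fwf entry point, e.g.
#
#   >>> import pandas as pd
#   >>> from io import StringIO
#   >>> df = pd.read_fwf(StringIO("id  name\n1   bob\n2   alice\n"))
#   >>> list(df.columns), df.shape
#   (['id', 'name'], (2, 2))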