
/pandas/io/parsers.py

http://github.com/wesm/pandas
Possible License(s): BSD-3-Clause, Apache-2.0


  1. """
  2. Module contains tools for processing files into DataFrames or other objects
  3. """
  4. from __future__ import print_function
  5. from collections import defaultdict
  6. import csv
  7. import datetime
  8. import re
  9. import sys
  10. from textwrap import fill
  11. import warnings
  12. import numpy as np
  13. import pandas._libs.lib as lib
  14. import pandas._libs.ops as libops
  15. import pandas._libs.parsers as parsers
  16. from pandas._libs.tslibs import parsing
  17. import pandas.compat as compat
  18. from pandas.compat import (
  19. PY3, StringIO, lrange, lzip, map, range, string_types, u, zip)
  20. from pandas.errors import (
  21. AbstractMethodError, EmptyDataError, ParserError, ParserWarning)
  22. from pandas.util._decorators import Appender
  23. from pandas.core.dtypes.cast import astype_nansafe
  24. from pandas.core.dtypes.common import (
  25. ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
  26. is_extension_array_dtype, is_float, is_integer, is_integer_dtype,
  27. is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype)
  28. from pandas.core.dtypes.dtypes import CategoricalDtype
  29. from pandas.core.dtypes.missing import isna
  30. from pandas.core import algorithms
  31. from pandas.core.arrays import Categorical
  32. from pandas.core.frame import DataFrame
  33. from pandas.core.index import (
  34. Index, MultiIndex, RangeIndex, ensure_index_from_sequences)
  35. from pandas.core.series import Series
  36. from pandas.core.tools import datetimes as tools
  37. from pandas.io.common import (
  38. _NA_VALUES, BaseIterator, UnicodeReader, UTF8Recoder, _get_handle,
  39. _infer_compression, _validate_header_arg, get_filepath_or_buffer,
  40. is_file_like)
  41. from pandas.io.date_converters import generic_parser
  42. # BOM character (byte order mark)
  43. # This exists at the beginning of a file to indicate endianness
  44. # of a file (stream). Unfortunately, this marker screws up parsing,
  45. # so we need to remove it if we see it.
  46. _BOM = u('\ufeff')


_doc_read_csv_and_table = r"""
{summary}

Also supports optionally iterating or breaking of the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <http://pandas.pydata.org/pandas-docs/stable/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object, or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts either
    ``pathlib.Path`` or ``py._path.local.LocalPath``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and automatically detect the separator by Python's builtin sniffer
    tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
    different from ``'\s+'`` will be interpreted as regular expressions and
    will also force the use of the Python parsing engine. Note that regex
    delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If file contains no header row, then you
    should explicitly pass ``header=None``. Duplicates in this list will cause
    a ``UserWarning`` to be issued.
index_col : int, sequence or bool, optional
    Column to use as the row labels of the DataFrame. If a sequence is given, a
    MultiIndex is used. If you have a malformed file with delimiters at the end
    of each line, you might consider ``index_col=False`` to force pandas to
    not use the first column as the index (row names).
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : {{'c', 'python'}}, optional
    Parser engine to use. The C engine is faster while the python engine is
    currently more feature-complete.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can either
    be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '""" + fill("', '".join(sorted(_NA_VALUES)),
                     70, subsequent_indent="    ") + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified in `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparseable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.

    .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
tupleize_cols : bool, default False
    Leave a list of tuples on columns as is (default is to convert to
    a MultiIndex on the columns).

    .. deprecated:: 0.21.0
       This argument will be removed and will always convert to MultiIndex.
error_bad_lines : bool, default True
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.
warn_bad_lines : bool, default True
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.
delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.

    .. versionadded:: 0.18.1 support for the Python parser.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless;
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are `None` for the ordinary converter,
    `high` for the high-precision converter, and `round_trip` for the
    round-trip converter.

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""


def _validate_integer(name, val, min_val=0):
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : string
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
                                                               min_val=min_val)

    if val is not None:
        if is_float(val):
            if int(val) != val:
                raise ValueError(msg)
            val = int(val)
        elif not (is_integer(val) and val >= min_val):
            raise ValueError(msg)

    return val
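
# Usage sketch (illustrative, not part of the original file): integral
# floats are accepted and cast to int; anything else fails validation.
#
# >>> _validate_integer('chunksize', 3.0, min_val=1)
# 3
# >>> _validate_integer('chunksize', 0, min_val=1)
# Traceback (most recent call last):
#     ...
# ValueError: 'chunksize' must be an integer >=1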


def _validate_names(names):
    """
    Check if the `names` parameter contains duplicates.

    If duplicates are found, we issue a warning before returning.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Returns
    -------
    names : array-like or None
        The original `names` parameter.
    """
    if names is not None:
        if len(names) != len(set(names)):
            msg = ("Duplicate names specified. This "
                   "will raise an error in the future.")
            warnings.warn(msg, UserWarning, stacklevel=3)

    return names
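
# Usage sketch (illustrative, not part of the original file): duplicates
# only warn; the names are returned unchanged either way.
#
# >>> _validate_names(['a', 'b', 'a'])  # emits a UserWarning
# ['a', 'b', 'a']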


def _read(filepath_or_buffer, kwds):
    """Generic reader of line files."""
    encoding = kwds.get('encoding', None)
    if encoding is not None:
        encoding = re.sub('_', '-', encoding).lower()
        kwds['encoding'] = encoding

    compression = kwds.get('compression', 'infer')
    compression = _infer_compression(filepath_or_buffer, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
        filepath_or_buffer, encoding, compression)
    kwds['compression'] = compression

    if kwds.get('date_parser', None) is not None:
        if isinstance(kwds['parse_dates'], bool):
            kwds['parse_dates'] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get('iterator', False)
    chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
    nrows = kwds.get('nrows', None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        return parser

    try:
        data = parser.read(nrows)
    finally:
        parser.close()

    if should_close:
        try:
            filepath_or_buffer.close()
        except ValueError:
            pass

    return data
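
# Usage sketch (illustrative, not part of the original file): with
# ``iterator=True`` or ``chunksize`` set, ``_read`` hands back the
# TextFileReader itself rather than a fully-parsed DataFrame.
#
# >>> import pandas as pd
# >>> from pandas.compat import StringIO
# >>> parser = pd.read_csv(StringIO("a,b\n1,2\n3,4"), iterator=True)
# >>> parser.read(1)
#    a  b
# 0  1  2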


_parser_defaults = {
    'delimiter': None,
    'escapechar': None,
    'quotechar': '"',
    'quoting': csv.QUOTE_MINIMAL,
    'doublequote': True,
    'skipinitialspace': False,
    'lineterminator': None,
    'header': 'infer',
    'index_col': None,
    'names': None,
    'prefix': None,
    'skiprows': None,
    'skipfooter': 0,
    'nrows': None,
    'na_values': None,
    'keep_default_na': True,
    'true_values': None,
    'false_values': None,
    'converters': None,
    'dtype': None,
    'thousands': None,
    'comment': None,
    'decimal': b'.',
    # 'engine': 'c',
    'parse_dates': False,
    'keep_date_col': False,
    'dayfirst': False,
    'date_parser': None,
    'usecols': None,
    # 'iterator': False,
    'chunksize': None,
    'verbose': False,
    'encoding': None,
    'squeeze': False,
    'compression': None,
    'mangle_dupe_cols': True,
    'tupleize_cols': False,
    'infer_datetime_format': False,
    'skip_blank_lines': True
}

_c_parser_defaults = {
    'delim_whitespace': False,
    'na_filter': True,
    'low_memory': True,
    'memory_map': False,
    'error_bad_lines': True,
    'warn_bad_lines': True,
    'tupleize_cols': False,
    'float_precision': None
}

_fwf_defaults = {
    'colspecs': 'infer',
    'infer_nrows': 100,
    'widths': None,
}

_c_unsupported = {'skipfooter'}
_python_unsupported = {
    'low_memory',
    'float_precision',
}

_deprecated_defaults = {
    'tupleize_cols': None
}
_deprecated_args = {
    'tupleize_cols',
}


def _make_parser_function(name, default_sep=','):

    # prepare read_table deprecation
    if name == "read_table":
        sep = False
    else:
        sep = default_sep

    def parser_f(filepath_or_buffer,
                 sep=sep,
                 delimiter=None,

                 # Column and Index Locations and Names
                 header='infer',
                 names=None,
                 index_col=None,
                 usecols=None,
                 squeeze=False,
                 prefix=None,
                 mangle_dupe_cols=True,

                 # General Parsing Configuration
                 dtype=None,
                 engine=None,
                 converters=None,
                 true_values=None,
                 false_values=None,
                 skipinitialspace=False,
                 skiprows=None,
                 skipfooter=0,
                 nrows=None,

                 # NA and Missing Data Handling
                 na_values=None,
                 keep_default_na=True,
                 na_filter=True,
                 verbose=False,
                 skip_blank_lines=True,

                 # Datetime Handling
                 parse_dates=False,
                 infer_datetime_format=False,
                 keep_date_col=False,
                 date_parser=None,
                 dayfirst=False,

                 # Iteration
                 iterator=False,
                 chunksize=None,

                 # Quoting, Compression, and File Format
                 compression='infer',
                 thousands=None,
                 decimal=b'.',
                 lineterminator=None,
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL,
                 doublequote=True,
                 escapechar=None,
                 comment=None,
                 encoding=None,
                 dialect=None,
                 tupleize_cols=None,

                 # Error Handling
                 error_bad_lines=True,
                 warn_bad_lines=True,

                 # Internal
                 delim_whitespace=False,
                 low_memory=_c_parser_defaults['low_memory'],
                 memory_map=False,
                 float_precision=None):

        # deprecate read_table GH21948
        if name == "read_table":
            if sep is False and delimiter is None:
                warnings.warn("read_table is deprecated, use read_csv "
                              "instead, passing sep='\\t'.",
                              FutureWarning, stacklevel=2)
            else:
                warnings.warn("read_table is deprecated, use read_csv "
                              "instead.",
                              FutureWarning, stacklevel=2)
            if sep is False:
                sep = default_sep

        # gh-23761
        #
        # When a dialect is passed, it overrides any of the overlapping
        # parameters passed in directly. We don't want to warn if the
        # default parameters were passed in (since it probably means
        # that the user didn't pass them in explicitly in the first place).
        #
        # "delimiter" is the annoying corner case because we alias it to
        # "sep" before doing comparison to the dialect values later on.
        # Thus, we need a flag to indicate that we need to "override"
        # the comparison to dialect values by checking if default values
        # for BOTH "delimiter" and "sep" were provided.
        if dialect is not None:
            sep_override = delimiter is None and sep == default_sep
            kwds = dict(sep_override=sep_override)
        else:
            kwds = dict()

        # Alias sep -> delimiter.
        if delimiter is None:
            delimiter = sep

        if delim_whitespace and delimiter != default_sep:
            raise ValueError("Specified a delimiter with both sep and"
                             " delim_whitespace=True; you can only"
                             " specify one.")

        if engine is not None:
            engine_specified = True
        else:
            engine = 'c'
            engine_specified = False

        kwds.update(delimiter=delimiter,
                    engine=engine,
                    dialect=dialect,
                    compression=compression,
                    engine_specified=engine_specified,
                    doublequote=doublequote,
                    escapechar=escapechar,
                    quotechar=quotechar,
                    quoting=quoting,
                    skipinitialspace=skipinitialspace,
                    lineterminator=lineterminator,
                    header=header,
                    index_col=index_col,
                    names=names,
                    prefix=prefix,
                    skiprows=skiprows,
                    skipfooter=skipfooter,
                    na_values=na_values,
                    true_values=true_values,
                    false_values=false_values,
                    keep_default_na=keep_default_na,
                    thousands=thousands,
                    comment=comment,
                    decimal=decimal,
                    parse_dates=parse_dates,
                    keep_date_col=keep_date_col,
                    dayfirst=dayfirst,
                    date_parser=date_parser,
                    nrows=nrows,
                    iterator=iterator,
                    chunksize=chunksize,
                    converters=converters,
                    dtype=dtype,
                    usecols=usecols,
                    verbose=verbose,
                    encoding=encoding,
                    squeeze=squeeze,
                    memory_map=memory_map,
                    float_precision=float_precision,
                    na_filter=na_filter,
                    delim_whitespace=delim_whitespace,
                    warn_bad_lines=warn_bad_lines,
                    error_bad_lines=error_bad_lines,
                    low_memory=low_memory,
                    mangle_dupe_cols=mangle_dupe_cols,
                    tupleize_cols=tupleize_cols,
                    infer_datetime_format=infer_datetime_format,
                    skip_blank_lines=skip_blank_lines)

        return _read(filepath_or_buffer, kwds)

    parser_f.__name__ = name

    return parser_f


read_csv = _make_parser_function('read_csv', default_sep=',')
read_csv = Appender(_doc_read_csv_and_table.format(
    func_name='read_csv',
    summary=('Read a comma-separated values (csv) file '
             'into DataFrame.'),
    _default_sep="','")
)(read_csv)
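
# Usage sketch (illustrative, not part of the original file): the generated
# function parses delimited text from any path or buffer.
#
# >>> import pandas as pd
# >>> from pandas.compat import StringIO
# >>> pd.read_csv(StringIO("a,b\n1,2\n3,4"))
#    a  b
# 0  1  2
# 1  3  4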


read_table = _make_parser_function('read_table', default_sep='\t')
read_table = Appender(_doc_read_csv_and_table.format(
    func_name='read_table',
    summary="""Read general delimited file into DataFrame.

.. deprecated:: 0.24.0
    Use :func:`pandas.read_csv` instead, passing ``sep='\\t'`` if necessary.""",
    _default_sep=r"'\\t' (tab-stop)")
)(read_table)


def read_fwf(filepath_or_buffer, colspecs='infer', widths=None,
             infer_nrows=100, **kwds):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <http://pandas.pydata.org/pandas-docs/stable/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.csv.

        If you want to pass in a path object, pandas accepts either
        ``pathlib.Path`` or ``py._path.local.LocalPath``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.

        .. versionadded:: 0.24.0
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as a two-dimensional
        data structure with labeled axes.

    See Also
    --------
    to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, 'infer') and widths is not None:
        raise ValueError("You must specify only one of 'widths' and "
                         "'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    kwds['colspecs'] = colspecs
    kwds['infer_nrows'] = infer_nrows
    kwds['engine'] = 'python-fwf'
    return _read(filepath_or_buffer, kwds)
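
# Usage sketch (illustrative, not part of the original file): ``widths`` is
# translated into half-open ``colspecs`` intervals before delegating to
# ``_read``.
#
# >>> import pandas as pd
# >>> from pandas.compat import StringIO
# >>> pd.read_fwf(StringIO("ab 12\ncd 34"), widths=[2, 3], header=None)
#     0   1
# 0  ab  12
# 1  cd  34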


class TextFileReader(BaseIterator):
    """
    Passed dialect overrides any of the related parser options
    """

    def __init__(self, f, engine=None, **kwds):

        self.f = f

        if engine is not None:
            engine_specified = True
        else:
            engine = 'python'
            engine_specified = False

        self._engine_specified = kwds.get('engine_specified',
                                          engine_specified)

        if kwds.get('dialect') is not None:
            dialect = kwds['dialect']
            if dialect in csv.list_dialects():
                dialect = csv.get_dialect(dialect)

            # Any valid dialect should have these attributes.
            # If any are missing, we will raise automatically.
            for param in ('delimiter', 'doublequote', 'escapechar',
                          'skipinitialspace', 'quotechar', 'quoting'):
                try:
                    dialect_val = getattr(dialect, param)
                except AttributeError:
                    raise ValueError("Invalid dialect '{dialect}' provided"
                                     .format(dialect=kwds['dialect']))
                parser_default = _parser_defaults[param]
                provided = kwds.get(param, parser_default)

                # Messages for conflicting values between the dialect
                # instance and the actual parameters provided.
                conflict_msgs = []

                # Don't warn if the default parameter was passed in,
                # even if it conflicts with the dialect (gh-23761).
                if provided != parser_default and provided != dialect_val:
                    msg = ("Conflicting values for '{param}': '{val}' was "
                           "provided, but the dialect specifies '{diaval}'. "
                           "Using the dialect-specified value.".format(
                               param=param, val=provided, diaval=dialect_val))

                    # Annoying corner case for not warning about
                    # conflicts between dialect and delimiter parameter.
                    # Refer to the outer "_read_" function for more info.
                    if not (param == "delimiter" and
                            kwds.pop("sep_override", False)):
                        conflict_msgs.append(msg)

                if conflict_msgs:
                    warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
                                  stacklevel=2)
                kwds[param] = dialect_val

        if kwds.get("skipfooter"):
            if kwds.get("iterator") or kwds.get("chunksize"):
                raise ValueError("'skipfooter' not supported for 'iteration'")
            if kwds.get("nrows"):
                raise ValueError("'skipfooter' not supported with 'nrows'")

        if kwds.get('header', 'infer') == 'infer':
            kwds['header'] = 0 if kwds.get('names') is None else None

        self.orig_options = kwds

        # miscellanea
        self.engine = engine
        self._engine = None
        self._currow = 0

        options = self._get_options_with_defaults(engine)

        self.chunksize = options.pop('chunksize', None)
        self.nrows = options.pop('nrows', None)
        self.squeeze = options.pop('squeeze', False)

        # might mutate self.engine
        self.engine = self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if 'has_index_names' in kwds:
            self.options['has_index_names'] = kwds['has_index_names']

        self._make_engine(self.engine)

    def close(self):
        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options

        options = {}

        for argname, default in compat.iteritems(_parser_defaults):
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == 'mangle_dupe_cols' and not value:
                raise ValueError('Setting mangle_dupe_cols=False is '
                                 'not supported yet')
            else:
                options[argname] = value

        for argname, default in compat.iteritems(_c_parser_defaults):
            if argname in kwds:
                value = kwds[argname]

                if engine != 'c' and value != default:
                    if ('python' in engine and
                            argname not in _python_unsupported):
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            'The %r option is not supported with the'
                            ' %r engine' % (argname, engine))
            else:
                value = _deprecated_defaults.get(argname, default)
            options[argname] = value

        if engine == 'python-fwf':
            for argname, default in compat.iteritems(_fwf_defaults):
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine):
        # see gh-16530
        if is_file_like(f):
            next_attr = "__next__" if PY3 else "next"

            # The C engine doesn't need the file-like to have the "next" or
            # "__next__" attribute. However, the Python engine explicitly calls
            # "next(...)" when iterating through such an object, meaning it
            # needs to have that attribute ("next" for Python 2.x, "__next__"
            # for Python 3.x)
            if engine != "c" and not hasattr(f, next_attr):
                msg = ("The 'python' engine cannot iterate "
                       "through this file buffer.")
                raise ValueError(msg)

        return engine

    def _clean_options(self, options, engine):
        result = options.copy()

        engine_specified = self._engine_specified
        fallback_reason = None

        sep = options['delimiter']
        delim_whitespace = options['delim_whitespace']

        # C engine not supported yet
        if engine == 'c':
            if options['skipfooter'] > 0:
                fallback_reason = ("the 'c' engine does not support"
                                   " skipfooter")
                engine = 'python'

        encoding = sys.getfilesystemencoding() or 'utf-8'
        if sep is None and not delim_whitespace:
            if engine == 'c':
                fallback_reason = ("the 'c' engine does not support"
                                   " sep=None with delim_whitespace=False")
                engine = 'python'
        elif sep is not None and len(sep) > 1:
            if engine == 'c' and sep == r'\s+':
                result['delim_whitespace'] = True
                del result['delimiter']
            elif engine not in ('python', 'python-fwf'):
                # wait until regex engine integrated
                fallback_reason = ("the 'c' engine does not support"
                                   " regex separators (separators > 1 char and"
                                   r" different from '\s+' are"
                                   " interpreted as regex)")
                engine = 'python'
        elif delim_whitespace:
            if 'python' in engine:
                result['delimiter'] = r'\s+'
        elif sep is not None:
            encodeable = True
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ('python', 'python-fwf'):
                fallback_reason = ("the separator encoded in {encoding}"
                                   " is > 1 char long, and the 'c' engine"
                                   " does not support such separators"
                                   .format(encoding=encoding))
                engine = 'python'

        quotechar = options['quotechar']
        if (quotechar is not None and
                isinstance(quotechar, (str, compat.text_type, bytes))):
            if (len(quotechar) == 1 and ord(quotechar) > 127 and
                    engine not in ('python', 'python-fwf')):
                fallback_reason = ("ord(quotechar) > 127, meaning the "
                                   "quotechar is larger than one byte, "
                                   "and the 'c' engine does not support "
                                   "such quotechars")
                engine = 'python'

        if fallback_reason and engine_specified:
            raise ValueError(fallback_reason)

        if engine == 'c':
            for arg in _c_unsupported:
                del result[arg]

        if 'python' in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    msg = ("Falling back to the 'python' engine because"
                           " {reason}, but this causes {option!r} to be"
                           " ignored as it is not supported by the 'python'"
                           " engine.").format(reason=fallback_reason,
                                              option=arg)
                    raise ValueError(msg)
                del result[arg]

        if fallback_reason:
            warnings.warn(("Falling back to the 'python' engine because"
                           " {0}; you can avoid this warning by specifying"
                           " engine='python'.").format(fallback_reason),
                          ParserWarning, stacklevel=5)

        index_col = options['index_col']
        names = options['names']
        converters = options['converters']
        na_values = options['na_values']
        skiprows = options['skiprows']

        _validate_header_arg(options['header'])

        depr_warning = ''

        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]

            msg = ("The '{arg}' argument has been deprecated "
                   "and will be removed in a future version."
                   .format(arg=arg))

            if arg == 'tupleize_cols':
                msg += (' Column tuples will then '
                        'always be converted to MultiIndex.')

            if result.get(arg, depr_default) != depr_default:
                # raise Exception(result.get(arg, depr_default), depr_default)
                depr_warning += msg + '\n\n'
            else:
                result[arg] = parser_default

        if depr_warning != '':
            warnings.warn(depr_warning, FutureWarning, stacklevel=2)

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if _is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result['index_col'] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError('Type converters must be a dict or'
                                ' subclass, input was '
                                'a {0!r}'.format(type(converters).__name__))
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options['keep_default_na']
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != 'c':
            if is_integer(skiprows):
                skiprows = lrange(skiprows)
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result['names'] = names
        result['converters'] = converters
        result['na_values'] = na_values
        result['na_fvalues'] = na_fvalues
        result['skiprows'] = skiprows

        return result, engine

    def __next__(self):
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(self, engine='c'):
        if engine == 'c':
            self._engine = CParserWrapper(self.f, **self.options)
        else:
            if engine == 'python':
                klass = PythonParser
            elif engine == 'python-fwf':
                klass = FixedWidthFieldParser
            else:
                raise ValueError('Unknown engine: {engine} (valid options are'
                                 ' "c", "python", or "python-fwf")'.format(
                                     engine=engine))
            self._engine = klass(self.f, **self.options)

    def _failover_to_python(self):
        raise AbstractMethodError(self)

    def read(self, nrows=None):
        nrows = _validate_integer('nrows', nrows)
        ret = self._engine.read(nrows)

        # May alter columns / col_dict
        index, columns, col_dict = self._create_index(ret)

        if index is None:
            if col_dict:
                # Any column is actually fine:
                new_rows = len(compat.next(compat.itervalues(col_dict)))
                index = RangeIndex(self._currow, self._currow + new_rows)
            else:
                new_rows = 0
        else:
            new_rows = len(index)

        df = DataFrame(col_dict, columns=columns, index=index)

        self._currow += new_rows

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]].copy()
        return df

    def _create_index(self, ret):
        index, columns, col_dict = ret
        return index, columns, col_dict

    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)
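
# Usage sketch (illustrative, not part of the original file): iterating a
# TextFileReader repeatedly calls ``get_chunk`` and closes the reader when
# the engine raises StopIteration.
#
# >>> import pandas as pd
# >>> from pandas.compat import StringIO
# >>> reader = pd.read_csv(StringIO("a\n1\n2\n3"), chunksize=2)
# >>> [len(chunk) for chunk in reader]
# [2, 1]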


def _is_index_col(col):
    return col is not None and col is not False
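
# Usage sketch (illustrative, not part of the original file): only None and
# False are rejected, so column 0 counts as a valid index column.
#
# >>> _is_index_col(0), _is_index_col(False), _is_index_col(None)
# (True, False, False)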


def _is_potential_multi_index(columns):
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex

    Returns
    -------
    boolean : Whether or not columns could become a MultiIndex
    """
    return (len(columns) and not isinstance(columns, MultiIndex) and
            all(isinstance(c, tuple) for c in columns))
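
# Usage sketch (illustrative, not part of the original file):
#
# >>> _is_potential_multi_index([('a', 'b'), ('a', 'c')])
# True
# >>> _is_potential_multi_index(['a', 'b'])
# False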


def _evaluate_usecols(usecols, names):
    """
    Check whether or not the 'usecols' parameter
    is a callable. If so, enumerates the 'names'
    parameter and returns a set of indices for
    each entry in 'names' that evaluates to True.
    If not a callable, returns 'usecols'.
    """
    if callable(usecols):
        return {i for i, name in enumerate(names) if usecols(name)}
    return usecols
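
# Usage sketch (illustrative, not part of the original file): callables are
# mapped over the names and reduced to a set of positional indices.
#
# >>> _evaluate_usecols(lambda name: name.startswith('a'), ['ab', 'b', 'ac'])
# {0, 2}
# >>> _evaluate_usecols([0, 1], ['ab', 'b', 'ac'])
# [0, 1]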


def _validate_usecols_names(usecols, names):
    """
    Validates that all usecols are present in a given
    list of names. If not, raise a ValueError that
    shows what usecols are missing.

    Parameters
    ----------
    usecols : iterable of usecols
        The columns to validate are present in names.
    names : iterable of names
        The column names to check against.

    Returns
    -------
    usecols : iterable of usecols
        The `usecols` parameter if the validation succeeds.

    Raises
    ------
    ValueError : Columns were missing. Error message will list them.
    """
    missing = [c for c in usecols if c not in names]
    if len(missing) > 0:
        raise ValueError(
            "Usecols do not match columns, "
            "columns expected but not found: {missing}".format(missing=missing)
        )

    return usecols
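
# Usage sketch (illustrative, not part of the original file):
#
# >>> _validate_usecols_names(['a', 'b'], ['a', 'b', 'c'])
# ['a', 'b']
# >>> _validate_usecols_names(['a', 'd'], ['a', 'b', 'c'])
# Traceback (most recent call last):
#     ...
# ValueError: Usecols do not match columns, columns expected but not
# found: ['d']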


def _validate_skipfooter_arg(skipfooter):
    """
    Validate the 'skipfooter' parameter.

    Checks whether 'skipfooter' is a non-negative integer.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    skipfooter : non-negative integer
        The number of rows to skip at the end of the file.

    Returns
    -------
    validated_skipfooter : non-negative integer
        The original input if the validation succeeds.

    Raises
    ------
    ValueError : 'skipfooter' was not a non-negative integer.
    """
    if not is_integer(skipfooter):
        raise ValueError("skipfooter must be an integer")

    if skipfooter < 0:
        raise ValueError("skipfooter cannot be negative")

    return skipfooter
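
# Usage sketch (illustrative, not part of the original file):
#
# >>> _validate_skipfooter_arg(2)
# 2
# >>> _validate_skipfooter_arg(-1)
# Traceback (most recent call last):
#     ...
# ValueError: skipfooter cannot be negative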


def _validate_usecols_arg(usecols):
    """
    Validate the 'usecols' parameter.

    Checks whether or not the 'usecols' parameter contains all integers
    (column selection by index), strings (column by name) or is a callable.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    usecols : list-like, callable, or None
        List of columns to use when parsing or a callable that can be used
        to filter a list of table columns.

    Returns
    -------
    usecols_tuple : tuple
        A tuple of (verified_usecols, usecols_dtype).

        'verified_usecols' is either a set if an array-like is passed in or
        'usecols' if a callable or None is passed in.

        'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
        is passed in or None if a callable or None is passed in.
    """
    msg = ("'usecols' must either be list-like of all strings, all unicode, "
           "all integers or a callable.")

    if usecols is not None:
        if callable(usecols):
            return usecols, None

        if not is_list_like(usecols):
            # see gh-20529
            #
            # Ensure it is iterable container but not string.
            raise ValueError(msg)

        usecols_dtype = lib.infer_dtype(usecols, skipna=False)
        if usecols_dtype not in ("empty", "integer",
                                 "string", "unicode"):
            raise ValueError(msg)

        usecols = set(usecols)

        if usecols_dtype == "unicode":
            # see gh-13253
            #
            # Python 2.x compatibility
            usecols = {col.encode("utf-8") for col in usecols}

        return usecols, usecols_dtype
    return usecols, None
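
# Usage sketch (illustrative, not part of the original file): list-likes
# come back as a set plus the inferred dtype; callables and None pass
# through with dtype None.
#
# >>> _validate_usecols_arg([0, 2])
# ({0, 2}, 'integer')
# >>> _validate_usecols_arg(['a', 0])  # mixed types raise ValueError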


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the

[Listing truncated: the remainder of pandas/io/parsers.py is omitted here; see the full file in the repository linked above.]