/pandas/io/parsers.py
Python | 3719 lines | 3659 code | 21 blank | 39 comment | 153 complexity | c989477ba1887f9bd065ae6a07118f48 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- """
- Module contains tools for processing files into DataFrames or other objects
- """
- from collections import abc, defaultdict
- import csv
- import datetime
- from io import StringIO, TextIOWrapper
- import itertools
- import re
- import sys
- from textwrap import fill
- from typing import Any, Dict, Iterable, List, Set
- import warnings
- import numpy as np
- import pandas._libs.lib as lib
- import pandas._libs.ops as libops
- import pandas._libs.parsers as parsers
- from pandas._libs.parsers import STR_NA_VALUES
- from pandas._libs.tslibs import parsing
- from pandas._typing import FilePathOrBuffer
- from pandas.errors import (
- AbstractMethodError,
- EmptyDataError,
- ParserError,
- ParserWarning,
- )
- from pandas.util._decorators import Appender
- from pandas.core.dtypes.cast import astype_nansafe
- from pandas.core.dtypes.common import (
- ensure_object,
- ensure_str,
- is_bool_dtype,
- is_categorical_dtype,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_file_like,
- is_float,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
- )
- from pandas.core.dtypes.dtypes import CategoricalDtype
- from pandas.core.dtypes.missing import isna
- from pandas.core import algorithms
- from pandas.core.arrays import Categorical
- from pandas.core.frame import DataFrame
- from pandas.core.indexes.api import (
- Index,
- MultiIndex,
- RangeIndex,
- ensure_index_from_sequences,
- )
- from pandas.core.series import Series
- from pandas.core.tools import datetimes as tools
- from pandas.io.common import (
- get_filepath_or_buffer,
- get_handle,
- infer_compression,
- validate_header_arg,
- )
- from pandas.io.date_converters import generic_parser
- # BOM character (byte order mark)
- # This exists at the beginning of a file to indicate endianness
- # of a file (stream). Unfortunately, this marker screws up parsing,
- # so we need to remove it if we see it.
- _BOM = "\ufeff"
- _doc_read_csv_and_table = (
- r"""
- {summary}
- Also supports optionally iterating or breaking of the file
- into chunks.
- Additional help can be found in the online docs for
- `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
- Parameters
- ----------
- filepath_or_buffer : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
- expected. A local file could be: file://localhost/path/to/table.csv.
- If you want to pass in a path object, pandas accepts any ``os.PathLike``.
- By file-like object, we refer to objects with a ``read()`` method, such as
- a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
- sep : str, default {_default_sep}
- Delimiter to use. If sep is None, the C engine cannot automatically detect
- the separator, but the Python parsing engine can, meaning the latter will
- be used and automatically detect the separator by Python's builtin sniffer
- tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
- different from ``'\s+'`` will be interpreted as regular expressions and
- will also force the use of the Python parsing engine. Note that regex
- delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
- delimiter : str, default ``None``
- Alias for sep.
- header : int, list of int, default 'infer'
- Row number(s) to use as the column names, and the start of the
- data. Default behavior is to infer the column names: if no names
- are passed the behavior is identical to ``header=0`` and column
- names are inferred from the first line of the file, if column
- names are passed explicitly then the behavior is identical to
- ``header=None``. Explicitly pass ``header=0`` to be able to
- replace existing names. The header can be a list of integers that
- specify row locations for a multi-index on the columns
- e.g. [0,1,3]. Intervening rows that are not specified will be
- skipped (e.g. 2 in this example is skipped). Note that this
- parameter ignores commented lines and empty lines if
- ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
- data rather than the first line of the file.
- names : array-like, optional
- List of column names to use. If the file contains a header row,
- then you should explicitly pass ``header=0`` to override the column names.
- Duplicates in this list are not allowed.
- index_col : int, str, sequence of int / str, or False, default ``None``
- Column(s) to use as the row labels of the ``DataFrame``, either given as
- string name or column index. If a sequence of int / str is given, a
- MultiIndex is used.
- Note: ``index_col=False`` can be used to force pandas to *not* use the first
- column as the index, e.g. when you have a malformed file with delimiters at
- the end of each line.
- usecols : list-like or callable, optional
- Return a subset of the columns. If list-like, all elements must either
- be positional (i.e. integer indices into the document columns) or strings
- that correspond to column names provided either by the user in `names` or
- inferred from the document header row(s). For example, a valid list-like
- `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
- Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
- To instantiate a DataFrame from ``data`` with element order preserved use
- ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
- in ``['foo', 'bar']`` order or
- ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
- for ``['bar', 'foo']`` order.
- If callable, the callable function will be evaluated against the column
- names, returning names where the callable function evaluates to True. An
- example of a valid callable argument would be ``lambda x: x.upper() in
- ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
- parsing time and lower memory usage.
- squeeze : bool, default False
- If the parsed data only contains one column then return a Series.
- prefix : str, optional
- Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
- mangle_dupe_cols : bool, default True
- Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
- 'X'...'X'. Passing in False will cause data to be overwritten if there
- are duplicate names in the columns.
- dtype : Type name or dict of column -> type, optional
- Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
- 'c': 'Int64'}}
- Use `str` or `object` together with suitable `na_values` settings
- to preserve and not interpret dtype.
- If converters are specified, they will be applied INSTEAD
- of dtype conversion.
- engine : {{'c', 'python'}}, optional
- Parser engine to use. The C engine is faster while the python engine is
- currently more feature-complete.
- converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can either
- be integers or column labels.
- true_values : list, optional
- Values to consider as True.
- false_values : list, optional
- Values to consider as False.
- skipinitialspace : bool, default False
- Skip spaces after delimiter.
- skiprows : list-like, int or callable, optional
- Line numbers to skip (0-indexed) or number of lines to skip (int)
- at the start of the file.
- If callable, the callable function will be evaluated against the row
- indices, returning True if the row should be skipped and False otherwise.
- An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
- skipfooter : int, default 0
- Number of lines at bottom of file to skip (Unsupported with engine='c').
- nrows : int, optional
- Number of rows of file to read. Useful for reading pieces of large files.
- na_values : scalar, str, list-like, or dict, optional
- Additional strings to recognize as NA/NaN. If dict passed, specific
- per-column NA values. By default the following values are interpreted as
- NaN: '"""
- + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
- + """'.
- keep_default_na : bool, default True
- Whether or not to include the default NaN values when parsing the data.
- Depending on whether `na_values` is passed in, the behavior is as follows:
- * If `keep_default_na` is True, and `na_values` are specified, `na_values`
- is appended to the default NaN values used for parsing.
- * If `keep_default_na` is True, and `na_values` are not specified, only
- the default NaN values are used for parsing.
- * If `keep_default_na` is False, and `na_values` are specified, only
- the NaN values specified `na_values` are used for parsing.
- * If `keep_default_na` is False, and `na_values` are not specified, no
- strings will be parsed as NaN.
- Note that if `na_filter` is passed in as False, the `keep_default_na` and
- `na_values` parameters will be ignored.
- na_filter : bool, default True
- Detect missing value markers (empty strings and the value of na_values). In
- data without any NAs, passing na_filter=False can improve the performance
- of reading a large file.
- verbose : bool, default False
- Indicate number of NA values placed in non-numeric columns.
- skip_blank_lines : bool, default True
- If True, skip over blank lines rather than interpreting as NaN values.
- parse_dates : bool or list of int or names or list of lists or dict, \
- default False
- The behavior is as follows:
- * boolean. If True -> try parsing the index.
- * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
- each as a separate date column.
- * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
- a single date column.
- * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
- result 'foo'
- If a column or index cannot be represented as an array of datetimes,
- say because of an unparseable value or a mixture of timezones, the column
- or index will be returned unaltered as an object data type. For
- non-standard datetime parsing, use ``pd.to_datetime`` after
- ``pd.read_csv``. To parse an index or column with a mixture of timezones,
- specify ``date_parser`` to be a partially-applied
- :func:`pandas.to_datetime` with ``utc=True``. See
- :ref:`io.csv.mixed_timezones` for more.
- Note: A fast-path exists for iso8601-formatted dates.
- infer_datetime_format : bool, default False
- If True and `parse_dates` is enabled, pandas will attempt to infer the
- format of the datetime strings in the columns, and if it can be inferred,
- switch to a faster method of parsing them. In some cases this can increase
- the parsing speed by 5-10x.
- keep_date_col : bool, default False
- If True and `parse_dates` specifies combining multiple columns then
- keep the original columns.
- date_parser : function, optional
- Function to use for converting a sequence of string columns to an array of
- datetime instances. The default uses ``dateutil.parser.parser`` to do the
- conversion. Pandas will try to call `date_parser` in three different ways,
- advancing to the next if an exception occurs: 1) Pass one or more arrays
- (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
- string values from the columns defined by `parse_dates` into a single array
- and pass that; and 3) call `date_parser` once for each row using one or
- more strings (corresponding to the columns defined by `parse_dates`) as
- arguments.
- dayfirst : bool, default False
- DD/MM format dates, international and European format.
- cache_dates : bool, default True
- If True, use a cache of unique, converted dates to apply the datetime
- conversion. May produce significant speed-up when parsing duplicate
- date strings, especially ones with timezone offsets.
- .. versionadded:: 0.25.0
- iterator : bool, default False
- Return TextFileReader object for iteration or getting chunks with
- ``get_chunk()``.
- chunksize : int, optional
- Return TextFileReader object for iteration.
- See the `IO Tools docs
- <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
- for more information on ``iterator`` and ``chunksize``.
- compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
- For on-the-fly decompression of on-disk data. If 'infer' and
- `filepath_or_buffer` is path-like, then detect compression from the
- following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
- decompression). If using 'zip', the ZIP file must contain only one data
- file to be read in. Set to None for no decompression.
- thousands : str, optional
- Thousands separator.
- decimal : str, default '.'
- Character to recognize as decimal point (e.g. use ',' for European data).
- lineterminator : str (length 1), optional
- Character to break file into lines. Only valid with C parser.
- quotechar : str (length 1), optional
- The character used to denote the start and end of a quoted item. Quoted
- items can include the delimiter and it will be ignored.
- quoting : int or csv.QUOTE_* instance, default 0
- Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
- QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
- doublequote : bool, default ``True``
- When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
- whether or not to interpret two consecutive quotechar elements INSIDE a
- field as a single ``quotechar`` element.
- escapechar : str (length 1), optional
- One-character string used to escape other characters.
- comment : str, optional
- Indicates remainder of line should not be parsed. If found at the beginning
- of a line, the line will be ignored altogether. This parameter must be a
- single character. Like empty lines (as long as ``skip_blank_lines=True``),
- fully commented lines are ignored by the parameter `header` but not by
- `skiprows`. For example, if ``comment='#'``, parsing
- ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
- treated as the header.
- encoding : str, optional
- Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
- standard encodings
- <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
- dialect : str or csv.Dialect, optional
- If provided, this parameter will override values (default or not) for the
- following parameters: `delimiter`, `doublequote`, `escapechar`,
- `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
- override values, a ParserWarning will be issued. See csv.Dialect
- documentation for more details.
- error_bad_lines : bool, default True
- Lines with too many fields (e.g. a csv line with too many commas) will by
- default cause an exception to be raised, and no DataFrame will be returned.
- If False, then these "bad lines" will dropped from the DataFrame that is
- returned.
- warn_bad_lines : bool, default True
- If error_bad_lines is False, and warn_bad_lines is True, a warning for each
- "bad line" will be output.
- delim_whitespace : bool, default False
- Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
- used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
- is set to True, nothing should be passed in for the ``delimiter``
- parameter.
- low_memory : bool, default True
- Internally process the file in chunks, resulting in lower memory use
- while parsing, but possibly mixed type inference. To ensure no mixed
- types either set False, or specify the type with the `dtype` parameter.
- Note that the entire file is read into a single DataFrame regardless,
- use the `chunksize` or `iterator` parameter to return the data in chunks.
- (Only valid with C parser).
- memory_map : bool, default False
- If a filepath is provided for `filepath_or_buffer`, map the file object
- directly onto memory and access the data directly from there. Using this
- option can improve performance because there is no longer any I/O overhead.
- float_precision : str, optional
- Specifies which converter the C engine should use for floating-point
- values. The options are `None` for the ordinary converter,
- `high` for the high-precision converter, and `round_trip` for the
- round-trip converter.
- Returns
- -------
- DataFrame or TextParser
- A comma-separated values (csv) file is returned as two-dimensional
- data structure with labeled axes.
- See Also
- --------
- DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- read_fwf : Read a table of fixed-width formatted lines into DataFrame.
- Examples
- --------
- >>> pd.{func_name}('data.csv') # doctest: +SKIP
- """
- )
- def _validate_integer(name, val, min_val=0):
- """
- Checks whether the 'name' parameter for parsing is either
- an integer OR float that can SAFELY be cast to an integer
- without losing accuracy. Raises a ValueError if that is
- not the case.
- Parameters
- ----------
- name : string
- Parameter name (used for error reporting)
- val : int or float
- The value to check
- min_val : int
- Minimum allowed value (val < min_val will result in a ValueError)
- """
- msg = f"'{name:s}' must be an integer >={min_val:d}"
- if val is not None:
- if is_float(val):
- if int(val) != val:
- raise ValueError(msg)
- val = int(val)
- elif not (is_integer(val) and val >= min_val):
- raise ValueError(msg)
- return val
- def _validate_names(names):
- """
- Raise ValueError if the `names` parameter contains duplicates.
- Parameters
- ----------
- names : array-like or None
- An array containing a list of the names used for the output DataFrame.
- Raises
- ------
- ValueError
- If names are not unique.
- """
- if names is not None:
- if len(names) != len(set(names)):
- raise ValueError("Duplicate names are not allowed.")
def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
    """Generic reader of line files: resolve the input, build a
    TextFileReader, and either return it (iterator mode) or the parsed data."""
    # Normalize the encoding name: underscores become dashes, lower-cased.
    encoding = kwds.get("encoding", None)
    if encoding is not None:
        encoding = re.sub("_", "-", encoding).lower()
    kwds["encoding"] = encoding

    compression = infer_compression(
        filepath_or_buffer, kwds.get("compression", "infer")
    )

    # TODO: get_filepath_or_buffer could return
    # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
    # though mypy handling of conditional imports is difficult.
    # See https://github.com/python/mypy/issues/1297
    fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
        filepath_or_buffer, encoding, compression
    )
    kwds["compression"] = compression

    # A user-supplied date_parser implies date parsing even when
    # parse_dates was left as a bare boolean.
    if kwds.get("date_parser", None) is not None and isinstance(
        kwds["parse_dates"], bool
    ):
        kwds["parse_dates"] = True

    # Extract some of the arguments (chunksize is passed on to the reader).
    iterator = kwds.get("iterator", False)
    chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1)
    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(fp_or_buf, **kwds)

    if iterator or chunksize:
        return parser

    try:
        data = parser.read(nrows)
    finally:
        parser.close()

    if should_close:
        try:
            fp_or_buf.close()
        except ValueError:
            pass

    return data
# Default values shared by every parser engine; TextFileReader merges
# user-supplied kwargs over these in _get_options_with_defaults.
_parser_defaults = {
    # Field / quoting layout
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    # Column and index handling
    "header": "infer",
    "index_col": None,
    "names": None,
    "prefix": None,
    # Row selection
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    # NA handling
    "na_values": None,
    "keep_default_na": True,
    # Value conversion
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    # Datetime handling
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "squeeze": False,
    "compression": None,
    "mangle_dupe_cols": True,
    "infer_datetime_format": False,
    "skip_blank_lines": True,
}
# Defaults understood only by the C engine; non-default values passed to
# other engines raise in _get_options_with_defaults.
_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "error_bad_lines": True,
    "warn_bad_lines": True,
    "float_precision": None,
}
# Defaults specific to the fixed-width reader (engine="python-fwf").
_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
# Option names each engine cannot honor.
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
# Deprecation bookkeeping: per-argument "still accepted" defaults and the
# set of deprecated argument names (both currently empty).
_deprecated_defaults: Dict[str, Any] = {}
_deprecated_args: Set[str] = set()
def _make_parser_function(name, default_sep=","):
    """
    Build a ``read_csv``/``read_table``-style top-level parsing function.

    Parameters
    ----------
    name : str
        The ``__name__`` assigned to the generated function.
    default_sep : str, default ","
        Default field separator of the generated function (``","`` for
        ``read_csv``, ``"\\t"`` for ``read_table``).

    Returns
    -------
    function
        A function exposing the full reader signature; it collects every
        argument into a kwargs dict and delegates to ``_read``.
    """
    # NOTE: no docstring is set on parser_f here — the public docstring is
    # attached afterwards via the Appender decorator.
    def parser_f(
        filepath_or_buffer: FilePathOrBuffer,
        sep=default_sep,
        delimiter=None,
        # Column and Index Locations and Names
        header="infer",
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=None,
        mangle_dupe_cols=True,
        # General Parsing Configuration
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        skipfooter=0,
        nrows=None,
        # NA and Missing Data Handling
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        # Datetime Handling
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        # Iteration
        iterator=False,
        chunksize=None,
        # Quoting, Compression, and File Format
        compression="infer",
        thousands=None,
        decimal: str = ".",
        lineterminator=None,
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
        doublequote=True,
        escapechar=None,
        comment=None,
        encoding=None,
        dialect=None,
        # Error Handling
        error_bad_lines=True,
        warn_bad_lines=True,
        # Internal
        delim_whitespace=False,
        low_memory=_c_parser_defaults["low_memory"],
        memory_map=False,
        float_precision=None,
    ):
        # gh-23761
        #
        # When a dialect is passed, it overrides any of the overlapping
        # parameters passed in directly. We don't want to warn if the
        # default parameters were passed in (since it probably means
        # that the user didn't pass them in explicitly in the first place).
        #
        # "delimiter" is the annoying corner case because we alias it to
        # "sep" before doing comparison to the dialect values later on.
        # Thus, we need a flag to indicate that we need to "override"
        # the comparison to dialect values by checking if default values
        # for BOTH "delimiter" and "sep" were provided.
        if dialect is not None:
            sep_override = delimiter is None and sep == default_sep
            kwds = dict(sep_override=sep_override)
        else:
            kwds = dict()

        # Alias sep -> delimiter.
        if delimiter is None:
            delimiter = sep

        # sep/delimiter and delim_whitespace are mutually exclusive ways
        # of specifying the field separator.
        if delim_whitespace and delimiter != default_sep:
            raise ValueError(
                "Specified a delimiter with both sep and "
                "delim_whitespace=True; you can only specify one."
            )

        # Remember whether the engine was chosen explicitly: an explicit
        # choice turns later silent engine fallbacks into hard errors.
        if engine is not None:
            engine_specified = True
        else:
            engine = "c"
            engine_specified = False

        kwds.update(
            delimiter=delimiter,
            engine=engine,
            dialect=dialect,
            compression=compression,
            engine_specified=engine_specified,
            doublequote=doublequote,
            escapechar=escapechar,
            quotechar=quotechar,
            quoting=quoting,
            skipinitialspace=skipinitialspace,
            lineterminator=lineterminator,
            header=header,
            index_col=index_col,
            names=names,
            prefix=prefix,
            skiprows=skiprows,
            skipfooter=skipfooter,
            na_values=na_values,
            true_values=true_values,
            false_values=false_values,
            keep_default_na=keep_default_na,
            thousands=thousands,
            comment=comment,
            decimal=decimal,
            parse_dates=parse_dates,
            keep_date_col=keep_date_col,
            dayfirst=dayfirst,
            date_parser=date_parser,
            cache_dates=cache_dates,
            nrows=nrows,
            iterator=iterator,
            chunksize=chunksize,
            converters=converters,
            dtype=dtype,
            usecols=usecols,
            verbose=verbose,
            encoding=encoding,
            squeeze=squeeze,
            memory_map=memory_map,
            float_precision=float_precision,
            na_filter=na_filter,
            delim_whitespace=delim_whitespace,
            warn_bad_lines=warn_bad_lines,
            error_bad_lines=error_bad_lines,
            low_memory=low_memory,
            mangle_dupe_cols=mangle_dupe_cols,
            infer_datetime_format=infer_datetime_format,
            skip_blank_lines=skip_blank_lines,
        )

        return _read(filepath_or_buffer, kwds)

    parser_f.__name__ = name

    return parser_f
# read_csv and read_table share one implementation; only the default
# separator and the formatted docstring (attached via Appender) differ.
read_csv = _make_parser_function("read_csv", default_sep=",")
read_csv = Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
    )
)(read_csv)

read_table = _make_parser_function("read_table", default_sep="\t")
read_table = Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
    )
)(read_table)
def read_fwf(
    filepath_or_buffer: FilePathOrBuffer,
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer'. optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.

        .. versionadded:: 0.24.0
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Argument sanity checks: exactly one source of column specifications.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    if widths is not None and colspecs not in (None, "infer"):
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    if widths is not None:
        # Translate contiguous field widths into half-open (start, stop)
        # intervals via a running sum of the widths.
        edges = list(itertools.accumulate(widths, initial=0))
        colspecs = list(zip(edges, edges[1:]))

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)
- class TextFileReader(abc.Iterator):
- """
- Passed dialect overrides any of the related parser options
- """
    def __init__(self, f, engine=None, **kwds):
        """
        Set up the reader: resolve the engine, reconcile any csv dialect
        with explicit keyword options, validate option combinations, and
        construct the underlying parser engine.

        Parameters
        ----------
        f : path-like or file-like
            The input to parse.
        engine : str, optional
            Parser engine name; defaults to "python" when not given here
            (the public read_* functions default to "c" before reaching
            this constructor).
        **kwds
            Parser options; see the read_csv documentation.
        """

        self.f = f

        # Track whether the caller chose the engine explicitly; explicit
        # choices turn later engine fallbacks into hard errors.
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        if kwds.get("dialect") is not None:
            dialect = kwds["dialect"]
            # A dialect may be given by name (registered with the csv
            # module) or as a csv.Dialect instance.
            if dialect in csv.list_dialects():
                dialect = csv.get_dialect(dialect)

            # Any valid dialect should have these attributes.
            # If any are missing, we will raise automatically.
            for param in (
                "delimiter",
                "doublequote",
                "escapechar",
                "skipinitialspace",
                "quotechar",
                "quoting",
            ):
                try:
                    dialect_val = getattr(dialect, param)
                except AttributeError as err:
                    raise ValueError(
                        f"Invalid dialect {kwds['dialect']} provided"
                    ) from err
                parser_default = _parser_defaults[param]
                provided = kwds.get(param, parser_default)

                # Messages for conflicting values between the dialect
                # instance and the actual parameters provided.
                conflict_msgs = []

                # Don't warn if the default parameter was passed in,
                # even if it conflicts with the dialect (gh-23761).
                if provided != parser_default and provided != dialect_val:
                    msg = (
                        f"Conflicting values for '{param}': '{provided}' was "
                        f"provided, but the dialect specifies '{dialect_val}'. "
                        "Using the dialect-specified value."
                    )

                    # Annoying corner case for not warning about
                    # conflicts between dialect and delimiter parameter.
                    # Refer to the outer "_read_" function for more info.
                    if not (param == "delimiter" and kwds.pop("sep_override", False)):
                        conflict_msgs.append(msg)

                if conflict_msgs:
                    warnings.warn(
                        "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
                    )
                # The dialect always wins for these parameters.
                kwds[param] = dialect_val

        # skipfooter is incompatible with chunked/iterative reading and
        # with nrows, since the footer position is unknown mid-stream.
        if kwds.get("skipfooter"):
            if kwds.get("iterator") or kwds.get("chunksize"):
                raise ValueError("'skipfooter' not supported for 'iteration'")
            if kwds.get("nrows"):
                raise ValueError("'skipfooter' not supported with 'nrows'")

        # Resolve header="infer": first row is the header unless explicit
        # names were supplied.
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self.engine = engine
        self._engine = None
        self._currow = 0

        options = self._get_options_with_defaults(engine)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)
        self.squeeze = options.pop("squeeze", False)

        # might mutate self.engine
        self.engine = self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self._make_engine(self.engine)
    def close(self):
        """Close the underlying parser engine and release its resources."""
        self._engine.close()
    def _get_options_with_defaults(self, engine):
        """
        Merge the user-supplied options (``self.orig_options``) over the
        module-level defaults, validating engine-specific options.

        Parameters
        ----------
        engine : str
            One of "c", "python" or "python-fwf".

        Returns
        -------
        dict
            The fully-populated options mapping for the given engine.

        Raises
        ------
        ValueError
            If mangle_dupe_cols=False is requested (unsupported), or a
            C-engine-only option is set to a non-default value for a
            different engine.
        """
        kwds = self.orig_options

        options = {}

        for argname, default in _parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == "mangle_dupe_cols" and not value:
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                # Non-default C-engine options are only an error when the
                # chosen engine cannot honor them.
                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(argname, default)
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options
- def _check_file_or_buffer(self, f, engine):
- # see gh-16530
- if is_file_like(f):
- next_attr = "__next__"
- # The C engine doesn't need the file-like to have the "next" or
- # "__next__" attribute. However, the Python engine explicitly calls
- # "next(...)" when iterating through such an object, meaning it
- # needs to have that attribute ("next" for Python 2.x, "__next__"
- # for Python 3.x)
- if engine != "c" and not hasattr(f, next_attr):
- msg = "The 'python' engine cannot iterate through this file buffer."
- raise ValueError(msg)
- return engine
    def _clean_options(self, options, engine):
        """
        Validate the resolved options and pick the engine that can honor them.

        Parameters
        ----------
        options : dict
            Fully-defaulted option mapping (from _get_options_with_defaults).
        engine : str
            Requested engine: "c", "python" or "python-fwf".

        Returns
        -------
        tuple of (dict, str)
            The cleaned option dict and the (possibly downgraded) engine.

        Raises
        ------
        ValueError
            If a fallback is needed but the engine was explicitly specified,
            or for invalid index_col/usecols-style inputs.
        TypeError
            If ``converters`` is not a dict.
        """
        result = options.copy()

        engine_specified = self._engine_specified
        # Set whenever the C engine cannot honor the options and we must
        # downgrade to the python engine.
        fallback_reason = None

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        encoding = sys.getfilesystemencoding() or "utf-8"
        if sep is None and not delim_whitespace:
            if engine == "c":
                fallback_reason = (
                    "the 'c' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                # The C engine expresses r"\s+" natively via delim_whitespace.
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    "the 'c' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            # A separator that encodes to more than one byte cannot be used
            # by the C engine.
            encodeable = True
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    "is > 1 char long, and the 'c' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    "and the 'c' engine does not support such quotechars"
                )
                engine = "python"

        # An explicitly requested engine is never silently overridden.
        if fallback_reason and engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=5,
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        # Aggregate one FutureWarning for every deprecated option given a
        # non-default value; reset untouched ones to the parser default.
        depr_warning = ""
        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]

            msg = (
                f"The {repr(arg)} argument has been deprecated and will be "
                "removed in a future version."
            )

            if result.get(arg, depr_default) != depr_default:
                depr_warning += msg + "\n\n"
            else:
                result[arg] = parser_default

        if depr_warning != "":
            warnings.warn(depr_warning, FutureWarning, stacklevel=2)

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        # Normalize a scalar index_col to a one-element list.
        if _is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != "c":
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine
- def __next__(self):
- try:
- return self.get_chunk()
- except StopIteration:
- self.close()
- raise
- def _make_engine(self, engine="c"):
- if engine == "c":
- self._engine = CParserWrapper(self.f, **self.options)
- else:
- if engine == "python":
- klass = PythonParser
- elif engine == "python-fwf":
- klass = FixedWidthFieldParser
- else:
- raise ValueError(
- f"Unknown engine: {engine} (valid options "
- 'are "c", "python", or "python-fwf")'
- )
- self._engine = klass(self.f, **self.options)
    def _failover_to_python(self):
        """Abstract hook for engine fallback; subclasses must implement it."""
        raise AbstractMethodError(self)
- def read(self, nrows=None):
- nrows = _validate_integer("nrows", nrows)
- ret = self._engine.read(nrows)
- # May alter columns / col_dict
- index, columns, col_dict = self._create_index(ret)
- if index is None:
- if col_dict:
- # Any column is actually fine:
- new_rows = len(next(iter(col_dict.values())))
- index = RangeIndex(self._currow, self._currow + new_rows)
- else:
- new_rows = 0
- else:
- new_rows = len(index)
- df = DataFrame(col_dict, columns=columns, index=index)
- self._currow += new_rows
- if self.squeeze and len(df.columns) == 1:
- return df[df.columns[0]].copy()
- return df
- def _create_index(self, ret):
- index, columns, col_dict = ret
- return index, columns, col_dict
- def get_chunk(self, size=None):
- if size is None:
- size = self.chunksize
- if self.nrows is not None:
- if self._currow >= self.nrows:
- raise StopIteration
- size = min(size, self.nrows - self._currow)
- return self.read(nrows=size)
- def _is_index_col(col):
- return col is not None and col is not False
- def _is_potential_multi_index(columns):
- """
- Check whether or not the `columns` parameter
- could be converted into a MultiIndex.
- Parameters
- ----------
- columns : array-like
- Object which may or may not be convertible into a MultiIndex
- Returns
- -------
- boolean : Whether or not columns could become a MultiIndex
- """
- return (
- len(columns)
- and not isinstance(columns, MultiIndex)
- and all(isinstance(c, tuple) for c in columns)
- )
- def _evaluate_usecols(usecols, names):
- """
- Check whether or not the 'usecols' parameter
- is a callable. If so, enumerates the 'names'
- parameter and returns a set of indices for
- each entry in 'names' that evaluates to True.
- If not a callable, returns 'usecols'.
- """
- if callable(usecols):
- return {i for i, name in enumerate(names) if usecols(name)}
- return usecols
- def _validate_usecols_names(usecols, names):
- """
- Validates that all usecols are present in a given
- list of names. If not, raise a ValueError that
- shows what usecols are missing.
- Parameters
- ----------
- usecols : iterable of usecols
- The columns to validate are present in names.
- names : iterable of names
- The column names to check against.
- Returns
- -------
- usecols : iterable of usecols
- The `usecols` parameter if the validation succeeds.
- Raises
- ------
- ValueError : Columns were missing. Error message will list them.
- """
- missing = [c for c in usecols if c not in names]
- if len(missing) > 0:
- raise ValueError(
- f"Usecols do not match columns, columns expected but not found: {missing}"
- )
- return usecols
- def _validate_skipfooter_arg(skipfooter):
- """
- Validate the 'skipfooter' parameter.
- Checks whether 'skipfooter' is a non-negative integer.
- Raises a ValueError if that is not the case.
- Parameters
- ----------
- skipfooter : non-negative integer
- The number of rows to skip at the end of the file.
- Returns
- -------
- validated_skipfooter : non-negative integer
- The original input if the validation succeeds.
- Raises
- ------
- ValueError : 'skipfooter' was not a non-negative integer.
- """
- if not is_integer(skipfooter):
- raise ValueError("skipfooter must be an integer")
- if skipfooter < 0:
- raise ValueError("skipfooter cannot be negative")
- return skipfooter
- def _validate_usecols_arg(usecols):
- """
- Validate the 'usecols' parameter.
- Checks whether or not the 'usecols' parameter contains all integers
- (column selection by index), strings (column by name) or is a callable.
- Raises a ValueError if that is not the case.
- Parameters
- ----------
- usecols : list-like, callable, or None
- List of columns to use when parsing or a callable that can be used
- to filter a list of table columns.
- Returns
- -------
- usecols_tuple : tuple
- A tuple of (verified_usecols, usecols_dtype).
- 'verified_usecols' is either a set if an array-like is passed in or
- 'usecols' if a callable or None is passed in.
- 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
- is passed in or None if a callable or None is passed in.
- """
- msg = (
- "'usecols' must either be list-like of all strings, all unicode, "
- "all integers or a callable."
- )
- if usecols is not None:
- if callable(usecols):
- return usecols, None
- if not is_list_like(usecols):
- # see gh-20529
- #
- # Ensure it is iterable container but not string.
- raise ValueError(msg)
- usecols_dtype = lib.infer_dtype(usecols, skipna=False)
- if usecols_dtype not in ("empty", "integer", "string"):
- raise ValueError(msg)
- usecols = set(usecols)
- return usecols, usecols_dtype
- return usecols, None
- def _validate_parse_dates_arg(parse_dates):
- """
- Check whether or not the 'parse_dates' parameter
- is a non-boolean scalar. Raises a ValueError if
- that is the case.
- """
- msg = (
- "Only booleans, lists, and dictionaries are accepted "
- "for the 'parse_dates' parameter"
- )
- if parse_dates is not None:
- if is_scalar(parse_dates):
- if not lib.is_bool(parse_dates):
- raise TypeError(msg)
- elif not isinstance(parse_dates, (list, dict)):
- raise TypeError(msg)
- return parse_dates
- class ParserBase:
- def __init__(self, kwds):
- self.names = kwds.get("names")
- self.orig_names = None
- self.prefix = kwds.pop("prefix", None)
- self.index_col = kwds.get("index_col", None)
- self.unnamed_cols = set()
- self.index_names = None
- self.col_names = None
- self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
- self.date_parser = kwds.pop("date_parser", None)
- self.dayfirst = kwds.pop("dayfirst", False)
- self.keep_date_col = kwds.pop("keep_date_col", False)
- self.na_values = kwds.get("na_values")
- self.na_fvalues = kwds.get("na_fvalues")
- self.na_filter = kwds.get("na_filter", False)
- self.keep_default_na = kwds.get("keep_default_na", True)
- self.true_values = kwds.get("true_values")
- self.false_values = kwds.get("false_values")
- self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
- self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
- self.cache_dates = kwds.pop("cache_dates", True)
- self._date_conv = _make_date_converter(
- date_parser=self.date_parser,
- dayfirst=self.dayfirst,
- infer_datetime_format=self.infer_datetime_format,
- cache_dates=self.cache_dates,
- )
- # validate header options for mi
- self.header = kwds.get("header")
- if isinstance(self.header, (list, tuple, np.ndarray)):
- if not all(map(is_integer, self.header)):
- raise ValueError("header must be integer or list of integers")
- if any(i < 0 for i in self.header):
- raise ValueError(
- "cannot specify multi-index header with negative integers"
- )
- if kwds.get("usecols"):
- raise ValueError(
- "cannot specify usecols when specifying a multi-index header"
- )
- if kwds.get("names"):
- raise ValueError(
- "cannot specify names when specifying a multi-index header"
- )
- # validate index_col that only…
Large files files are truncated, but you can click here to view the full file