| """
 | |
| Module contains tools for processing files into DataFrames or other objects
 | |
| 
 | |
| GH#48849 provides a convenient way of deprecating keyword arguments
 | |
| """
 | |
| from __future__ import annotations
 | |
| 
 | |
| from collections import (
 | |
|     abc,
 | |
|     defaultdict,
 | |
| )
 | |
| import csv
 | |
| import sys
 | |
| from textwrap import fill
 | |
| from typing import (
 | |
|     IO,
 | |
|     TYPE_CHECKING,
 | |
|     Any,
 | |
|     Callable,
 | |
|     Literal,
 | |
|     NamedTuple,
 | |
|     TypedDict,
 | |
|     overload,
 | |
| )
 | |
| import warnings
 | |
| 
 | |
| import numpy as np
 | |
| 
 | |
| from pandas._config import using_copy_on_write
 | |
| 
 | |
| from pandas._libs import lib
 | |
| from pandas._libs.parsers import STR_NA_VALUES
 | |
| from pandas.errors import (
 | |
|     AbstractMethodError,
 | |
|     ParserWarning,
 | |
| )
 | |
| from pandas.util._decorators import Appender
 | |
| from pandas.util._exceptions import find_stack_level
 | |
| from pandas.util._validators import check_dtype_backend
 | |
| 
 | |
| from pandas.core.dtypes.common import (
 | |
|     is_file_like,
 | |
|     is_float,
 | |
|     is_hashable,
 | |
|     is_integer,
 | |
|     is_list_like,
 | |
|     pandas_dtype,
 | |
| )
 | |
| 
 | |
| from pandas import Series
 | |
| from pandas.core.frame import DataFrame
 | |
| from pandas.core.indexes.api import RangeIndex
 | |
| from pandas.core.shared_docs import _shared_docs
 | |
| 
 | |
| from pandas.io.common import (
 | |
|     IOHandles,
 | |
|     get_handle,
 | |
|     stringify_path,
 | |
|     validate_header_arg,
 | |
| )
 | |
| from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
 | |
| from pandas.io.parsers.base_parser import (
 | |
|     ParserBase,
 | |
|     is_index_col,
 | |
|     parser_defaults,
 | |
| )
 | |
| from pandas.io.parsers.c_parser_wrapper import CParserWrapper
 | |
| from pandas.io.parsers.python_parser import (
 | |
|     FixedWidthFieldParser,
 | |
|     PythonParser,
 | |
| )
 | |
| 
 | |
| if TYPE_CHECKING:
 | |
|     from collections.abc import (
 | |
|         Hashable,
 | |
|         Iterable,
 | |
|         Mapping,
 | |
|         Sequence,
 | |
|     )
 | |
|     from types import TracebackType
 | |
| 
 | |
|     from pandas._typing import (
 | |
|         CompressionOptions,
 | |
|         CSVEngine,
 | |
|         DtypeArg,
 | |
|         DtypeBackend,
 | |
|         FilePath,
 | |
|         IndexLabel,
 | |
|         ReadCsvBuffer,
 | |
|         Self,
 | |
|         StorageOptions,
 | |
|         UsecolsArgType,
 | |
|     )
 | |
_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Character or regex pattern to treat as the delimiter. If ``sep=None``, the
    C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and automatically detect the separator from only the first valid
    row of the file by Python's builtin sniffer tool, ``csv.Sniffer``.
    In addition, separators longer than 1 character and different from
    ``'\s+'`` will be interpreted as regular expressions and will also force
    the use of the Python parsing engine. Note that regex delimiters are prone
    to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, optional
    Alias for ``sep``.
header : int, Sequence of int, 'infer' or None, default 'infer'
    Row number(s) containing column labels and marking the start of the
    data (zero-indexed). Default behavior is to infer the column names: if no
    ``names`` are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file; if column
    names are passed explicitly to ``names`` then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a :class:`~pandas.MultiIndex` on the columns
    e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : Sequence of Hashable, optional
    Sequence of column labels to apply. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : Hashable, Sequence of Hashable or False, optional
    Column(s) to use as row label(s), denoted either by column labels or column
    indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex`
    will be formed for the row labels.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g., when you have a malformed file with delimiters at
    the end of each line.
usecols : Sequence of Hashable or Callable, optional
    Subset of columns to select, denoted either by column labels or column indices.
    If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in ``names`` or
    inferred from the document header row(s). If ``names`` are given, the document
    header row(s) are not taken into account. For example, a valid list-like
    ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order
    preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]``
    for columns in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to ``True``. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
dtype : dtype or dict of {{Hashable : dtype}}, optional
    Data type(s) to apply to either the whole dataset or individual columns.
    E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}``
    Use ``str`` or ``object`` together with suitable ``na_values`` settings
    to preserve and not interpret ``dtype``.
    If ``converters`` are specified, they will be applied INSTEAD
    of ``dtype`` conversion.

    .. versionadded:: 1.5.0

        Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where
        the default determines the ``dtype`` of the columns which are not explicitly
        listed.
engine : {{'c', 'python', 'pyarrow'}}, optional
    Parser engine to use. The C and pyarrow engines are faster, while the python engine
    is currently more feature-complete. Multithreading is currently only supported by
    the pyarrow engine.

    .. versionadded:: 1.4.0

        The 'pyarrow' engine was added as an *experimental* engine, and some features
        are unsupported, or may not work correctly, with this engine.
converters : dict of {{Hashable : Callable}}, optional
    Functions for converting values in specified columns. Keys can either
    be column labels or column indices.
true_values : list, optional
    Values to consider as ``True`` in addition to case-insensitive variants of 'True'.
false_values : list, optional
    Values to consider as ``False`` in addition to case-insensitive variants of 'False'.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : int, list of int or Callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (``int``)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning ``True`` if the row should be skipped and ``False`` otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional
    Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
    per-column ``NA`` values. By default the following values are interpreted as
    ``NaN``: " """
    + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """ ".

keep_default_na : bool, default True
    Whether or not to include the default ``NaN`` values when parsing the data.
    Depending on whether ``na_values`` is passed in, the behavior is as follows:

    * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values``
      is appended to the default ``NaN`` values used for parsing.
    * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only
      the default ``NaN`` values are used for parsing.
    * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only
      the ``NaN`` values specified in ``na_values`` are used for parsing.
    * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no
      strings will be parsed as ``NaN``.

    Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and
    ``na_values`` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of ``na_values``). In
    data without any ``NA`` values, passing ``na_filter=False`` can improve the
    performance of reading a large file.
verbose : bool, default False
    Indicate number of ``NA`` values placed in non-numeric columns.

    .. deprecated:: 2.2.0
skip_blank_lines : bool, default True
    If ``True``, skip over blank lines rather than interpreting as ``NaN`` values.
parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \
default False
    The behavior is as follows:

    * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to
      ``True`` if ``date_format`` or ``date_parser`` arguments have been passed.
    * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3
      each as a separate date column.
    * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse
      as a single date column. Values are joined with a space before parsing.
    * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call
      result 'foo'. Values are joined with a space before parsing.

    If a column or index cannot be represented as an array of ``datetime``,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an ``object`` data type. For
    non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after
    :func:`~pandas.read_csv`.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the
    format of the ``datetime`` strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.

    .. deprecated:: 2.0.0
        A strict version of this argument is now the default, passing it has no effect.

keep_date_col : bool, default False
    If ``True`` and ``parse_dates`` specifies combining multiple columns then
    keep the original columns.
date_parser : Callable, optional
    Function to use for converting a sequence of string columns to an array of
    ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. pandas will try to call ``date_parser`` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by ``parse_dates`` into a single array
    and pass that; and 3) call ``date_parser`` once for each row using one or
    more strings (corresponding to the columns defined by ``parse_dates``) as
    arguments.

    .. deprecated:: 2.0.0
       Use ``date_format`` instead, or read in as ``object`` and then apply
       :func:`~pandas.to_datetime` as-needed.
date_format : str or dict of column -> format, optional
    Format to use for parsing dates when used in conjunction with ``parse_dates``.
    The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
    `strftime documentation
    <https://docs.python.org/3/library/datetime.html
    #strftime-and-strptime-behavior>`_ for more information on choices, though
    note that :const:`"%f"` will parse all the way up to nanoseconds.
    You can also pass:

    - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
      time string (not necessarily in exactly the same format);
    - "mixed", to infer the format for each element individually. This is risky,
      and you should probably use it along with `dayfirst`.

    .. versionadded:: 2.0.0
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If ``True``, use a cache of unique, converted dates to apply the ``datetime``
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

iterator : bool, default False
    Return ``TextFileReader`` object for iteration or getting chunks with
    ``get_chunk()``.
chunksize : int, optional
    Number of lines to read from the file per chunk. Passing a value will cause the
    function to return a ``TextFileReader`` object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.

{decompression_options}

    .. versionchanged:: 1.4.0 Zstandard support.

thousands : str (length 1), optional
    Character acting as the thousands separator in numerical values.
decimal : str (length 1), default '.'
    Character to recognize as decimal point (e.g., use ',' for European data).
lineterminator : str (length 1), optional
    Character used to denote a line break. Only valid with C parser.
quotechar : str (length 1), optional
    Character used to denote the start and end of a quoted item. Quoted
    items can include the ``delimiter`` and it will be ignored.
quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \
3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is
    ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special
    characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``,
    or ``lineterminator``).
doublequote : bool, default True
    When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive ``quotechar`` elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    Character used to escape other characters.
comment : str (length 1), optional
    Character indicating that the remainder of the line should not be parsed.
    If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter ``header`` but not by
    ``skiprows``. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being
    treated as the header.
encoding : str, optional, default 'utf-8'
    Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

encoding_errors : str, optional, default 'strict'
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0

dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: ``delimiter``, ``doublequote``, ``escapechar``,
    ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to
    override values, a ``ParserWarning`` will be issued. See ``csv.Dialect``
    documentation for more details.
on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many fields).
    Allowed values are:

    - ``'error'``, raise an Exception when a bad line is encountered.
    - ``'warn'``, raise a warning when a bad line is encountered and skip that line.
    - ``'skip'``, skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0

    .. versionadded:: 1.4.0

        - Callable, function with signature
          ``(bad_line: list[str]) -> list[str] | None`` that will process a single
          bad line. ``bad_line`` is a list of strings split by the ``sep``.
          If the function returns ``None``, the bad line will be ignored.
          If the function returns a new ``list`` of strings with more elements than
          expected, a ``ParserWarning`` will be emitted while dropping extra elements.
          Only supported when ``engine='python'``.

    .. versionchanged:: 2.2.0

        - Callable, function with signature
          as described in `pyarrow documentation
          <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
          #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'``

delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
    used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
    is set to ``True``, nothing should be passed in for the ``delimiter``
    parameter.

    .. deprecated:: 2.2.0
        Use ``sep="\\s+"`` instead.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set ``False``, or specify the type with the ``dtype`` parameter.
    Note that the entire file is read into a single :class:`~pandas.DataFrame`
    regardless; use the ``chunksize`` or ``iterator`` parameter to return the data in
    chunks. (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for ``filepath_or_buffer``, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : {{'high', 'legacy', 'round_trip'}}, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or ``'high'`` for the ordinary converter,
    ``'legacy'`` for the original lower precision pandas converter, and
    ``'round_trip'`` for the round-trip converter.

{storage_options}

dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
    Back-end data type applied to the resultant :class:`DataFrame`
    (still experimental). Behaviour is as follows:

    * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
      (default).
    * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
      DataFrame.

    .. versionadded:: 2.0

Returns
-------
DataFrame or TextFileReader
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
{see_also_func_name} : {see_also_func_summary}
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)
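
# Illustrative usage of the ``iterator``/``chunksize`` API documented above
# (a sketch, not executed at import; ``process`` is a hypothetical user
# function):
#
#     with pd.read_csv("data.csv", chunksize=1000) as reader:
#         for chunk in reader:
#             process(chunk)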


class _C_Parser_Defaults(TypedDict):
    delim_whitespace: Literal[False]
    na_filter: Literal[True]
    low_memory: Literal[True]
    memory_map: Literal[False]
    float_precision: None


_c_parser_defaults: _C_Parser_Defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}


class _Fwf_Defaults(TypedDict):
    colspecs: Literal["infer"]
    infer_nrows: Literal[100]
    widths: None


_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "iterator",
    "dayfirst",
    "verbose",
    "skipinitialspace",
    "low_memory",
}
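
# Illustration of the set above (a sketch, not executed at import): requesting
# an unsupported option together with ``engine="pyarrow"`` raises, e.g.
#
#     pd.read_csv("data.csv", engine="pyarrow", chunksize=1000)  # ValueError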


class _DeprecationConfig(NamedTuple):
    default_value: Any
    msg: str | None


@overload
def validate_integer(name: str, val: None, min_val: int = ...) -> None:
    ...


@overload
def validate_integer(name: str, val: float, min_val: int = ...) -> int:
    ...


@overload
def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None:
    ...


def validate_integer(
    name: str, val: int | float | None, min_val: int = 0
) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR a float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"
    if is_float(val):
        if int(val) != val:
            raise ValueError(msg)
        val = int(val)
    # after any float -> int coercion, the value must still satisfy min_val
    if not (is_integer(val) and val >= min_val):
        raise ValueError(msg)

    return int(val)
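
# Illustration (not executed at import): integral floats are accepted, lossy
# or out-of-range values are rejected:
#
#     validate_integer("nrows", 3.0)       # returns 3
#     validate_integer("nrows", 3.5)       # raises ValueError
#     validate_integer("chunksize", 0, 1)  # raises ValueError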


def _validate_names(names: Sequence[Hashable] | None) -> None:
    """
    Raise ValueError if the `names` parameter contains duplicates or has an
    invalid data type.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
    """
    if names is not None:
        if len(names) != len(set(names)):
            raise ValueError("Duplicate names are not allowed.")
        if not (
            is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
        ):
            raise ValueError("Names should be an ordered collection.")


def _read(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
    """Generic reader of line files."""
    # if we pass a date_parser and parse_dates=False, we should not parse the
    # dates GH#44366
    if kwds.get("parse_dates", None) is None:
        if (
            kwds.get("date_parser", lib.no_default) is lib.no_default
            and kwds.get("date_format", None) is None
        ):
            kwds["parse_dates"] = False
        else:
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = kwds.get("chunksize", None)
    if kwds.get("engine") == "pyarrow":
        if iterator:
            raise ValueError(
                "The 'iterator' option is not supported with the 'pyarrow' engine"
            )

        if chunksize is not None:
            raise ValueError(
                "The 'chunksize' option is not supported with the 'pyarrow' engine"
            )
    else:
        chunksize = validate_integer("chunksize", chunksize, 1)

    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        return parser

    with parser:
        return parser.read(nrows)
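
# Illustration of the default resolution above (a sketch, not executed at
# import): with no explicit ``parse_dates``, passing ``date_format`` makes
# ``_read`` set ``parse_dates=True``, i.e. the index is parsed as dates:
#
#     pd.read_csv("data.csv", index_col=0, date_format="%Y-%m-%d")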


# iterator=True -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# default case -> DataFrame
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        see_also_func_name="read_table",
        see_also_func_summary="Read general delimited file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] | None = None,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
            "columns after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        keep_date_col = False

    if lib.is_list_like(parse_dates):
        # GH#55569
        depr = False
        # error: Item "bool" of "bool | Sequence[Hashable] | None" has no
        # attribute "__iter__" (not iterable)
        if not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
            depr = True
        elif isinstance(parse_dates, dict) and any(
            lib.is_list_like(x) for x in parse_dates.values()
        ):
            depr = True
        if depr:
            warnings.warn(
                "Support for nested sequences for 'parse_dates' in pd.read_csv "
                "is deprecated. Combine the desired columns with pd.to_datetime "
                "after parsing instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if delim_whitespace is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Use ``sep='\\s+'`` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        delim_whitespace = False

    if verbose is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'verbose' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        verbose = False

    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": ","},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)
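
# Sketch of the migration suggested by the deprecation warnings above
# (hypothetical column names "year", "month" and "day"):
#
#     df = pd.read_csv("data.csv")
#     df["date"] = pd.to_datetime(df[["year", "month", "day"]])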


# iterator=True -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# default -> DataFrame
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        see_also_func_name="read_csv",
        see_also_func_summary=(
            "Read a comma-separated values (csv) file into DataFrame."
        ),
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] = False,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_table is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
 | |
|             "columns after parsing instead.",
 | |
|             FutureWarning,
 | |
|             stacklevel=find_stack_level(),
 | |
|         )
 | |
|     else:
 | |
|         keep_date_col = False
 | |
| 
 | |
|     # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__"
 | |
|     if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
 | |
|         # GH#55569
 | |
|         warnings.warn(
 | |
|             "Support for nested sequences for 'parse_dates' in pd.read_table "
 | |
|             "is deprecated. Combine the desired columns with pd.to_datetime "
 | |
|             "after parsing instead.",
 | |
|             FutureWarning,
 | |
|             stacklevel=find_stack_level(),
 | |
|         )
 | |
| 
 | |
|     if infer_datetime_format is not lib.no_default:
 | |
|         warnings.warn(
 | |
|             "The argument 'infer_datetime_format' is deprecated and will "
 | |
|             "be removed in a future version. "
 | |
|             "A strict version of it is now the default, see "
 | |
|             "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
 | |
|             "You can safely remove this argument.",
 | |
|             FutureWarning,
 | |
|             stacklevel=find_stack_level(),
 | |
|         )
 | |
| 
 | |
|     if delim_whitespace is not lib.no_default:
 | |
|         # GH#55569
 | |
|         warnings.warn(
 | |
|             "The 'delim_whitespace' keyword in pd.read_table is deprecated and "
 | |
|             "will be removed in a future version. Use ``sep='\\s+'`` instead",
 | |
|             FutureWarning,
 | |
|             stacklevel=find_stack_level(),
 | |
|         )
 | |
|     else:
 | |
|         delim_whitespace = False
 | |
| 
 | |
|     if verbose is not lib.no_default:
 | |
|         # GH#55569
 | |
|         warnings.warn(
 | |
|             "The 'verbose' keyword in pd.read_table is deprecated and "
 | |
|             "will be removed in a future version.",
 | |
|             FutureWarning,
 | |
|             stacklevel=find_stack_level(),
 | |
|         )
 | |
|     else:
 | |
|         verbose = False
 | |
| 
 | |
|     # locals() should never be modified
 | |
|     kwds = locals().copy()
 | |
|     del kwds["filepath_or_buffer"]
 | |
|     del kwds["sep"]
 | |
| 
 | |
|     kwds_defaults = _refine_defaults_read(
 | |
|         dialect,
 | |
|         delimiter,
 | |
|         delim_whitespace,
 | |
|         engine,
 | |
|         sep,
 | |
|         on_bad_lines,
 | |
|         names,
 | |
|         defaults={"delimiter": "\t"},
 | |
|         dtype_backend=dtype_backend,
 | |
|     )
 | |
|     kwds.update(kwds_defaults)
 | |
| 
 | |
|     return _read(filepath_or_buffer, kwds)
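

# A minimal usage sketch (comments only, so nothing runs at import time;
# "data.tsv" is a hypothetical file name used purely for illustration).
# ``read_table`` is ``read_csv`` with a tab as the default separator, so
#
#     pd.read_table("data.tsv")
#
# should be equivalent to
#
#     pd.read_csv("data.tsv", sep="\t")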


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    **kwds,
) -> TextFileReader:
    ...


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: bool = ...,
    chunksize: int,
    **kwds,
) -> TextFileReader:
    ...


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    **kwds,
) -> DataFrame:
    ...


def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    iterator: bool = False,
    chunksize: int | None = None,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to)).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextFileReader
        The parsed fixed-width file is returned as a two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
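
    If inference is not reliable, explicit ``widths`` (or ``colspecs``) can be
    passed instead; ``'data.txt'`` is a hypothetical file name, shown for
    illustration only:

    >>> pd.read_fwf('data.txt', widths=[5, 10, 8])  # doctest: +SKIP
    >>> pd.read_fwf('data.txt', colspecs=[(0, 5), (5, 15)])  # doctest: +SKIP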
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    if colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    # for mypy
    assert colspecs is not None

    # GH#40830
    # Ensure length of `colspecs` matches length of `names`
    names = kwds.get("names")
    if names is not None:
        if len(names) != len(colspecs) and colspecs != "infer":
            # need to check len(index_col) as it might contain
            # unnamed indices, in which case its name is not required
            len_index = 0
            if kwds.get("index_col") is not None:
                index_col: Any = kwds.get("index_col")
                if index_col is not False:
                    if not is_list_like(index_col):
                        len_index = 1
                    else:
                        len_index = len(index_col)
            if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
                # If usecols is used colspec may be longer than names
                raise ValueError("Length of colspecs must match length of names")

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    kwds["iterator"] = iterator
    kwds["chunksize"] = chunksize

    check_dtype_backend(dtype_backend)
    kwds["dtype_backend"] = dtype_backend
    return _read(filepath_or_buffer, kwds)
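

# Illustration (comments only) of the 'widths' -> 'colspecs' conversion above:
# contiguous field widths are turned into half-open [start, stop) intervals by
# keeping a running offset, e.g.
#
#     widths = [5, 10, 8]
#     # yields colspecs = [(0, 5), (5, 15), (15, 23)]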


class TextFileReader(abc.Iterator):
    """
    Passed dialect overrides any of the related parser options.
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    # TODO: Refactor this logic, it's pretty convoluted
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif "pyarrow" in engine and argname not in _pyarrow_unsupported:
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = default
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like object to have the
            # "__iter__" attribute. However, the Python engine iterates
            # through such objects, so it requires "__iter__".
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        result = options.copy()

        fallback_reason = None

        # 'skipfooter' is not supported by the C engine yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults.get(arg):
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        if index_col is True:
            raise ValueError("The value of index_col cannot be True")
        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        floatify = engine != "pyarrow"
        na_values, na_fvalues = _clean_na_values(
            na_values, keep_default_na, floatify=floatify
        )

        # handle skiprows; this is internally handled by the
        # c-engine, so it is only needed for the python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self) -> DataFrame:
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # The C engine can decode utf-8 bytes itself; wrapping the
                # handle in a TextIOWrapper makes it far slower, especially
                # with memory_map=True.
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        if self.engine == "pyarrow":
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            if hasattr(self, "orig_options"):
                dtype_arg = self.orig_options.get("dtype", None)
            else:
                dtype_arg = None

            if isinstance(dtype_arg, dict):
                dtype = defaultdict(lambda: None)  # type: ignore[var-annotated]
                dtype.update(dtype_arg)
            elif dtype_arg is not None and pandas_dtype(dtype_arg) in (
                np.str_,
                np.object_,
            ):
                dtype = defaultdict(lambda: dtype_arg)
            else:
                dtype = None

            if dtype is not None:
                new_col_dict = {}
                for k, v in col_dict.items():
                    d = (
                        dtype[k]
                        if pandas_dtype(dtype[k]) in (np.str_, np.object_)
                        else None
                    )
                    new_col_dict[k] = Series(v, index=index, dtype=d, copy=False)
            else:
                new_col_dict = col_dict

            df = DataFrame(
                new_col_dict,
                columns=columns,
                index=index,
                copy=not using_copy_on_write(),
            )

            self._currow += new_rows
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()
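

# Typical chunked-reading sketch (comments only; "big.csv" and 'process' are
# hypothetical). A TextFileReader is what read_csv returns when chunksize or
# iterator is used, and it supports both the iterator protocol (via __next__
# above) and the context-manager protocol (via __enter__/__exit__):
#
#     with pd.read_csv("big.csv", chunksize=100_000) as reader:
#         for chunk in reader:          # each chunk is a DataFrame
#             process(chunk)            # user-supplied per-chunk work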


def TextParser(*args, **kwds) -> TextFileReader:
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, optional
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional

        .. deprecated:: 2.0.0
    date_format : str or dict of column -> format, default ``None``

        .. versionadded:: 2.0.0
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.
    """
    kwds["engine"] = "python"
    return TextFileReader(*args, **kwds)
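

# Minimal sketch of TextParser on in-memory records (comments only; the data
# below is made up for illustration). The first row is consumed as the header
# because the parser defaults to header=0:
#
#     rows = [["a", "b"], ["1", "2"], ["3", "4"]]
#     df = pd.io.parsers.TextParser(rows).read()   # columns 'a' and 'b'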


def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True):
    na_fvalues: set | dict
    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values, floatify)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues
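

# Behaviour sketch (comments only, values abridged): a scalar is first wrapped
# in a list, stringified, and then merged with the default NA strings when
# keep_default_na=True, e.g.
#
#     na_values, na_fvalues = _clean_na_values("missing")
#     # na_values  ~ {"missing", "", "NaN", "NULL", ...}  (STR_NA_VALUES union)
#     # na_fvalues ~ set()  (none of these have a non-NaN float form)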


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result
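

# For example (comments only):
#
#     _floatify_na_values({"999", "n/a"})   # -> {999.0}
#
# "n/a" has no float form and is dropped; NaN itself is excluded so that the
# float set only contains concrete sentinel numbers.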


def _stringify_na_values(na_values, floatify: bool):
    """return stringified and numeric versions of these values"""
    result: list[str | float] = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # for a whole number like 999, also register "999.0" and "999"
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            if floatify:
                result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        if floatify:
            try:
                result.append(int(x))
            except (TypeError, ValueError, OverflowError):
                pass
    return set(result)
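

# For example (comments only):
#
#     _stringify_na_values(["999"], floatify=True)
#     # -> {"999", "999.0", 999}   (999 and 999.0 hash equal, so one entry)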


def _refine_defaults_read(
    dialect: str | csv.Dialect | None,
    delimiter: str | None | lib.NoDefault,
    delim_whitespace: bool,
    engine: CSVEngine | None,
    sep: str | None | lib.NoDefault,
    on_bad_lines: str | Callable,
    names: Sequence[Hashable] | None | lib.NoDefault,
    defaults: dict[str, Any],
    dtype_backend: DtypeBackend | lib.NoDefault,
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.

        .. deprecated:: 2.2.0
            Use ``sep="\\s+"`` instead.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    on_bad_lines : str, callable
        An option for handling bad lines or a sentinel value (None).
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    defaults : dict
        Default values of input parameters.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError :
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
    """
    # fix types for sep, delimiter to Union(str, Any)
    delim_default = defaults["delimiter"]
    kwds: dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    if delimiter and (sep is not lib.no_default):
        raise ValueError("Specified a sep and a delimiter; you can only specify one.")

    kwds["names"] = None if names is lib.no_default else names

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter == "\n":
        raise ValueError(
            r"Specified \n as separator or delimiter. This forces the python engine "
            "which does not accept a line terminator. Hence it is not allowed to use "
            "the line terminator as separator.",
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False

    if on_bad_lines == "error":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
    elif on_bad_lines == "warn":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
    elif on_bad_lines == "skip":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
    elif callable(on_bad_lines):
        if engine not in ["python", "pyarrow"]:
            raise ValueError(
                "on_bad_lines can only be a callable function "
                "if engine='python' or 'pyarrow'"
            )
        kwds["on_bad_lines"] = on_bad_lines
    else:
        raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")

    check_dtype_backend(dtype_backend)

    kwds["dtype_backend"] = dtype_backend

    return kwds
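

# Aliasing sketch (comments only): passing sep=";" with delimiter=None yields
# kwds["delimiter"] == ";", while leaving both at their sentinel/None values
# falls back to defaults["delimiter"] (e.g. "," for read_csv and "\t" for
# read_table, as passed in via the `defaults` dict above).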


def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
    """
    Extract concrete csv dialect instance.

    Returns
    -------
    csv.Dialect or None
    """
    if kwds.get("dialect") is None:
        return None

    dialect = kwds["dialect"]
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)

    return dialect
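

# For example (comments only): a registered dialect name is resolved to its
# concrete instance,
#
#     _extract_dialect({"dialect": "excel"})   # -> csv.get_dialect("excel")
#     _extract_dialect({"dialect": None})      # -> None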


MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)


def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If incorrect dialect is provided.
    """
    for param in MANDATORY_DIALECT_ATTRS:
        if not hasattr(dialect, param):
            raise ValueError(f"Invalid dialect {dialect} provided")


def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: dict[str, Any],
) -> dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided not in (parser_default, dialect_val):
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to _refine_defaults_read, where "sep_override"
            # is set, for more info.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn(
                "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level()
            )
        kwds[param] = dialect_val
    return kwds
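

# Merge sketch (comments only; SemiDialect is a hypothetical dialect defined
# here for illustration): dialect values win over user-provided ones, with a
# ParserWarning when they genuinely conflict, e.g.
#
#     class SemiDialect(csv.excel):
#         delimiter = ";"
#
#     kwds = _merge_with_dialect_properties(SemiDialect, {"delimiter": ","})
#     # kwds["delimiter"] == ";"   (and a ParserWarning is emitted)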


def _validate_skipfooter(kwds: dict[str, Any]) -> None:
    """
    Check whether skipfooter is compatible with other kwargs in TextFileReader.

    Parameters
    ----------
    kwds : dict
        Keyword arguments passed to TextFileReader.

    Raises
    ------
    ValueError
        If skipfooter is not compatible with other parameters.
    """
    if kwds.get("skipfooter"):
        if kwds.get("iterator") or kwds.get("chunksize"):
            raise ValueError("'skipfooter' not supported for iteration")
        if kwds.get("nrows"):
            raise ValueError("'skipfooter' not supported with 'nrows'")
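

# For example (comments only): skipfooter cannot be combined with chunked
# iteration or with nrows,
#
#     _validate_skipfooter({"skipfooter": 1, "nrows": 10})
#     # -> ValueError: 'skipfooter' not supported with 'nrows'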