@@ -0,0 +1,382 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    expected = DataFrame(
        [
            ["foo", 2, 3, 4, 5],
            ["bar", 7, 8, 9, 10],
            ["baz", 12, 13, 14, 15],
            ["qux", 12, 13, 14, 15],
            ["foo2", 12, 13, 14, 15],
            ["bar2", 12, 13, 14, 15],
        ],
        columns=["index", "A", "B", "C", "D"],
    )
    expected = expected.set_index("index")

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
                list(reader)
        return

    with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
        chunks = list(reader)
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    msg = r"'chunksize' must be an integer >=1"
    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
            pass


@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    # see gh-15755
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
        tm.assert_frame_equal(concat(reader), expected)


def test_read_chunksize_and_nrows_changing_size(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
        tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
        tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])

        with pytest.raises(StopIteration, match=""):
            reader.get_chunk(size=3)


def test_get_chunk_passed_chunksize(all_parsers):
    parser = all_parsers
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), chunksize=2) as reader:
                reader.get_chunk()
        return

    with parser.read_csv(StringIO(data), chunksize=2) as reader:
        result = reader.get_chunk()

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
    # see gh-12185
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
                concat(reader)
        return

    with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
        via_reader = concat(reader)
    tm.assert_frame_equal(via_reader, result)


def test_read_chunksize_jagged_names(all_parsers):
    # see gh-23509
    parser = all_parsers
    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(
                StringIO(data), names=range(10), chunksize=4
            ) as reader:
                concat(reader)
        return

    with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
        result = concat(reader)
    tm.assert_frame_equal(result, expected)


def test_chunk_begins_with_newline_whitespace(all_parsers):
    # see gh-10022
    parser = all_parsers
    data = "\n hello\nworld\n"

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([" hello", "world"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
    # mainly an issue with the C parser
    heuristic = 2**3
    parser = all_parsers
    integers = [str(i) for i in range(heuristic - 1)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    # Coercions should work without warnings.
    with monkeypatch.context() as m:
        m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
        result = parser.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == float


def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
    warning_type = None
    parser = all_parsers
    size = 10000

    # see gh-3866: if chunks are different types and can't
    # be coerced using numerical types, then issue warning.
    if parser.engine == "c" and parser.low_memory:
        warning_type = DtypeWarning
        # Use larger size to hit warning path
        size = 499999

    integers = [str(i) for i in range(size)]
    data = "a\n" + "\n".join(integers + ["a", "b"] + integers)

    buf = StringIO(data)

    if parser.engine == "pyarrow":
        df = parser.read_csv(
            buf,
        )
    else:
        df = parser.read_csv_check_warnings(
            warning_type,
            r"Columns \(0\) have mixed types. "
            "Specify dtype option on import or set low_memory=False.",
            buf,
        )
    if parser.engine == "c" and parser.low_memory:
        assert df.a.dtype == object
    elif using_infer_string:
        assert df.a.dtype == "str"
    else:
        assert df.a.dtype == object
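

# Reviewer sketch, not part of the original diff: the DtypeWarning above is
# emitted because the C engine infers dtypes chunk by chunk when
# low_memory=True. Callers can avoid the mixed-type result by declaring the
# dtype up front or by disabling chunked inference; both parameters are
# public pandas API, and the sample data here is made up.
def _sketch_avoiding_dtype_warning():
    import pandas as pd

    data = "a\n" + "\n".join(["1"] * 5 + ["x"])
    # Declare the dtype so no inference is needed.
    as_str = pd.read_csv(StringIO(data), dtype={"a": str})
    # Or read the file in one pass so inference sees all rows at once.
    one_pass = pd.read_csv(StringIO(data), low_memory=False)
    return as_str, one_pass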


@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    # see gh-9535
    parser = all_parsers
    expected = DataFrame(columns=["foo", "bar"])

    nrows = 10
    data = StringIO("foo,bar\n")

    if parser.engine == "pyarrow":
        msg = (
            "The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=msg):
            if iterator:
                with parser.read_csv(data, chunksize=nrows) as reader:
                    next(iter(reader))
            else:
                parser.read_csv(data, nrows=nrows)
        return

    if iterator:
        with parser.read_csv(data, chunksize=nrows) as reader:
            result = next(iter(reader))
    else:
        result = parser.read_csv(data, nrows=nrows)

    tm.assert_frame_equal(result, expected)


def test_read_csv_memory_growth_chunksize(all_parsers):
    # see gh-24805
    #
    # Let's just make sure that we don't crash
    # as we iteratively process all chunks.
    parser = all_parsers

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as f:
            for i in range(1000):
                f.write(str(i) + "\n")

        if parser.engine == "pyarrow":
            msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                with parser.read_csv(path, chunksize=20) as result:
                    for _ in result:
                        pass
            return

        with parser.read_csv(path, chunksize=20) as result:
            for _ in result:
                pass


def test_chunksize_with_usecols_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """1,2,3,4
5,6,7,8
9,10,11
"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                names=["a", "b"],
                chunksize=2,
                usecols=[0, 1],
                header=None,
            )
        return

    result_chunks = parser.read_csv(
        StringIO(data),
        names=["a", "b"],
        chunksize=2,
        usecols=[0, 1],
        header=None,
    )

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6]}),
        DataFrame({"a": [9], "b": [10]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])


def test_chunksize_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), chunksize=2)
        return

    result_chunks = parser.read_csv(StringIO(data), chunksize=2)

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
        DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])
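

# Reviewer sketch, not part of the original diff: the chunked-reading API that
# this file exercises. `chunksize` returns a TextFileReader usable both as an
# iterator and as a context manager, and `get_chunk(size)` may vary the chunk
# length between calls; the sample data here is made up.
def _sketch_chunked_reading():
    import pandas as pd

    csv_text = "a,b\n" + "\n".join(f"{i},{i * 2}" for i in range(10))
    with pd.read_csv(StringIO(csv_text), chunksize=4) as reader:
        first = reader.get_chunk(2)  # first two rows
        rest = pd.concat(reader)  # remaining rows, four at a time
    return first, rest
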
@@ -0,0 +1,983 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from inspect import signature
from io import StringIO
import os
from pathlib import Path
import sys

import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.errors import (
    EmptyDataError,
    ParserError,
    ParserWarning,
)

from pandas import (
    DataFrame,
    Index,
    Timestamp,
    compat,
)
import pandas._testing as tm

from pandas.io.parsers import TextFileReader
from pandas.io.parsers.c_parser_wrapper import CParserWrapper

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        def __init__(self) -> None:
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {
        "usecols": [0, 2, 3],
        "parse_dates": parse_dates,
        "delimiter": ",",
    }
    parser.engine = "c"
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)


def test_read_csv_local(all_parsers, csv1):
    prefix = "file:///" if compat.is_platform_windows() else "file://"
    parser = all_parsers

    fname = prefix + str(os.path.abspath(csv1))
    result = parser.read_csv(fname, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if parser.engine == "pyarrow":
        result.index = result.index.as_unit("ns")
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)


def test_1000_sep(all_parsers):
    parser = all_parsers
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})

    if parser.engine == "pyarrow":
        msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep="|", thousands=",")
        return

    result = parser.read_csv(StringIO(data), sep="|", thousands=",")
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_unnamed_columns(all_parsers):
    data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
        dtype=np.int64,
        columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
    )
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_csv_mixed_type(all_parsers):
    data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
    parser = all_parsers
    expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_read_csv_low_memory_no_rows_with_index(all_parsers):
    # see gh-21141
    parser = all_parsers

    if not parser.low_memory:
        pytest.skip("This is a low-memory specific test")

    data = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
        return

    result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
    expected = DataFrame(columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


def test_read_csv_dataframe(all_parsers, csv1):
    parser = all_parsers
    result = parser.read_csv(csv1, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if parser.engine == "pyarrow":
        result.index = result.index.as_unit("ns")
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
    # see gh-10476
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    expected = DataFrame(
        [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
        columns=["index", "A", "B", "C", "D"],
    )
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), nrows=nrows)
        return

    result = parser.read_csv(StringIO(data), nrows=nrows)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    msg = r"'nrows' must be an integer >=0"
    parser = all_parsers
    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), nrows=nrows)


def test_nrows_skipfooter_errors(all_parsers):
    msg = "'skipfooter' not supported with 'nrows'"
    data = "a\n1\n2\n3\n4\n5\n6"
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=1, nrows=5)


@skip_pyarrow
def test_missing_trailing_delimiters(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""

    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
        columns=["A", "B", "C", "D"],
    )
    tm.assert_frame_equal(result, expected)


def test_skip_initial_space(all_parsers):
    data = (
        '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
        "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
        "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
        "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
        "0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
        "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
    )
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                names=list(range(33)),
                header=None,
                na_values=["-9999.0"],
                skipinitialspace=True,
            )
        return

    result = parser.read_csv(
        StringIO(data),
        names=list(range(33)),
        header=None,
        na_values=["-9999.0"],
        skipinitialspace=True,
    )
    expected = DataFrame(
        [
            [
                "09-Apr-2012",
                "01:10:18.300",
                2456026.548822908,
                12849,
                1.00361,
                1.12551,
                330.65659,
                355626618.16711,
                73.48821,
                314.11625,
                1917.09447,
                179.71425,
                80.0,
                240.0,
                -350,
                70.06056,
                344.9837,
                1,
                1,
                -0.689265,
                -0.692787,
                0.212036,
                14.7674,
                41.605,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                0,
                12,
                128,
            ]
        ]
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_trailing_delimiters(all_parsers):
    # see gh-2442
    data = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=False)

    expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


def test_escapechar(all_parsers):
    # https://stackoverflow.com/questions/13824840/feature-request-for-
    # pandas-read-csv
    data = '''SEARCH_TERM,ACTUAL_URL
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''

    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
    )

    assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'

    tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))


def test_ignore_leading_whitespace(all_parsers):
    # see gh-3374, gh-6607
    parser = all_parsers
    data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"

    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+")
        return
    result = parser.read_csv(StringIO(data), sep=r"\s+")

    expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
    # see gh-12203
    parser = all_parsers
    data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""

    if usecols is None:
        # Make sure that an error is still raised
        # when the "usecols" parameter is not provided.
        msg = r"Expected \d+ fields in line \d+, saw \d+"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
    else:
        expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})

        result = parser.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # First, check to see that the response of parser when faced with no
        # provided columns raises the correct error, with or without usecols.
        ("", {}, None),
        ("", {"usecols": ["X"]}, None),
        (
            ",,",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"], index=[0], dtype=np.float64),
        ),
        (
            "",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"]),
        ),
    ],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
    # see gh-12493
    parser = all_parsers

    if expected is None:
        msg = "No columns to parse from file"
        with pytest.raises(EmptyDataError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # gh-8661, gh-8679: this should ignore six lines, including
        # lines with trailing whitespace and blank lines.
        (
            {
                "header": None,
                "delim_whitespace": True,
                "skiprows": [0, 1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
        ),
        # gh-8983: test skipping set of rows after a row with trailing spaces.
        (
            {
                "delim_whitespace": True,
                "skiprows": [1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
        ),
    ],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
    data = "A B C  \nrandom line with trailing spaces    \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n   \n5.1,NaN,10.0\n"  # noqa: E501
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
    tm.assert_frame_equal(result, expected)
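

# Reviewer sketch, not part of the original diff: the FutureWarning asserted
# above reflects the deprecation of `delim_whitespace`; the usual replacement
# is a regex separator, which the C and python engines accept and the pyarrow
# engine rejects (see the regex-separator tests elsewhere in this file).
def _sketch_whitespace_separator():
    import pandas as pd

    text = "a b  c\n1 2  3\n4 5  6"
    # Equivalent in effect to the deprecated delim_whitespace=True.
    return pd.read_csv(StringIO(text), sep=r"\s+")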


def test_raise_on_sep_with_delim_whitespace(all_parsers):
    # see gh-6607
    data = "a b c\n1 2 3"
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with pytest.raises(ValueError, match="you can only specify one"):
        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg, check_stacklevel=False
        ):
            parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)


def test_read_filepath_or_buffer(all_parsers):
    # see gh-43366
    parser = all_parsers

    with pytest.raises(TypeError, match="Expected file path name or file-like"):
        parser.read_csv(filepath_or_buffer=b"input")


@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
    # see gh-9710
    parser = all_parsers
    data = """\
MyColumn
   a
   b
   a
   b\n"""

    expected = DataFrame({"MyColumn": list("abab")})
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(
                    StringIO(data),
                    skipinitialspace=True,
                    delim_whitespace=delim_whitespace,
                )
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "sep,skip_blank_lines,exp_data",
    [
        (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (
            ",",
            False,
            [
                [1.0, 2.0, 4.0],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan],
                [5.0, np.nan, 10.0],
                [np.nan, np.nan, np.nan],
                [-70.0, 0.4, 1.0],
            ],
        ),
    ],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
    parser = all_parsers
    data = """\
A,B,C
1,2.,4.


5.,NaN,10.0

-70,.4,1
"""

    if sep == r"\s+":
        data = data.replace(",", "  ")

        if parser.engine == "pyarrow":
            msg = "the 'pyarrow' engine does not support regex separators"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(
                    StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines
                )
            return

    result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
    expected = DataFrame(exp_data, columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_whitespace_lines(all_parsers):
    parser = all_parsers
    data = """

\t \t\t
\t
A,B,C
\t 1,2.,4.
5.,NaN,10.0
"""
    expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,expected",
    [
        (
            """ A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
""",
            DataFrame(
                [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                columns=["A", "B", "C", "D"],
                index=["a", "b", "c"],
            ),
        ),
        (
            " a b c\n1 2 3 \n4 5 6\n 7 8 9",
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
        ),
    ],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
    # see gh-6607
    parser = all_parsers
    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+")
        return

    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


def test_sub_character(all_parsers, csv_dir_path):
    # see gh-16893
    filename = os.path.join(csv_dir_path, "sub_char.csv")
    expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])

    parser = all_parsers
    result = parser.read_csv(filename)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
def test_filename_with_special_chars(all_parsers, filename):
    # see gh-15086.
    parser = all_parsers
    df = DataFrame({"a": [1, 2, 3]})

    with tm.ensure_clean(filename) as path:
        df.to_csv(path, index=False)

        result = parser.read_csv(path)
        tm.assert_frame_equal(result, df)


def test_read_table_same_signature_as_read_csv(all_parsers):
    # GH-34976
    parser = all_parsers

    table_sign = signature(parser.read_table)
    csv_sign = signature(parser.read_csv)

    assert table_sign.parameters.keys() == csv_sign.parameters.keys()
    assert table_sign.return_annotation == csv_sign.return_annotation

    for key, csv_param in csv_sign.parameters.items():
        table_param = table_sign.parameters[key]
        if key == "sep":
            assert csv_param.default == ","
            assert table_param.default == "\t"
            assert table_param.annotation == csv_param.annotation
            assert table_param.kind == csv_param.kind
            continue

        assert table_param == csv_param


def test_read_table_equivalency_to_read_csv(all_parsers):
    # see gh-21948
    # As of 0.25.0, read_table is undeprecated
    parser = all_parsers
    data = "a\tb\n1\t2\n3\t4"
    expected = parser.read_csv(StringIO(data), sep="\t")
    result = parser.read_table(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
    # GH#41069
    parser = all_parsers
    data = "a b\n0 1"

    sys.setprofile(lambda *a, **k: None)
    result = getattr(parser, read_func)(StringIO(data))
    sys.setprofile(None)

    expected = DataFrame({"a b": ["0 1"]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_first_row_bom(all_parsers):
    # see gh-26545
    parser = all_parsers
    data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''

    result = parser.read_csv(StringIO(data), delimiter="\t")
    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_first_row_bom_unquoted(all_parsers):
    # see gh-36343
    parser = all_parsers
    data = """\ufeffHead1\tHead2\tHead3"""

    result = parser.read_csv(StringIO(data), delimiter="\t")
    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
    # GH 28071
    ref = DataFrame(
        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
        columns=list("ab"),
    )
    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
            )
        return

    df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
    tm.assert_frame_equal(df, ref[:nrows])


@skip_pyarrow
def test_no_header_two_extra_columns(all_parsers):
    # GH 26218
    column_names = ["one", "two", "three"]
    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
    stream = StringIO("foo,bar,baz,bam,blah")
    parser = all_parsers
    df = parser.read_csv_check_warnings(
        ParserWarning,
        "Length of header or names does not match length of data. "
        "This leads to a loss of data with index_col=False.",
        stream,
        header=None,
        names=column_names,
        index_col=False,
    )
    tm.assert_frame_equal(df, ref)


def test_read_csv_names_not_accepting_sets(all_parsers):
    # GH 34946
    data = """\
1,2,3
4,5,6\n"""
    parser = all_parsers
    with pytest.raises(ValueError, match="Names should be an ordered collection."):
        parser.read_csv(StringIO(data), names=set("QAZ"))


def test_read_table_delim_whitespace_default_sep(all_parsers):
    # GH: 35958
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_table(f, delim_whitespace=True)
        return
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_table(f, delim_whitespace=True)
    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
    # GH: 35958
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    parser = all_parsers
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(f, delim_whitespace=True, sep=delimiter)

        with pytest.raises(ValueError, match=msg):
            parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


def test_read_csv_delimiter_and_sep_no_default(all_parsers):
    # GH#39823
    f = StringIO("a,b\n1,2")
    parser = all_parsers
    msg = "Specified a sep and a delimiter; you can only specify one."
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(f, sep=" ", delimiter=".")


@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
    # GH#43528
    parser = all_parsers
    data = """a,b,c
1,2,3
"""
    msg = (
        r"Specified \\n as separator or delimiter. This forces the python engine "
        r"which does not accept a line terminator. Hence it is not allowed to use "
        r"the line terminator as separator."
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
    # GH: 35958
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    parser = all_parsers
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        with pytest.raises(ValueError, match=msg):
            parser.read_table(f, delim_whitespace=True, sep=delimiter)

        with pytest.raises(ValueError, match=msg):
            parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


@skip_pyarrow
def test_dict_keys_as_names(all_parsers):
    # GH: 36928
    data = "1,2"

    keys = {"a": int, "b": int}.keys()
    parser = all_parsers

    result = parser.read_csv(StringIO(data), names=keys)
    expected = DataFrame({"a": [1], "b": [2]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@xfail_pyarrow  # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
    # GH39017
    parser = all_parsers
    content = b"\xed\xbd\xbf"
    decoded = content.decode("utf-8", errors="surrogatepass")
    expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
    expected.index.name = decoded * 2

    with tm.ensure_clean() as path:
        Path(path).write_bytes(
            content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
        )
        df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
        tm.assert_frame_equal(df, expected)
        with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
            parser.read_csv(path)


def test_malformed_second_line(all_parsers):
    # see GH14782
    parser = all_parsers
    data = "\na\nb\n"
    result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1)
    expected = DataFrame({"a": ["b"]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_short_single_line(all_parsers):
    # GH 47566
    parser = all_parsers
    columns = ["a", "b", "c"]
    data = "1,2"
    result = parser.read_csv(StringIO(data), header=None, names=columns)
    expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Length mismatch: Expected axis has 2 elements
def test_short_multi_line(all_parsers):
    # GH 47566
    parser = all_parsers
    columns = ["a", "b", "c"]
    data = "1,2\n1,2"
    result = parser.read_csv(StringIO(data), header=None, names=columns)
    expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
    tm.assert_frame_equal(result, expected)


def test_read_seek(all_parsers):
    # GH48646
    parser = all_parsers
    prefix = "### DATA\n"
    content = "nkey,value\ntables,rectangular\n"
    with tm.ensure_clean() as path:
        Path(path).write_text(prefix + content, encoding="utf-8")
        with open(path, encoding="utf-8") as file:
            file.readline()
            actual = parser.read_csv(file)
        expected = parser.read_csv(StringIO(content))
    tm.assert_frame_equal(actual, expected)
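

# Reviewer sketch, not part of the original diff: the engine-dispatch pattern
# repeated throughout this file. `engine` selects the parser backend, and
# options that the pyarrow backend does not implement raise ValueError, which
# is exactly what the pyarrow branches above assert.
def _sketch_engine_selection():
    import pandas as pd

    data = "a,b\n1,2\n3,4"
    pd.read_csv(StringIO(data), engine="c")  # default C parser
    pd.read_csv(StringIO(data), engine="python")  # pure-python parser
    try:
        # Requires the optional pyarrow dependency; `thousands` is one of the
        # options it does not support.
        pd.read_csv(StringIO(data), engine="pyarrow", thousands=",")
    except ValueError:
        pass
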
@@ -0,0 +1,91 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
import csv
from io import StringIO

import pytest

from pandas import DataFrame
import pandas._testing as tm

from pandas.io.parsers import TextParser

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow
def test_read_data_list(all_parsers):
    parser = all_parsers
    kwargs = {"index_col": 0}
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"

    data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
    expected = parser.read_csv(StringIO(data), **kwargs)

    with TextParser(data_list, chunksize=2, **kwargs) as parser:
        result = parser.read()

    tm.assert_frame_equal(result, expected)


def test_reader_list(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


def test_reader_list_skiprows(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)

    tm.assert_frame_equal(chunks[0], expected[1:3])


def test_read_csv_parse_simple_list(all_parsers):
    parser = all_parsers
    data = """foo
bar baz
qux foo
foo
bar"""

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
    tm.assert_frame_equal(result, expected)
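

# Reviewer sketch, not part of the original diff: TextParser, used above,
# consumes rows that are already split into lists (for example the output of
# csv.reader) and otherwise behaves like the reader `read_csv` returns when
# `chunksize` is passed; the first row is treated as the header by default.
def _sketch_text_parser():
    rows = list(csv.reader(StringIO("x,y\n1,2\n3,4")))
    with TextParser(rows, chunksize=1) as reader:
        return list(reader)  # one single-row DataFrame per chunk
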
@@ -0,0 +1,72 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import pytest

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
    parser = all_parsers
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    if parser.engine == "pyarrow":
        msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), sep="|", thousands=thousands, decimal=decimal
            )
        return

    result = parser.read_csv(
        StringIO(data), sep="|", thousands=thousands, decimal=decimal
    )
    tm.assert_frame_equal(result, expected)


def test_euro_decimal_format(all_parsers):
    parser = all_parsers
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    result = parser.read_csv(StringIO(data), sep=";", decimal=",")
    expected = DataFrame(
        [
            [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
            [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
            [3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
        ],
        columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
    )
    tm.assert_frame_equal(result, expected)
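

# Reviewer sketch, not part of the original diff: the European number formats
# covered above, combining `thousands` and `decimal`; both are public pandas
# API, and the sample data here is made up.
def _sketch_thousands_and_decimal():
    import pandas as pd

    text = "A;B\n1.234,5;7\n2.000,25;9"
    df = pd.read_csv(StringIO(text), sep=";", thousands=".", decimal=",")
    # df["A"] is [1234.5, 2000.25]
    return df
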
@@ -0,0 +1,478 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import (
    BytesIO,
    StringIO,
)
import os
import platform
from urllib.error import URLError
import uuid

import numpy as np
import pytest

from pandas.errors import (
    EmptyDataError,
    ParserError,
)
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.network
@pytest.mark.single_cpu
def test_url(all_parsers, csv_dir_path, httpserver):
    parser = all_parsers
    kwargs = {"sep": "\t"}

    local_path = os.path.join(csv_dir_path, "salaries.csv")
    with open(local_path, encoding="utf-8") as f:
        httpserver.serve_content(content=f.read())

    url_result = parser.read_csv(httpserver.url, **kwargs)

    local_result = parser.read_csv(local_path, **kwargs)
    tm.assert_frame_equal(url_result, local_result)


@pytest.mark.slow
def test_local_file(all_parsers, csv_dir_path):
    parser = all_parsers
    kwargs = {"sep": "\t"}

    local_path = os.path.join(csv_dir_path, "salaries.csv")
    local_result = parser.read_csv(local_path, **kwargs)
    url = "file://localhost/" + local_path

    try:
        url_result = parser.read_csv(url, **kwargs)
        tm.assert_frame_equal(url_result, local_result)
    except URLError:
        # Fails on some systems.
        pytest.skip("Failing on: " + " ".join(platform.uname()))


@xfail_pyarrow  # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
    parser = all_parsers
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
    tm.assert_frame_equal(df, result)


@xfail_pyarrow  # AssertionError: DataFrame.index are different
def test_path_local_path(all_parsers):
    parser = all_parsers
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    result = tm.round_trip_localpath(
        df.to_csv, lambda p: parser.read_csv(p, index_col=0)
    )
    tm.assert_frame_equal(df, result)


def test_nonexistent_path(all_parsers):
    # gh-2428: pls no segfault
    # gh-14086: raise more helpful FileNotFoundError
    # GH#29233 "File foo" instead of "File b'foo'"
    parser = all_parsers
    path = f"{uuid.uuid4()}.csv"

    msg = r"\[Errno 2\]"
    with pytest.raises(FileNotFoundError, match=msg) as e:
        parser.read_csv(path)
    assert path == e.value.filename


@td.skip_if_windows  # os.chmod does not work in windows
def test_no_permission(all_parsers):
    # GH 23784
    parser = all_parsers

    msg = r"\[Errno 13\]"
    with tm.ensure_clean() as path:
        os.chmod(path, 0)  # make file unreadable

        # verify that this process cannot open the file (not running as sudo)
        try:
            with open(path, encoding="utf-8"):
                pass
            pytest.skip("Running as sudo.")
        except PermissionError:
            pass

        with pytest.raises(PermissionError, match=msg) as e:
            parser.read_csv(path)
        assert path == e.value.filename


@pytest.mark.parametrize(
    "data,kwargs,expected,msg",
    [
        # gh-10728: WHITESPACE_LINE
        (
            "a,b,c\n4,5,6\n ",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # gh-10548: EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL_NOP
        (
            "a,b,c\n4,5,6\n\r",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_COMMENT
        (
            "a,b,c\n4,5,6#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # SKIP_LINE
        (
            "a,b,c\n4,5,6\nskipme",
            {"skiprows": [2]},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#", "skip_blank_lines": False},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # IN_FIELD
        (
            "a,b,c\n4,5,6\n ",
            {"skip_blank_lines": False},
            DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL
        (
            "a,b,c\n4,5,6\n\r",
            {"skip_blank_lines": False},
            DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # ESCAPED_CHAR
        (
            "a,b,c\n4,5,6\n\\",
            {"escapechar": "\\"},
            None,
            "(EOF following escape character)|(unexpected end of data)",
        ),
        # ESCAPE_IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"\\',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
        # IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
    ],
    ids=[
        "whitespace-line",
        "eat-line-comment",
        "eat-crnl-nop",
        "eat-comment",
        "skip-line",
        "eat-line-comment",
        "in-field",
        "eat-crnl",
        "escaped-char",
        "escape-in-quoted-field",
        "in-quoted-field",
    ],
)
def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
    # see gh-10728, gh-10548
    parser = all_parsers

    if parser.engine == "pyarrow" and "comment" in kwargs:
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    if parser.engine == "pyarrow" and "\r" not in data:
        # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1:
        # ValueError: skiprows argument must be an integer when using engine='pyarrow'
        # AssertionError: Regex pattern did not match.
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    if expected is None:
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


def test_temporary_file(all_parsers):
    # see gh-13398
    parser = all_parsers
    data = "0 0"

    with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
        new_file.write(data)
        new_file.flush()
        new_file.seek(0)

        if parser.engine == "pyarrow":
            msg = "the 'pyarrow' engine does not support regex separators"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(new_file, sep=r"\s+", header=None)
            return

        result = parser.read_csv(new_file, sep=r"\s+", header=None)

        expected = DataFrame([[0, 0]])
        tm.assert_frame_equal(result, expected)


def test_internal_eof_byte(all_parsers):
    # see gh-5500
    parser = all_parsers
    data = "a,b\n1\x1a,2"

    expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_internal_eof_byte_to_file(all_parsers):
    # see gh-16559
    parser = all_parsers
    data = b'c1,c2\r\n"test \x1a test", test\r\n'
    expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
    path = f"__{uuid.uuid4()}__.csv"

    with tm.ensure_clean(path) as path:
        with open(path, "wb") as f:
            f.write(data)

        result = parser.read_csv(path)
        tm.assert_frame_equal(result, expected)


def test_file_handle_string_io(all_parsers):
    # gh-14418
    #
    # Don't close user provided file handles.
    parser = all_parsers
    data = "a,b\n1,2"

    fh = StringIO(data)
    parser.read_csv(fh)
    assert not fh.closed


def test_file_handles_with_open(all_parsers, csv1):
    # gh-14418
    #
    # Don't close user provided file handles.
    parser = all_parsers

    for mode in ["r", "rb"]:
        with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f:
            parser.read_csv(f)
            assert not f.closed


def test_invalid_file_buffer_class(all_parsers):
    # see gh-15337
    class InvalidBuffer:
        pass

    parser = all_parsers
    msg = "Invalid file path or buffer object type"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(InvalidBuffer())


def test_invalid_file_buffer_mock(all_parsers):
    # see gh-15337
    parser = all_parsers
    msg = "Invalid file path or buffer object type"

    class Foo:
        pass

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(Foo())


def test_valid_file_buffer_seems_invalid(all_parsers):
    # gh-16135: we want to ensure that "tell" and "seek"
    # aren't actually being used when we call `read_csv`
    #
    # Thus, while the object may look "invalid" (these
    # methods are attributes of the `StringIO` class),
    # it is still a valid file-object for our purposes.
    class NoSeekTellBuffer(StringIO):
        def tell(self):
            raise AttributeError("No tell method")
|
||||
def seek(self, pos, whence=0):
|
||||
raise AttributeError("No seek method")
|
||||
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(NoSeekTellBuffer(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
|
||||
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||||
def test_read_csv_file_handle(all_parsers, io_class, encoding):
|
||||
"""
|
||||
Test whether read_csv does not close user-provided file handles.
|
||||
|
||||
GH 36980
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
|
||||
content = "a,b\n1,2"
|
||||
handle = io_class(content.encode("utf-8") if io_class == BytesIO else content)
|
||||
|
||||
tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected)
|
||||
assert not handle.closed
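

# A hedged, illustrative sketch (not part of the original suite; helper name
# hypothetical): because read_csv leaves user-provided handles open, the
# caller can rewind and reuse the same buffer afterwards.
def _handle_reuse_sketch():
    import pandas as pd

    handle = StringIO("a,b\n1,2")
    pd.read_csv(handle)
    handle.seek(0)  # still open, so rewinding works
    return pd.read_csv(handle)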


def test_memory_map_compression(all_parsers, compression):
    """
    Support memory map for compressed files.

    GH 37621
    """
    parser = all_parsers
    expected = DataFrame({"a": [1], "b": [2]})

    with tm.ensure_clean() as path:
        expected.to_csv(path, index=False, compression=compression)

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(path, memory_map=True, compression=compression)
            return

        result = parser.read_csv(path, memory_map=True, compression=compression)

    tm.assert_frame_equal(
        result,
        expected,
    )


def test_context_manager(all_parsers, datapath):
    # make sure that opened files are closed
    parser = all_parsers

    path = datapath("io", "data", "csv", "iris.csv")

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(path, chunksize=1)
        return

    reader = parser.read_csv(path, chunksize=1)
    assert not reader.handles.handle.closed
    try:
        with reader:
            next(reader)
            assert False
    except AssertionError:
        assert reader.handles.handle.closed


def test_context_manager_user_provided(all_parsers, datapath):
    # make sure that user-provided handles are not closed
    parser = all_parsers

    with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
        if parser.engine == "pyarrow":
            msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(path, chunksize=1)
            return

        reader = parser.read_csv(path, chunksize=1)
        assert not reader.handles.handle.closed
        try:
            with reader:
                next(reader)
                assert False
        except AssertionError:
            assert not reader.handles.handle.closed


@skip_pyarrow  # ParserError: Empty CSV file
def test_file_descriptor_leak(all_parsers, using_copy_on_write):
    # GH 31488
    parser = all_parsers
    with tm.ensure_clean() as path:
        with pytest.raises(EmptyDataError, match="No columns to parse from file"):
            parser.read_csv(path)


def test_memory_map(all_parsers, csv_dir_path):
    mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
    parser = all_parsers

    expected = DataFrame(
        {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
    )

    if parser.engine == "pyarrow":
        msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(mmap_file, memory_map=True)
        return

    result = parser.read_csv(mmap_file, memory_map=True)
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,79 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas.compat import is_platform_linux

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@skip_pyarrow  # ParserError: CSV parse error: Empty CSV file or block
def test_float_parser(all_parsers):
    # see gh-9565
    parser = all_parsers
    data = "45e-1,4.5,45.,inf,-inf"
    result = parser.read_csv(StringIO(data), header=None)

    expected = DataFrame([[float(s) for s in data.split(",")]])
    tm.assert_frame_equal(result, expected)


def test_scientific_no_exponent(all_parsers_all_precisions):
    # see gh-12215
    df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
    data = df.to_csv(index=False)
    parser, precision = all_parsers_all_precisions

    df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
    tm.assert_frame_equal(df_roundtrip, df)
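

# Hedged sketch of why "2e" must survive the round trip unchanged (helper
# name hypothetical, not part of the original suite): Python itself rejects
# an exponent with no digits, so no float value could represent it.
def _no_exponent_sketch():
    with pytest.raises(ValueError):
        float("2e")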


@pytest.mark.parametrize(
    "neg_exp",
    [
        -617,
        -100000,
        pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan),
    ],
)
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
    # GH#38753
    parser, precision = all_parsers_all_precisions

    data = f"data\n10E{neg_exp}"
    result = parser.read_csv(StringIO(data), float_precision=precision)
    expected = DataFrame({"data": [0.0]})
    tm.assert_frame_equal(result, expected)
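

# Hedged sketch of the underflow above (helper name hypothetical): the
# smallest positive float64 is the subnormal 5e-324, so 10E-617 rounds to
# exactly 0.0 under every precision mode.
def _underflow_sketch():
    assert float("10E-617") == 0.0
    assert np.nextafter(0.0, 1.0) == 5e-324  # smallest positive subnormal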


@pytest.mark.skip_ubsan
@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
    # GH#38753
    parser, precision = all_parsers_all_precisions
    data = f"data\n10E{exp}"
    result = parser.read_csv(StringIO(data), float_precision=precision)
    if precision == "round_trip":
        if exp == 999999999999999999 and is_platform_linux():
            mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
            request.applymarker(mark)

        value = np.inf if exp > 0 else 0.0
        expected = DataFrame({"data": [value]})
    else:
        expected = DataFrame({"data": [f"10E{exp}"]})

    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,304 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from io import StringIO
import os

import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""",
            {"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
                columns=["A", "B", "C", "D"],
            ),
        ),
        (
            """foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""",
            {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=MultiIndex.from_tuples(
                    [
                        ("foo", "one"),
                        ("foo", "two"),
                        ("foo", "three"),
                        ("bar", "one"),
                        ("bar", "two"),
                    ],
                    names=["index1", "index2"],
                ),
                columns=["A", "B", "C", "D"],
            ),
        ),
    ],
)
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(
    request, all_parsers, index_col, using_infer_string
):
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    headless_data = "\n".join(data.split("\n")[1:])

    names = ["A", "B", "C", "D"]
    parser = all_parsers

    result = parser.read_csv(
        StringIO(headless_data), index_col=index_col, header=None, names=names
    )
    expected = parser.read_csv(StringIO(data), index_col=index_col)

    # No index names in headless data.
    expected.index.names = [None] * 2
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_multi_index_no_level_names_implicit(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=MultiIndex.from_tuples(
            [
                ("foo", "one"),
                ("foo", "two"),
                ("foo", "three"),
                ("bar", "one"),
                ("bar", "two"),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "data,expected,header",
    [
        ("a,b", DataFrame(columns=["a", "b"]), [0]),
        (
            "a,b\nc,d",
            DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
            [0, 1],
        ),
    ],
)
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
    # see gh-14545
    parser = all_parsers
    data = expected.to_csv(index=False) if round_trip else data

    result = parser.read_csv(StringIO(data), header=header)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # AssertionError: DataFrame.columns are different
def test_no_unnamed_index(all_parsers):
    parser = all_parsers
    data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
    result = parser.read_csv(StringIO(data), sep=" ")
    expected = DataFrame(
        [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
        columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
    )
    tm.assert_frame_equal(result, expected)


def test_read_duplicate_index_explicit(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_read_duplicate_index_implicit(all_parsers):
    data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
    parser = all_parsers
    csv2 = os.path.join(csv_dir_path, "test2.csv")
    result = parser.read_csv(csv2, index_col=0, parse_dates=True)

    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
            [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
            [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
            [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
            [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
        ],
        columns=["A", "B", "C", "D", "E"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_empty_with_index(all_parsers):
    # see gh-10184
    data = "x,y"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame(columns=["y"], index=Index([], name="x"))
    tm.assert_frame_equal(result, expected)


# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_multi_index(all_parsers):
    # see gh-10467
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=["x", "y"])

    expected = DataFrame(
        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
    )
    tm.assert_frame_equal(result, expected)


# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=[1, 0])

    expected = DataFrame(
        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
    )
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,78 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    DataFrame,
    option_context,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow  # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
    parser = all_parsers
    data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""
    expected = DataFrame(
        {"A": [float("inf"), float("-inf")] * 5},
        index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
    )
    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_infinity_parsing(all_parsers, na_filter):
    parser = all_parsers
    data = """\
,A
a,Infinity
b,-Infinity
c,+Infinity
"""
    expected = DataFrame(
        {"A": [float("infinity"), float("-infinity"), float("+infinity")]},
        index=["a", "b", "c"],
    )
    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


def test_read_csv_with_use_inf_as_na(all_parsers):
    # https://github.com/pandas-dev/pandas/issues/35493
    parser = all_parsers
    data = "1.0\nNaN\n3.0"
    msg = "use_inf_as_na option is deprecated"
    warn = FutureWarning
    if parser.engine == "pyarrow":
        warn = (FutureWarning, DeprecationWarning)

    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
        with option_context("use_inf_as_na", True):
            result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([1.0, np.nan, 3.0])
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,231 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_int_conversion(all_parsers):
    data = """A,B
1.0,1
2.0,2
3.0,3
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            "A,B\nTrue,1\nFalse,2\nTrue,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
            {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
            DataFrame(
                [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
                columns=["A", "B"],
            ),
        ),
        (
            "A,B\nTRUE,1\nFALSE,2\nTRUE,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nfoo,bar\nbar,foo",
            {"true_values": ["foo"], "false_values": ["bar"]},
            DataFrame([[True, False], [False, True]], columns=["A", "B"]),
        ),
    ],
)
def test_parse_bool(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_parse_integers_above_fp_precision(all_parsers):
    data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        {
            "Numbers": [
                17007000002000191,
                17007000002000191,
                17007000002000191,
                17007000002000191,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000194,
            ]
        }
    )
    tm.assert_frame_equal(result, expected)
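

# Hedged sketch of why the values above must stay int64 (helper name
# hypothetical): float64 has a 53-bit significand, so above 2**53 adjacent
# integers collapse to the same float and ...191 vs ...192 would be lost.
def _fp_precision_sketch():
    big = 17007000002000191
    assert big > 2**53
    assert float(big) == float(big + 1)  # indistinguishable as float64
    assert big != big + 1  # but distinct as exact integers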


@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
    # see gh-2601
    data = "65248E10 11\n55555E55 22\n"
    parser = all_parsers
    if parser.engine == "pyarrow" and sep != " ":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), header=None, sep=sep)
        return

    result = parser.read_csv(StringIO(data), header=None, sep=sep)
    expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
    tm.assert_frame_equal(result, expected)


def test_int64_min_issues(all_parsers):
    # see gh-2599
    parser = all_parsers
    data = "A,B\n0,0\n0,"
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv, request):
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""
    parser = all_parsers

    if conv is None:
        # 13007854817840016671868 > UINT64_MAX, so this
        # will overflow and return object as the dtype.
        if parser.engine == "pyarrow":
            mark = pytest.mark.xfail(reason="parses to float64")
            request.applymarker(mark)

        result = parser.read_csv(StringIO(data))
        expected = DataFrame(
            [
                "00013007854817840016671868",
                "00013007854817840016749251",
                "00013007854817840016754630",
                "00013007854817840016781876",
                "00013007854817840017028824",
                "00013007854817840017963235",
                "00013007854817840018860166",
            ],
            columns=["ID"],
        )
        tm.assert_frame_equal(result, expected)
    else:
        # 13007854817840016671868 > UINT64_MAX, so attempts
        # to cast to either int64 or uint64 will result in
        # an OverflowError being raised.
        msg = "|".join(
            [
                "Python int too large to convert to C long",
                "long too big to convert",
                "int too big to convert",
            ]
        )
        err = OverflowError
        if parser.engine == "pyarrow":
            err = ValueError
            msg = "The 'converters' option is not supported with the 'pyarrow' engine"

        with pytest.raises(err, match=msg):
            parser.read_csv(StringIO(data), converters={"ID": conv})
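

# Hedged sketch of the overflow bound used above (helper name hypothetical):
# the IDs exceed np.iinfo(np.uint64).max, so no fixed-width integer dtype can
# hold them and object dtype is the only lossless fallback.
def _uint64_bound_sketch():
    assert 13007854817840016671868 > np.iinfo(np.uint64).max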


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    # These numbers fall right inside the int64-uint64
    # range, so they can be parsed losslessly as integers.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([val])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
    # These numbers fall just outside the int64-uint64
    # range, so they should be parsed as string.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([str(val)])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # gets float64 dtype instead of object
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
    # No numerical dtype can hold both negative and uint64
    # values, so they should be cast as string.
    parser = all_parsers
    data = "\n".join(exp_data)
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), header=None)
    tm.assert_frame_equal(result, expected)
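

# Hedged sketch of the bound above (helper name hypothetical): 2**63 is one
# past np.iinfo(np.int64).max, while -1 is below np.iinfo(np.uint64).min (0),
# so no single integer dtype covers both values in one column.
def _range_too_wide_sketch():
    assert 2**63 == np.iinfo(np.int64).max + 1
    assert -1 < np.iinfo(np.uint64).min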


def test_integer_precision(all_parsers):
    # GH 7072
    s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
    parser = all_parsers
    result = parser.read_csv(StringIO(s), header=None)[4]
    expected = Series([4321583677327450765, 4321113141090630389], name=4)
    tm.assert_series_equal(result, expected)
@@ -0,0 +1,134 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import pytest

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


def test_iterator(all_parsers):
    # see gh-6607
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    expected = parser.read_csv(StringIO(data), **kwargs)

    if parser.engine == "pyarrow":
        msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), iterator=True, **kwargs)
        return

    with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
        first_chunk = reader.read(3)
        tm.assert_frame_equal(first_chunk, expected[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, expected[3:])


def test_iterator2(all_parsers):
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""

    if parser.engine == "pyarrow":
        msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), iterator=True)
        return

    with parser.read_csv(StringIO(data), iterator=True) as reader:
        result = list(reader)

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result[0], expected)


def test_iterator_stop_on_chunksize(all_parsers):
    # gh-3967: stopping iteration when chunksize is specified
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), chunksize=1)
        return

    with parser.read_csv(StringIO(data), chunksize=1) as reader:
        result = list(reader)

    assert len(result) == 3
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(concat(result), expected)
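

# Usage sketch of the reader protocol exercised above (illustrative only,
# helper name hypothetical): TextFileReader is both an iterator and a context
# manager, so it can be consumed partially and still be closed cleanly.
def _chunked_read_sketch():
    import pandas as pd

    with pd.read_csv(StringIO("A\n1\n2\n3"), chunksize=1) as reader:
        first = next(reader)  # remaining chunks are discarded on __exit__
    return first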


@pytest.mark.parametrize(
    "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}]
)
def test_iterator_skipfooter_errors(all_parsers, kwargs):
    msg = "'skipfooter' not supported for iteration"
    parser = all_parsers
    data = "a\n1\n2"

    if parser.engine == "pyarrow":
        msg = (
            "The '(chunksize|iterator)' option is not supported with the "
            "'pyarrow' engine"
        )

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
            pass


def test_iteration_open_handle(all_parsers):
    parser = all_parsers
    kwargs = {"header": None}

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as f:
            f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")

        with open(path, encoding="utf-8") as f:
            for line in f:
                if "CCC" in line:
                    break

            result = parser.read_csv(f, **kwargs)
            expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,320 @@
"""
Tests that work on the Python, C and PyArrow engines but do not have a
specific classification into the other test modules.
"""
import codecs
import csv
from io import StringIO
import os
from pathlib import Path

import numpy as np
import pytest

from pandas.compat import PY311
from pandas.errors import (
    EmptyDataError,
    ParserError,
    ParserWarning,
)

from pandas import DataFrame
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_empty_decimal_marker(all_parsers):
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    # Parsers support only length-1 decimals
    msg = "Only length-1 decimal markers supported"
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = (
            "only single character unicode strings can be "
            "converted to Py_UCS4, got length 0"
        )

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), decimal="")


def test_bad_stream_exception(all_parsers, csv_dir_path):
    # see gh-13652
    #
    # This test validates that both the Python engine and C engine will
    # raise UnicodeDecodeError instead of C engine raising ParserError
    # and swallowing the exception that caused read to fail.
    path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
    codec = codecs.lookup("utf-8")
    utf8 = codecs.lookup("utf-8")
    parser = all_parsers
    msg = "'utf-8' codec can't decode byte"

    # Stream must be binary UTF8.
    with open(path, "rb") as handle, codecs.StreamRecoder(
        handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter
    ) as stream:
        with pytest.raises(UnicodeDecodeError, match=msg):
            parser.read_csv(stream)


def test_malformed(all_parsers):
    # see gh-6607
    parser = all_parsers
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
    msg = "Expected 3 fields in line 4, saw 5"
    err = ParserError
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        err = ValueError
    with pytest.raises(err, match=msg):
        parser.read_csv(StringIO(data), header=1, comment="#")


@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                header=1,
                comment="#",
                iterator=True,
                chunksize=1,
                skiprows=[2],
            )
        return

    msg = "Expected 3 fields in line 6, saw 5"
    with parser.read_csv(
        StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2]
    ) as reader:
        with pytest.raises(ParserError, match=msg):
            reader.read(nrows)


@xfail_pyarrow  # does not raise
def test_catch_too_many_names(all_parsers):
    # see gh-5156
    data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""
    parser = all_parsers
    msg = (
        "Too many columns specified: expected 4 and found 3"
        if parser.engine == "c"
        else "Number of passed names did not match "
        "number of header fields in the file"
    )

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
    parser = all_parsers
    data = "\n" * nrows

    msg = "No columns to parse from file"
    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data))


def test_unexpected_keyword_parameter_exception(all_parsers):
    # GH-34976
    parser = all_parsers

    msg = "{}\\(\\) got an unexpected keyword argument 'foo'"
    with pytest.raises(TypeError, match=msg.format("read_csv")):
        parser.read_csv("foo.csv", foo=1)
    with pytest.raises(TypeError, match=msg.format("read_table")):
        parser.read_table("foo.tsv", foo=1)


def test_suppress_error_output(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    expected = DataFrame({"a": [1, 4]})

    result = parser.read_csv(StringIO(data), on_bad_lines="skip")
    tm.assert_frame_equal(result, expected)


def test_error_bad_lines(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"

    msg = "Expected 1 fields in line 3, saw 3"

    if parser.engine == "pyarrow":
        # "CSV parse error: Expected 1 columns, got 3: 1,2,3"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), on_bad_lines="error")


def test_warn_bad_lines(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    expected = DataFrame({"a": [1, 4]})
    match_msg = "Skipping line"

    expected_warning = ParserWarning
    if parser.engine == "pyarrow":
        match_msg = "Expected 1 columns, but found 3: 1,2,3"
        expected_warning = (ParserWarning, DeprecationWarning)

    with tm.assert_produces_warning(
        expected_warning, match=match_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data), on_bad_lines="warn")
    tm.assert_frame_equal(result, expected)


def test_read_csv_wrong_num_columns(all_parsers):
    # The header declares 6 columns, but line 3 carries 7 fields.
    data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
    parser = all_parsers
    msg = "Expected 6 fields in line 3, saw 7"

    if parser.engine == "pyarrow":
        # Expected 6 columns, got 7: 6,7,8,9,10,11,12
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data))


def test_null_byte_char(request, all_parsers):
    # see gh-2741
    data = "\x00,foo"
    names = ["a", "b"]
    parser = all_parsers

    if parser.engine == "c" or (parser.engine == "python" and PY311):
        if parser.engine == "python" and PY311:
            request.applymarker(
                pytest.mark.xfail(
                    reason="In Python 3.11, this is read as an empty character not null"
                )
            )
        expected = DataFrame([[np.nan, "foo"]], columns=names)
        out = parser.read_csv(StringIO(data), names=names)
        tm.assert_frame_equal(out, expected)
    else:
        if parser.engine == "pyarrow":
            # CSV parse error: Empty CSV file or block: "
            # cannot infer number of columns"
            pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
        else:
            msg = "NULL byte detected"
            with pytest.raises(ParserError, match=msg):
                parser.read_csv(StringIO(data), names=names)


@pytest.mark.filterwarnings("always::ResourceWarning")
def test_open_file(request, all_parsers):
    # GH 39024
    parser = all_parsers

    msg = "Could not determine delimiter"
    err = csv.Error
    if parser.engine == "c":
        msg = "the 'c' engine does not support sep=None with delim_whitespace=False"
        err = ValueError
    elif parser.engine == "pyarrow":
        msg = (
            "the 'pyarrow' engine does not support sep=None with delim_whitespace=False"
        )
        err = ValueError

    with tm.ensure_clean() as path:
        file = Path(path)
        file.write_bytes(b"\xe4\na\n1")

        with tm.assert_produces_warning(None):
            # should not trigger a ResourceWarning
            with pytest.raises(err, match=msg):
                parser.read_csv(file, sep=None, encoding_errors="replace")
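

# Hedged sketch of the sep=None path (illustrative only, helper name
# hypothetical): the Python engine defers delimiter detection to csv.Sniffer,
# which raises csv.Error on undecidable input like the single-column bytes
# written above.
def _sniffer_sketch():
    try:
        csv.Sniffer().sniff("\xe4\na\n1")
    except csv.Error as err:
        return str(err)  # "Could not determine delimiter"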


def test_invalid_on_bad_line(all_parsers):
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"):
        parser.read_csv(StringIO(data), on_bad_lines="abc")
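

# Illustrative sketch (helper name hypothetical): besides "error", "warn" and
# "skip", on_bad_lines also accepts a callable under the Python engine, which
# receives the offending fields and may return a repaired row or None.
def _on_bad_lines_callable_sketch():
    import pandas as pd

    data = "a,b\n1,2\n3,4,5"
    return pd.read_csv(
        StringIO(data), engine="python", on_bad_lines=lambda line: line[:2]
    )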


def test_bad_header_uniform_error(all_parsers):
    parser = all_parsers
    data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
    msg = "Expected 2 fields in line 2, saw 4"
    if parser.engine == "c":
        msg = (
            "Could not construct index. Requested to use 1 "
            "number of columns, but 3 left to parse."
        )
    elif parser.engine == "pyarrow":
        # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")


def test_on_bad_lines_warn_correct_formatting(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = """1,2
a,b
a,b,c
a,b,d
a,b
"""
    expected = DataFrame({"1": "a", "2": ["b"] * 2})
    match_msg = "Skipping line"

    expected_warning = ParserWarning
    if parser.engine == "pyarrow":
        match_msg = "Expected 2 columns, but found 3: a,b,c"
        expected_warning = (ParserWarning, DeprecationWarning)

    with tm.assert_produces_warning(
        expected_warning, match=match_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data), on_bad_lines="warn")
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,81 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import pytest

import pandas._testing as tm

depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"


def test_verbose_read(all_parsers, capsys):
    parser = all_parsers
    data = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""

    if parser.engine == "pyarrow":
        msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), verbose=True)
        return

    # Engines are verbose in different ways.
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        parser.read_csv(StringIO(data), verbose=True)
    captured = capsys.readouterr()

    if parser.engine == "c":
        assert "Tokenization took:" in captured.out
        assert "Parser memory cleanup took:" in captured.out
    else:  # Python engine
        assert captured.out == "Filled 3 NA values in column a\n"


def test_verbose_read2(all_parsers, capsys):
    parser = all_parsers
    data = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

    if parser.engine == "pyarrow":
        msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), verbose=True, index_col=0)
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        parser.read_csv(StringIO(data), verbose=True, index_col=0)
    captured = capsys.readouterr()

    # Engines are verbose in different ways.
    if parser.engine == "c":
        assert "Tokenization took:" in captured.out
        assert "Parser memory cleanup took:" in captured.out
    else:  # Python engine
        assert captured.out == "Filled 1 NA values in column a\n"
337 lib/python3.11/site-packages/pandas/tests/io/parser/conftest.py (Normal file)
@@ -0,0 +1,337 @@
from __future__ import annotations

import os

import pytest

from pandas.compat import HAS_PYARROW
from pandas.compat._optional import VERSIONS

from pandas import (
    read_csv,
    read_table,
)
import pandas._testing as tm


class BaseParser:
    engine: str | None = None
    low_memory = True
    float_precision_choices: list[str | None] = []

    def update_kwargs(self, kwargs):
        kwargs = kwargs.copy()
        kwargs.update({"engine": self.engine, "low_memory": self.low_memory})

        return kwargs

    def read_csv(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return read_csv(*args, **kwargs)

    def read_csv_check_warnings(
        self,
        warn_type: type[Warning],
        warn_msg: str,
        *args,
        raise_on_extra_warnings=True,
        check_stacklevel: bool = True,
        **kwargs,
    ):
        # We need to check the stacklevel here instead of in the tests
        # since this is where read_csv is called and where the warning
        # should point to.
        kwargs = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(
            warn_type,
            match=warn_msg,
            raise_on_extra_warnings=raise_on_extra_warnings,
            check_stacklevel=check_stacklevel,
        ):
            return read_csv(*args, **kwargs)

    def read_table(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return read_table(*args, **kwargs)

    def read_table_check_warnings(
        self,
        warn_type: type[Warning],
        warn_msg: str,
        *args,
        raise_on_extra_warnings=True,
        **kwargs,
    ):
        # We need to check the stacklevel here instead of in the tests
        # since this is where read_table is called and where the warning
        # should point to.
        kwargs = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(
            warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
        ):
            return read_table(*args, **kwargs)


class CParser(BaseParser):
    engine = "c"
    float_precision_choices = [None, "high", "round_trip"]


class CParserHighMemory(CParser):
    low_memory = False


class CParserLowMemory(CParser):
    low_memory = True


class PythonParser(BaseParser):
    engine = "python"
    float_precision_choices = [None]


class PyArrowParser(BaseParser):
    engine = "pyarrow"
    float_precision_choices = [None]


@pytest.fixture
def csv_dir_path(datapath):
    """
    The directory path to the data files needed for parser tests.
    """
    return datapath("io", "parser", "data")


@pytest.fixture
def csv1(datapath):
    """
    The path to the data file "test1.csv" needed for parser tests.
    """
    return os.path.join(datapath("io", "data", "csv"), "test1.csv")


_cParserHighMemory = CParserHighMemory
_cParserLowMemory = CParserLowMemory
_pythonParser = PythonParser
_pyarrowParser = PyArrowParser

_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_pyarrow_parsers_only = [
    pytest.param(
        _pyarrowParser,
        marks=[
            pytest.mark.single_cpu,
            pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"),
        ],
    )
]

_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]

_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_pyarrow_parsers_ids = ["pyarrow"]

_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids]


@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
    """
    Fixture for all of the CSV parsers.
    """
    parser = request.param()
    if parser.engine == "pyarrow":
        pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        # Try finding a way to disable threads altogether
        # for more stable CI runs
        import pyarrow

        pyarrow.set_cpu_count(1)
    return parser


@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
    """
    Fixture for all of the CSV parsers using the C engine.
    """
    return request.param()


@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
    """
    Fixture for all of the CSV parsers using the Python engine.
    """
    return request.param()


@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
def pyarrow_parser_only(request):
    """
    Fixture for all of the CSV parsers using the PyArrow engine.
    """
    return request.param()


def _get_all_parser_float_precision_combinations():
    """
    Return all allowable parser and float precision
    combinations and corresponding ids.
    """
    params = []
    ids = []
    for parser, parser_id in zip(_all_parsers, _all_parser_ids):
        if hasattr(parser, "values"):
            # Wrapped in pytest.param, get the actual parser back
            parser = parser.values[0]
        for precision in parser.float_precision_choices:
            # Re-wrap in pytest.param for pyarrow
            mark = (
                [
                    pytest.mark.single_cpu,
                    pytest.mark.skipif(
                        not HAS_PYARROW, reason="pyarrow is not installed"
                    ),
                ]
                if parser.engine == "pyarrow"
                else ()
            )
            param = pytest.param((parser(), precision), marks=mark)
            params.append(param)
            ids.append(f"{parser_id}-{precision}")

    return {"params": params, "ids": ids}


@pytest.fixture(
    params=_get_all_parser_float_precision_combinations()["params"],
    ids=_get_all_parser_float_precision_combinations()["ids"],
)
def all_parsers_all_precisions(request):
    """
    Fixture for all allowable combinations of parser
    and float precision
    """
    return request.param


_utf_values = [8, 16, 32]

_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]

_encoding_fmts = [
    f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes
]


@pytest.fixture(params=_utf_values)
def utf_value(request):
    """
    Fixture for all possible integer values for a UTF encoding.
    """
    return request.param


@pytest.fixture(params=_encoding_fmts)
def encoding_fmt(request):
    """
    Fixture for all possible string formats of a UTF encoding.
    """
    return request.param


@pytest.fixture(
    params=[
        ("-1,0", -1.0),
        ("-1,2e0", -1.2),
        ("-1e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("+,1e1", 1.0),
        ("+1,e0", 1.0),
        ("-,1e1", -1.0),
        ("-1,e0", -1.0),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        ("1_,", 1.0),
        ("1_234,56", 1234.56),
        ("1_234,56e0", 1234.56),
        # negative cases; must not parse as float
        ("_", "_"),
        ("-_", "-_"),
        ("-_1", "-_1"),
        ("-_1e0", "-_1e0"),
        ("_1", "_1"),
        ("_1,", "_1,"),
        ("_1,_", "_1,_"),
        ("_1e0", "_1e0"),
        ("1,2e_1", "1,2e_1"),
        ("1,2e1_0", "1,2e1_0"),
        ("1,_2", "1,_2"),
        (",1__2", ",1__2"),
        (",1e", ",1e"),
        ("-,1e", "-,1e"),
        ("1_000,000_000", "1_000,000_000"),
        ("1,e1_2", "1,e1_2"),
        ("e11,2", "e11,2"),
        ("1e11,2", "1e11,2"),
        ("1,2,2", "1,2,2"),
        ("1,2_1", "1,2_1"),
        ("1,2e-10e1", "1,2e-10e1"),
        ("--1,2", "--1,2"),
        ("1a_2,1", "1a_2,1"),
        ("1,2E-1", 0.12),
        ("1,2E1", 12.0),
    ]
)
def numeric_decimal(request):
    """
    Fixture for all numeric formats which should get recognized. The first entry
    represents the value to read while the second represents the expected result.
    """
    return request.param
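

# Hedged usage sketch (an editorial illustration, not a fixture; helper name
# hypothetical): the pairs above are what tests feed to read_csv with a comma
# decimal marker and an underscore thousands separator, so "1_234,56" is
# expected to parse to 1234.56.
def _numeric_decimal_sketch():
    from io import StringIO

    return read_csv(StringIO("x\n1_234,56"), decimal=",", thousands="_")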


@pytest.fixture
def pyarrow_xfail(request):
    """
    Fixture that xfails a test if the engine is pyarrow.

    Use if the failure is due to unsupported keywords or inconsistent results.
    """
    if "all_parsers" in request.fixturenames:
        parser = request.getfixturevalue("all_parsers")
    elif "all_parsers_all_precisions" in request.fixturenames:
        # Return value is tuple of (engine, precision)
        parser = request.getfixturevalue("all_parsers_all_precisions")[0]
    else:
        return
    if parser.engine == "pyarrow":
        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
        request.applymarker(mark)


@pytest.fixture
def pyarrow_skip(request):
    """
    Fixture that skips a test if the engine is pyarrow.

    Use if the failure is due to a parsing failure in pyarrow.csv.read_csv.
    """
    if "all_parsers" in request.fixturenames:
        parser = request.getfixturevalue("all_parsers")
    elif "all_parsers_all_precisions" in request.fixturenames:
        # Return value is tuple of (engine, precision)
        parser = request.getfixturevalue("all_parsers_all_precisions")[0]
    else:
        return
    if parser.engine == "pyarrow":
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
@ -0,0 +1,334 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os

import numpy as np
import pytest

from pandas._libs import parsers as libparsers

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Timestamp,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["a", "a", "b"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype, request):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
    )
    if parser.engine == "pyarrow":
        mark = pytest.mark.xfail(
            strict=False,
            reason="Flaky test sometimes gives object dtype instead of Categorical",
        )
        request.applymarker(mark)

    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_unsorted(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["b", "b", "a"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_missing(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["b", np.nan, "a"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
    # see gh-18186
    # was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC
    parser = all_parsers
    heuristic = 2**5
    data = np.sort([str(i) for i in range(heuristic + 1)])
    expected = DataFrame({"a": Categorical(data, ordered=True)})
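    # Lowering DEFAULT_BUFFER_HEURISTIC below makes the C parser consume the
    # column in several small chunks instead of a single buffer read, which is
    # (presumably) the condition that originally triggered gh-18186.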
    with monkeypatch.context() as m:
        m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
        actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
    actual["a"] = actual["a"].cat.reorder_categories(
        np.sort(actual.a.cat.categories), ordered=True
    )
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers
    encoding = "utf-16"
    sep = "\t"

    expected = parser.read_csv(pth, sep=sep, encoding=encoding)
    expected = expected.apply(Categorical)

    actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
        DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
    ]

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)
        return

    with parser.read_csv(
        StringIO(data), dtype={"b": "category"}, chunksize=2
    ) as actuals:
        for actual, expected in zip(actuals, expecteds):
            tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
        DataFrame(
            {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
            index=[2, 3],
        ),
    ]
    dtype = CategoricalDtype(cats)

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
        return

    with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
        for actual, expected in zip(actuals, expecteds):
            tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers
    encoding = "latin-1"

    expected = parser.read_csv(pth, header=None, encoding=encoding)
    expected[1] = Categorical(expected[1])

    actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
    tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expected = DataFrame(
        {
            "a": [1, 1, 1, 2],
            "b": Categorical(
                ["a", "b", "b", "c"], categories=categories, ordered=ordered
            ),
        }
    )

    dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_category_dtype_unsorted(all_parsers):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    dtype = CategoricalDtype(["c", "b", "a"])
    expected = DataFrame(
        {
            "a": [1, 1, 1, 2],
            "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
        }
    )

    result = parser.read_csv(StringIO(data), dtype={"b": dtype})
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_numeric(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([1, 2, 3])}

    data = "b\n1\n1\n2\n3"
    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_datetime(all_parsers):
    parser = all_parsers
    dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
    dtype = {"b": CategoricalDtype(dti)}

    data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timestamp(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([Timestamp("2014")])}

    data = "b\n2014-01-01\n2014-01-01"
    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timedelta(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))}

    data = "b\n1h\n2h\n3h"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    # see gh-20498
    parser = all_parsers
    dtype = {"b": CategoricalDtype([False, True])}
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_unexpected_categories(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

    data = "b\nd\na\nc\nd"  # Unexpected c
    expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,644 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from collections import defaultdict
from io import StringIO

import numpy as np
import pytest

from pandas.errors import ParserWarning

import pandas as pd
from pandas import (
    DataFrame,
    Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntegerArray

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
    # see gh-3795, gh-6607
    parser = all_parsers

    df = DataFrame(
        np.random.default_rng(2).random((5, 2)).round(4),
        columns=list("AB"),
        index=["1A", "1B", "1C", "1D", "1E"],
    )

    with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, dtype=dtype, index_col=0)

        if check_orig:
            expected = df.copy()
            result = result.astype(float)
        elif using_infer_string and dtype is str:
            expected = df.astype(str)
        else:
            expected = df.astype(str).astype(object)

        tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
    expected = DataFrame(
        [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
    )
    expected["one"] = expected["one"].astype(np.float64)

    result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
    tm.assert_frame_equal(result, expected)


def test_invalid_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"):
        parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})


def test_raise_on_passed_int_dtype_with_nas(all_parsers):
    # see gh-2631
    parser = all_parsers
    data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""

    if parser.engine == "c":
        msg = "Integer column has NA values"
    elif parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
    else:
        msg = "Unable to convert column DOY"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)


def test_dtype_with_converters(all_parsers):
    parser = all_parsers
    data = """a,b
1.1,2.2
1.2,2.3"""

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)}
            )
        return

    # Dtype spec is ignored if converters are specified.
    result = parser.read_csv_check_warnings(
        ParserWarning,
        "Both a converter and dtype were specified for column a "
        "- only the converter will be used.",
        StringIO(data),
        dtype={"a": "i8"},
        converters={"a": lambda x: str(x)},
    )
    expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
    data = "0\n1"
    parser = all_parsers
    expected = DataFrame([0, 1], dtype=dtype)

    result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
    tm.assert_frame_equal(expected, result)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_boolean_dtype(all_parsers):
    parser = all_parsers
    data = "\n".join(
        [
            "a",
            "True",
            "TRUE",
            "true",
            "1",
            "1.0",
            "False",
            "FALSE",
            "false",
            "0",
            "0.0",
            "NaN",
            "nan",
            "NA",
            "null",
            "NULL",
        ]
    )

    result = parser.read_csv(StringIO(data), dtype="boolean")
    expected = DataFrame(
        {
            "a": pd.array(
                [
                    True,
                    True,
                    True,
                    True,
                    True,
                    False,
                    False,
                    False,
                    False,
                    False,
                    None,
                    None,
                    None,
                    None,
                    None,
                ],
                dtype="boolean",
            )
        }
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_delimiter_with_usecols_and_parse_dates(all_parsers):
    # GH#35873
    result = all_parsers.read_csv(
        StringIO('"dump","-9,1","-9,1",20101010'),
        engine="python",
        names=["col", "col1", "col2", "col3"],
        usecols=["col1", "col2", "col3"],
        parse_dates=["col3"],
        decimal=",",
    )
    expected = DataFrame(
        {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("thousands", ["_", None])
def test_decimal_and_exponential(
    request, python_parser_only, numeric_decimal, thousands
):
    # GH#31920
    decimal_number_check(request, python_parser_only, numeric_decimal, thousands, None)


@pytest.mark.parametrize("thousands", ["_", None])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_1000_sep_decimal_float_precision(
    request, c_parser_only, numeric_decimal, float_precision, thousands
):
    # test decimal and thousands-separator handling across the
    # 'float_precision' parsers
    decimal_number_check(
        request, c_parser_only, numeric_decimal, thousands, float_precision
    )
    text, value = numeric_decimal
    text = " " + text + " "
    if isinstance(value, str):  # the negative cases (parse as text)
        value = " " + value + " "
    decimal_number_check(
        request, c_parser_only, (text, value), thousands, float_precision
    )


def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision):
    # GH#31920
    value = numeric_decimal[0]
    if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"):
        request.applymarker(
            pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}")
        )
    df = parser.read_csv(
        StringIO(value),
        float_precision=float_precision,
        sep="|",
        thousands=thousands,
        decimal=",",
        header=None,
    )
    val = df.iloc[0, 0]
    assert val == numeric_decimal[1]
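

# For illustration (hypothetical call): decimal_number_check(request, parser,
# ("1_234,56", 1234.56), "_", "high") reads the single cell "1_234,56" with
# decimal="," and thousands="_" and asserts that it parses to the float
# 1234.56, while a string expectation such as ("1,2,2", "1,2,2") asserts the
# text is left unparsed.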


@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_skip_whitespace(c_parser_only, float_precision):
    DATA = """id\tnum\t
1\t1.2 \t
1\t 2.1\t
2\t 1\t
2\t 1.2 \t
"""
    df = c_parser_only.read_csv(
        StringIO(DATA),
        float_precision=float_precision,
        sep="\t",
        header=0,
        dtype={1: np.float64},
    )
    tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))


@pytest.mark.usefixtures("pyarrow_xfail")
def test_true_values_cast_to_bool(all_parsers):
    # GH#34655
    text = """a,b
yes,xxx
no,yyy
1,zzz
0,aaa
"""
    parser = all_parsers
    result = parser.read_csv(
        StringIO(text),
        true_values=["yes"],
        false_values=["no"],
        dtype={"a": "boolean"},
    )
    expected = DataFrame(
        {"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
    )
    expected["a"] = expected["a"].astype("boolean")
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
    # GH#35211
    parser = all_parsers
    data = """a,a\n1,1"""
    dtype_dict = {"a": str, **dtypes}
    # GH#42462
    dtype_dict_copy = dtype_dict.copy()
    result = parser.read_csv(StringIO(data), dtype=dtype_dict)
    expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
    assert dtype_dict == dtype_dict_copy, "dtype dict changed"
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
    # GH#42022
    parser = all_parsers
    data = """a,a\n1,1"""
    result = parser.read_csv(StringIO(data), dtype=str)
    expected = DataFrame({"a": ["1"], "a.1": ["1"]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_multi_index(all_parsers):
    # GH 42446
    parser = all_parsers
    data = "A,B,B\nX,Y,Z\n1,2,3"

    result = parser.read_csv(
        StringIO(data),
        header=list(range(2)),
        dtype={
            ("A", "X"): np.int32,
            ("B", "Y"): np.int32,
            ("B", "Z"): np.float32,
        },
    )

    expected = DataFrame(
        {
            ("A", "X"): np.int32([1]),
            ("B", "Y"): np.int32([2]),
            ("B", "Z"): np.float32([3]),
        }
    )

    tm.assert_frame_equal(result, expected)


def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
    # GH 25472
    parser = all_parsers
    dtype = any_int_ea_dtype

    data = """a,b,c
,3,5
1,,6
2,4,"""
    expected = DataFrame(
        {
            "a": pd.array([pd.NA, 1, 2], dtype=dtype),
            "b": pd.array([3, pd.NA, 4], dtype=dtype),
            "c": pd.array([5, 6, pd.NA], dtype=dtype),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("default", ["float", "float64"])
def test_dtypes_defaultdict(all_parsers, default):
    # GH#41574
    data = """a,b
1,2
"""
    dtype = defaultdict(lambda: default, a="int64")
    parser = all_parsers
    result = parser.read_csv(StringIO(data), dtype=dtype)
    expected = DataFrame({"a": [1], "b": 2.0})
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
    # GH#41574
    data = """a,b,a,b,b.1
1,2,3,4,5
"""
    dtype = defaultdict(lambda: "float64", a="int64")
    dtype["b.1"] = "int64"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), dtype=dtype)
    expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_invalid(all_parsers):
    # GH#41574
    data = """a,b
1,2
"""
    dtype = defaultdict(lambda: "invalid_dtype", a="int64")
    parser = all_parsers
    with pytest.raises(TypeError, match="not understood"):
        parser.read_csv(StringIO(data), dtype=dtype)


def test_dtype_backend(all_parsers):
    # GH#36712

    parser = all_parsers

    data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
    result = parser.read_csv(
        StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"]
    )
    expected = DataFrame(
        {
            "a": pd.Series([1, 3], dtype="Int64"),
            "b": pd.Series([2.5, 4.5], dtype="Float64"),
            "c": pd.Series([True, False], dtype="boolean"),
            "d": pd.Series(["a", "b"], dtype="string"),
            "e": pd.Series([pd.NA, 6], dtype="Int64"),
            "f": pd.Series([pd.NA, 7.5], dtype="Float64"),
            "g": pd.Series([pd.NA, True], dtype="boolean"),
            "h": pd.Series([pd.NA, "a"], dtype="string"),
            "i": pd.Series([Timestamp("2019-12-31")] * 2),
            "j": pd.Series([pd.NA, pd.NA], dtype="Int64"),
        }
    )
    tm.assert_frame_equal(result, expected)


def test_dtype_backend_and_dtype(all_parsers):
    # GH#36712

    parser = all_parsers

    data = """a,b
1,2.5
,
"""
    result = parser.read_csv(
        StringIO(data), dtype_backend="numpy_nullable", dtype="float64"
    )
    expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]})
    tm.assert_frame_equal(result, expected)


def test_dtype_backend_string(all_parsers, string_storage):
    # GH#36712
    with pd.option_context("mode.string_storage", string_storage):
        parser = all_parsers

        data = """a,b
a,x
b,
"""
        result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")

        expected = DataFrame(
            {
                "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
                "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
            },
        )
        tm.assert_frame_equal(result, expected)


def test_dtype_backend_ea_dtype_specified(all_parsers):
    # GH#491496
    data = """a,b
1,2
"""
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), dtype="Int64", dtype_backend="numpy_nullable"
    )
    expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
    tm.assert_frame_equal(result, expected)


def test_dtype_backend_pyarrow(all_parsers, request):
    # GH#36712
    pa = pytest.importorskip("pyarrow")
    parser = all_parsers

    data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
    result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"])
    expected = DataFrame(
        {
            "a": pd.Series([1, 3], dtype="int64[pyarrow]"),
            "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
            "c": pd.Series([True, False], dtype="bool[pyarrow]"),
            "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
            "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
            "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
            "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
            "h": pd.Series(
                [pd.NA, "a"],
                dtype=pd.ArrowDtype(pa.string()),
            ),
            "i": pd.Series([Timestamp("2019-12-31")] * 2),
            "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
        }
    )
    tm.assert_frame_equal(result, expected)


# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
    # GH#32134
    parser = all_parsers
    data = """a,b
1,1
,1
1582218195625938945,1
"""
    result = parser.read_csv(StringIO(data), dtype={"a": "Int64"})
    expected = DataFrame(
        {
            "a": IntegerArray(
                np.array([1, 1, 1582218195625938945]), np.array([False, True, False])
            ),
            "b": 1,
        }
    )
    tm.assert_frame_equal(result, expected)


def test_string_inference(all_parsers):
    # GH#54430
    dtype = pd.StringDtype(na_value=np.nan)

    data = """a,b
x,1
y,2
,3"""
    parser = all_parsers
    with pd.option_context("future.infer_string", True):
        result = parser.read_csv(StringIO(data))

    expected = DataFrame(
        {"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
        columns=pd.Index(["a", "b"], dtype=dtype),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
    # GH#56047
    data = """a,b
x,a
y,a
z,a"""
    parser = all_parsers
    with pd.option_context("future.infer_string", True):
        result = parser.read_csv(StringIO(data), dtype=dtype)

    expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
    expected = DataFrame(
        {
            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
            "b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
        },
        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
    )
    tm.assert_frame_equal(result, expected)

    with pd.option_context("future.infer_string", True):
        result = parser.read_csv(StringIO(data), dtype={"a": dtype})

    expected = DataFrame(
        {
            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
            "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)),
        },
        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
    # GH#52505
    data = """SYMBOL,MOMENT,ID,ID_DEAL
AAPL,20230301181139587,1925036343869802844,
AAPL,20230301181139587,2023552585717889863,2023552585717263358
NVDA,20230301181139587,2023552585717889863,2023552585717263359
AMC,20230301181139587,2023552585717889863,2023552585717263360
AMZN,20230301181139587,2023552585717889759,2023552585717263360
MSFT,20230301181139587,2023552585717889863,2023552585717263361
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
    orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2


def test_dtypes_with_usecols(all_parsers):
    # GH#54868

    parser = all_parsers
    data = """a,b,c
1,2,3
4,5,6"""

    result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object})
    if parser.engine == "pyarrow":
        values = [1, 4]
    else:
        values = ["1", "4"]
    expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
    tm.assert_frame_equal(result, expected)


def test_index_col_with_dtype_no_rangeindex(all_parsers):
    data = StringIO("345.5,519.5,0\n519.5,726.5,1")
    result = all_parsers.read_csv(
        data,
        header=None,
        names=["start", "stop", "bin_id"],
        dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
        index_col="bin_id",
    ).index
    expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
    tm.assert_index_equal(result, expected)
@ -0,0 +1,181 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    Categorical,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    concat,
)
import pandas._testing as tm

skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_dtype_all_columns_empty(all_parsers):
    # see gh-12048
    parser = all_parsers
    result = parser.read_csv(StringIO("A,B"), dtype=str)

    expected = DataFrame({"A": [], "B": []}, dtype=str)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)},
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(
        StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"}
    )

    expected = DataFrame(
        {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one")
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_multi_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two,three"
    result = parser.read_csv(
        StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"}
    )

    exp_idx = MultiIndex.from_arrays(
        [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)],
        names=["one", "two"],
    )
    expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat(
        [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
        axis=1,
    )

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
    tm.assert_frame_equal(result, expected)


def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat(
        [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
        axis=1,
    )
    expected.index = expected.index.astype(object)

    with pytest.raises(ValueError, match="Duplicate names"):
        data = ""
        parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})


@pytest.mark.parametrize(
    "dtype,expected",
    [
        (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
        (
            "category",
            DataFrame({"a": Categorical([]), "b": Categorical([])}),
        ),
        (
            {"a": "category", "b": "category"},
            DataFrame({"a": Categorical([]), "b": Categorical([])}),
        ),
        ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
        (
            "timedelta64[ns]",
            DataFrame(
                {
                    "a": Series([], dtype="timedelta64[ns]"),
                    "b": Series([], dtype="timedelta64[ns]"),
                },
            ),
        ),
        (
            {"a": np.int64, "b": np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
        (
            {0: np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
        (
            {"a": np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
    ],
)
@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_dtype(all_parsers, dtype, expected):
    # see gh-14712
    parser = all_parsers
    data = "a,b"

    result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,647 @@
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to move as many of these tests
as possible out of this module once the Python parser can accept further
arguments when parsing.
"""
from decimal import Decimal
from io import (
    BytesIO,
    StringIO,
    TextIOWrapper,
)
import mmap
import os
import tarfile

import numpy as np
import pytest

from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
    ParserError,
    ParserWarning,
)
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm


@pytest.mark.parametrize(
    "malformed",
    ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
    ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
    # see gh-9205: test certain malformed input files that cause
    # buffer overflows in tokenizer.c
    msg = "Buffer overflow caught - possible malformed input file."
    parser = c_parser_only

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(malformed))


def test_delim_whitespace_custom_terminator(c_parser_only):
    # See gh-12912
    data = "a b c~1 2 3~4 5 6~7 8 9"
    parser = c_parser_only

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
    tm.assert_frame_equal(df, expected)


def test_dtype_and_names_error(c_parser_only):
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    parser = c_parser_only
    data = """
1.0 1
2.0 2
3.0 3
"""
    # base cases
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # fallback casting
    result = parser.read_csv(
        StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
    )
    expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    warning = RuntimeWarning if np_version_gte1p24 else None
    with pytest.raises(ValueError, match="cannot safely convert"):
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            parser.read_csv(
                StringIO(data),
                sep=r"\s+",
                header=None,
                names=["a", "b"],
                dtype={"a": np.int32},
            )


@pytest.mark.parametrize(
    "match,kwargs",
    [
        # For each of these cases, all of the dtypes are valid, just unsupported.
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}},
        ),
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]},
        ),
        (
            "the dtype timedelta64 is not supported for parsing",
            {"dtype": {"A": "timedelta64", "B": "float64"}},
        ),
        (
            f"the dtype {tm.ENDIAN}U8 is not supported for parsing",
            {"dtype": {"A": "U8"}},
        ),
    ],
    ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
    parser = c_parser_only
    df = DataFrame(
        np.random.default_rng(2).random((5, 2)),
        columns=list("AB"),
        index=["1A", "1B", "1C", "1D", "1E"],
    )

    with tm.ensure_clean("__unsupported_dtype__.csv") as path:
        df.to_csv(path)

        with pytest.raises(TypeError, match=match):
            parser.read_csv(path, index_col=0, **kwargs)


@td.skip_if_32bit
@pytest.mark.slow
# test numbers between 1 and 2
@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
def test_precise_conversion(c_parser_only, num):
    parser = c_parser_only

    normal_errors = []
    precise_errors = []
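
    # Formatting with f"{val:.100}" below prints enough significant digits to
    # capture the float's exact binary value (for numbers in [1, 2] the exact
    # decimal expansion fits well within 100 digits), so the Decimal
    # subtraction measures the true parsing error of each engine.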
    def error(val: float, actual_val: Decimal) -> Decimal:
        return abs(Decimal(f"{val:.100}") - actual_val)

    # 25 decimal digits of precision
    text = f"a\n{num:.25}"

    normal_val = float(
        parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
    )
    precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
    roundtrip_val = float(
        parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
    )
    actual_val = Decimal(text[2:])

    normal_errors.append(error(normal_val, actual_val))
    precise_errors.append(error(precise_val, actual_val))

    # round-trip should match float()
    assert roundtrip_val == float(text[2:])

    assert sum(precise_errors) <= sum(normal_errors)
    assert max(precise_errors) <= max(normal_errors)


def test_usecols_dtypes(c_parser_only, using_infer_string):
    parser = c_parser_only
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

    result = parser.read_csv(
        StringIO(data),
        usecols=(0, 1, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )
    result2 = parser.read_csv(
        StringIO(data),
        usecols=(0, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )

    if using_infer_string:
        assert (result.dtypes == ["string", int, float]).all()
        assert (result2.dtypes == ["string", float]).all()
    else:
        assert (result.dtypes == [object, int, float]).all()
        assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
    # see gh-2090

    parser = c_parser_only
    data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

    result = parser.read_csv(StringIO(data), dtype=object)
    assert (result.dtypes == object).all()

    result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
    assert result["B"][2] == ""


def test_custom_lineterminator(c_parser_only):
    parser = c_parser_only
    data = "a,b,c~1,2,3~4,5,6"

    result = parser.read_csv(StringIO(data), lineterminator="~")
    expected = parser.read_csv(StringIO(data.replace("~", "\n")))

    tm.assert_frame_equal(result, expected)


def test_parse_ragged_csv(c_parser_only):
    parser = c_parser_only
    data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    result = parser.read_csv(
        StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
    )

    expected = parser.read_csv(
        StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
    )

    tm.assert_frame_equal(result, expected)

    # too many columns, cause segfault if not careful
    data = "1,2\n3,4,5"

    result = parser.read_csv(StringIO(data), header=None, names=range(50))
    expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
        columns=range(50)
    )

    tm.assert_frame_equal(result, expected)


def test_tokenize_CR_with_quoting(c_parser_only):
    # see gh-3453
    parser = c_parser_only
    data = ' a,b,c\r"a,b","e,d","f,f"'

    result = parser.read_csv(StringIO(data), header=None)
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data))
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
def test_grow_boundary_at_cap(c_parser_only, count):
    # See gh-12494
    #
    # Cause of error was that the C parser
    # was not increasing the buffer size when
    # the desired space would fill the buffer
    # to capacity, which would later cause a
    # buffer overflow error when checking the
    # EOF terminator of the CSV stream.
    # 3 * 2^n commas was observed to break the parser
    parser = c_parser_only

    with StringIO("," * count) as s:
        expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
        df = parser.read_csv(s)
    tm.assert_frame_equal(df, expected)


@pytest.mark.slow
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_parse_trim_buffers(c_parser_only, encoding):
    # This test is part of a bugfix for gh-13703. It attempts
    # to stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of parser otherwise modify the contents
    # of memory space, where it was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    # Also force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.

    parser = c_parser_only

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = (
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
    )

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the `n_lines` times.
    row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
    expected = DataFrame(
        [row for _ in range(n_lines)], dtype=object, columns=None, index=None
    )

    # Iterate over the CSV file in chunks of `chunksize` lines
    with parser.read_csv(
        StringIO(csv_data),
        header=None,
        dtype=object,
        chunksize=chunksize,
        encoding=encoding,
    ) as chunks_:
        result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_internal_null_byte(c_parser_only):
|
||||
# see gh-14012
|
||||
#
|
||||
# The null byte ('\x00') should not be used as a
|
||||
# true line terminator, escape character, or comment
|
||||
# character, only as a placeholder to indicate that
|
||||
# none was specified.
|
||||
#
|
||||
# This test should be moved to test_common.py ONLY when
|
||||
# Python's csv class supports parsing '\x00'.
|
||||
parser = c_parser_only
|
||||
|
||||
names = ["a", "b", "c"]
|
||||
data = "1,2,3\n4,\x00,6\n7,8,9"
|
||||
expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_nrows_large(c_parser_only):
|
||||
# gh-7626 - Read only nrows of data in for large inputs (>262144b)
|
||||
parser = c_parser_only
|
||||
header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
|
||||
data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
|
||||
header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
|
||||
data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
|
||||
test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2
|
||||
|
||||
df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)
|
||||
|
||||
assert df.size == 1010 * 10
|
||||
|
||||
|
||||
def test_float_precision_round_trip_with_text(c_parser_only):
|
||||
# see gh-15140
|
||||
parser = c_parser_only
|
||||
df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
|
||||
tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
|
||||
|
||||
|
||||
def test_large_difference_in_columns(c_parser_only):
|
||||
# see gh-14125
|
||||
parser = c_parser_only
|
||||
|
||||
count = 10000
|
||||
large_row = ("X," * count)[:-1] + "\n"
|
||||
normal_row = "XXXXXX XXXXXX,111111111111111\n"
|
||||
test_input = (large_row + normal_row * 6)[:-1]
|
||||
|
||||
result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
|
||||
rows = test_input.split("\n")
|
||||
|
||||
expected = DataFrame([row.split(",")[0] for row in rows])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_after_quote(c_parser_only):
|
||||
# see gh-15910
|
||||
parser = c_parser_only
|
||||
|
||||
data = 'a\n1\n"b"a'
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame({"a": ["1", "ba"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_whitespace_delimited(c_parser_only):
|
||||
parser = c_parser_only
|
||||
test_input = """\
|
||||
1 2
|
||||
2 2 3
|
||||
3 2 3 # 3 fields
|
||||
4 2 3# 3 fields
|
||||
5 2 # 2 fields
|
||||
6 2# 2 fields
|
||||
7 # 1 field, NaN
|
||||
8# 1 field, NaN
|
||||
9 2 3 # skipped line
|
||||
# comment"""
|
||||
with tm.assert_produces_warning(
|
||||
ParserWarning, match="Skipping line", check_stacklevel=False
|
||||
):
|
||||
df = parser.read_csv(
|
||||
StringIO(test_input),
|
||||
comment="#",
|
||||
header=None,
|
||||
delimiter="\\s+",
|
||||
skiprows=0,
|
||||
on_bad_lines="warn",
|
||||
)
|
||||
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
def test_file_like_no_next(c_parser_only):
|
||||
# gh-16530: the file-like need not have a "next" or "__next__"
|
||||
# attribute despite having an "__iter__" attribute.
|
||||
#
|
||||
# NOTE: This is only true for the C engine, not Python engine.
|
||||
class NoNextBuffer(StringIO):
|
||||
def __next__(self):
|
||||
raise AttributeError("No next method")
|
||||
|
||||
next = __next__
|
||||
|
||||
parser = c_parser_only
|
||||
data = "a\n1"
|
||||
|
||||
expected = DataFrame({"a": [1]})
|
||||
result = parser.read_csv(NoNextBuffer(data))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
|
||||
# see gh-22748
|
||||
t = BytesIO(b"\xB0")
|
||||
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
|
||||
msg = "'utf-8' codec can't encode character"
|
||||
with pytest.raises(UnicodeError, match=msg):
|
||||
c_parser_only.read_csv(t, encoding="UTF-8")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
|
||||
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
|
||||
# see gh-16530
|
||||
#
|
||||
# Unfortunately, Python's CSV library can't handle
|
||||
# tarfile objects (expects string, not bytes when
|
||||
# iterating through a file-like).
|
||||
parser = c_parser_only
|
||||
tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
|
||||
|
||||
with tarfile.open(tar_path, "r") as tar:
|
||||
data_file = tar.extractfile("tar_data.csv")
|
||||
|
||||
out = parser.read_csv(data_file)
|
||||
expected = DataFrame({"a": [1]})
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
|
||||
def test_chunk_whitespace_on_boundary(c_parser_only):
|
||||
# see gh-9735: this issue is C parser-specific (bug when
|
||||
# parsing whitespace and characters at chunk boundary)
|
||||
#
|
||||
# This test case has a field too large for the Python parser / CSV library.
|
||||
parser = c_parser_only
|
||||
|
||||
chunk1 = "a" * (1024 * 256 - 2) + "\na"
|
||||
chunk2 = "\n a"
|
||||
    result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)

    expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
    tm.assert_frame_equal(result, expected)


def test_file_handles_mmap(c_parser_only, csv1):
    # gh-14418
    #
    # Don't close user provided file handles.
    parser = c_parser_only

    with open(csv1, encoding="utf-8") as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            parser.read_csv(m)
            assert not m.closed


def test_file_binary_mode(c_parser_only):
    # see gh-23779
    parser = c_parser_only
    expected = DataFrame([[1, 2, 3], [4, 5, 6]])

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as f:
            f.write("1,2,3\n4,5,6")

        with open(path, "rb") as f:
            result = parser.read_csv(f, header=None)
            tm.assert_frame_equal(result, expected)


def test_unix_style_breaks(c_parser_only):
    # GH 11020
    parser = c_parser_only
    with tm.ensure_clean() as path:
        with open(path, "w", newline="\n", encoding="utf-8") as f:
            f.write("blah\n\ncol_1,col_2,col_3\n\n")
        result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
    expected = DataFrame(columns=["col_1", "col_2", "col_3"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    parser = c_parser_only
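    # Both parametrized inputs encode the same numbers, once with US-style
    # separators (thousands=",", decimal=".") and once European-style, so a
    # single expected frame covers both cases.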
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    result = parser.read_csv(
        StringIO(data),
        sep="|",
        thousands=thousands,
        decimal=decimal,
        float_precision=float_precision,
    )
    tm.assert_frame_equal(result, expected)


def test_float_precision_options(c_parser_only):
    # GH 17154, 36228
    parser = c_parser_only
    s = "foo\n243.164\n"
    df = parser.read_csv(StringIO(s))
    df2 = parser.read_csv(StringIO(s), float_precision="high")

    tm.assert_frame_equal(df, df2)

    df3 = parser.read_csv(StringIO(s), float_precision="legacy")
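    # "legacy" presumably selects the older, faster float converter, which
    # can differ from the default high-precision parser in the last bits of
    # the parsed double -- hence the inequality assertion below.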

    assert not df.iloc[0, 0] == df3.iloc[0, 0]

    msg = "Unrecognized float_precision option: junk"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(s), float_precision="junk")
@ -0,0 +1,227 @@
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import DataFrame
import pandas._testing as tm


@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", na_values=na_values)
        return
    result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
)
def test_line_comment(all_parsers, read_kwargs, request):
    parser = all_parsers
    data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
    warn = None
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if read_kwargs.get("delim_whitespace"):
        data = data.replace(",", " ")
        warn = FutureWarning
    elif read_kwargs.get("lineterminator"):
        data = data.replace("\n", read_kwargs.get("lineterminator"))

    read_kwargs["comment"] = "#"
    if parser.engine == "pyarrow":
        if "lineterminator" in read_kwargs:
            msg = (
                "The 'lineterminator' option is not supported with the 'pyarrow' engine"
            )
        else:
            msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                warn, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), **read_kwargs)
        return
    elif parser.engine == "python" and read_kwargs.get("lineterminator"):
        msg = r"Custom line terminators not supported in python parser \(yet\)"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                warn, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), **read_kwargs)
        return

    with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
        result = parser.read_csv(StringIO(data), **read_kwargs)

    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows(all_parsers):
    parser = all_parsers
    data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # This should ignore the first four lines (including comments).
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", skiprows=4)
        return

    result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
    tm.assert_frame_equal(result, expected)


def test_comment_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Header should begin at the second non-comment line.
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", header=1)
        return
    result = parser.read_csv(StringIO(data), comment="#", header=1)
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Skiprows should skip the first 4 lines (including comments),
    # while header should start from the second non-commented line,
    # starting with line 5.
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
        return

    result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
    parser = all_parsers
    data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"

    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data.replace("#", comment_char)), comment=comment_char
            )
        return
    result = parser.read_csv(
        StringIO(data.replace("#", comment_char)), comment=comment_char
    )

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
    # see gh-4623
    parser = all_parsers
    data = "# notes\na,b,c\n# more notes\n1,2,3"

    if header is None:
        expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
    else:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])

    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", header=header)
        return
    result = parser.read_csv(StringIO(data), comment="#", header=header)
    tm.assert_frame_equal(result, expected)


def test_comment_char_in_default_value(all_parsers, request):
    # GH#34002
    if all_parsers.engine == "c":
        reason = "see gh-34002: works on the python engine but not the c engine"
        # NA value containing comment char is interpreted as comment
        request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError))
    parser = all_parsers

    data = (
        "# this is a comment\n"
        "col1,col2,col3,col4\n"
        "1,2,3,4#inline comment\n"
        "4,5#,6,10\n"
        "7,8,#N/A,11\n"
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
        return
    result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
    expected = DataFrame(
        {
            "col1": [1, 4, 7],
            "col2": [2, 5, 8],
            "col3": [3.0, np.nan, np.nan],
            "col4": [4.0, np.nan, 11.0],
        }
    )
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,211 @@
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""

import os
from pathlib import Path
import tarfile
import zipfile

import pytest

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture(params=[True, False])
def buffer(request):
    return request.param


@pytest.fixture
def parser_and_data(all_parsers, csv1):
    parser = all_parsers

    with open(csv1, "rb") as f:
        data = f.read()
    expected = parser.read_csv(csv1)

    return parser, data, expected


@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
    parser, data, expected = parser_and_data
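    # The "zip2" case reads from an already-open binary buffer with an
    # explicit compression="zip", since compression cannot be inferred
    # from a buffer the way it is from a filename.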

    with tm.ensure_clean("test_file.zip") as path:
        with zipfile.ZipFile(path, mode="w") as tmp:
            tmp.writestr("test_file", data)

        if compression == "zip2":
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression="zip")
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
    parser, data, expected = parser_and_data

    with tm.ensure_clean("combined_zip.zip") as path:
        inner_file_names = ["test_file", "second_file"]

        with zipfile.ZipFile(path, mode="w") as tmp:
            for file_name in inner_file_names:
                tmp.writestr(file_name, data)

        with pytest.raises(ValueError, match="Multiple files"):
            parser.read_csv(path, compression=compression)


def test_zip_error_no_files(parser_and_data):
    parser, _, _ = parser_and_data

    with tm.ensure_clean() as path:
        with zipfile.ZipFile(path, mode="w"):
            pass

        with pytest.raises(ValueError, match="Zero files"):
            parser.read_csv(path, compression="zip")


def test_zip_error_invalid_zip(parser_and_data):
    parser, _, _ = parser_and_data

    with tm.ensure_clean() as path:
        with open(path, "rb") as f:
            with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"):
                parser.read_csv(f, compression="zip")


@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(
    request,
    parser_and_data,
    compression_only,
    buffer,
    filename,
    compression_to_extension,
):
    parser, data, expected = parser_and_data
    compress_type = compression_only

    ext = compression_to_extension[compress_type]
    filename = filename if filename is None else filename.format(ext=ext)

    if filename and buffer:
        request.applymarker(
            pytest.mark.xfail(
                reason="Cannot deduce compression from buffer of compressed data."
            )
        )

    with tm.ensure_clean(filename=filename) as path:
        tm.write_to_compressed(compress_type, path, data)
        compression = "infer" if filename else compress_type

        if buffer:
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression=compression)
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
    # see gh-9770
    parser = all_parsers
    kwargs = {"index_col": 0, "parse_dates": True}

    expected = parser.read_csv(csv1, **kwargs)
    kwargs["compression"] = "infer"

    if buffer:
        with open(csv1, encoding="utf-8") as f:
            result = parser.read_csv(f, **kwargs)
    else:
        ext = "." + ext if ext else ""
        result = parser.read_csv(csv1 + ext, **kwargs)

    tm.assert_frame_equal(result, expected)


def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
    # see gh-18071, gh-24130
    parser = all_parsers
    encoding = encoding_fmt.format(utf_value)
    path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")

    result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
    expected = DataFrame(
        {
            "Country": ["Venezuela", "Venezuela"],
            "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
        }
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
    parser = all_parsers
    compress_kwargs = {"compression": invalid_compression}

    msg = f"Unrecognized compression type: {invalid_compression}"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv("test_file.zip", **compress_kwargs)


def test_compression_tar_archive(all_parsers, csv_dir_path):
    parser = all_parsers
    path = os.path.join(csv_dir_path, "tar_csv.tar.gz")
    df = parser.read_csv(path)
    assert list(df.columns) == ["a"]


def test_ignore_compression_extension(all_parsers):
    parser = all_parsers
    df = DataFrame({"a": [0, 1]})
    with tm.ensure_clean("test.csv") as path_csv:
        with tm.ensure_clean("test.csv.zip") as path_zip:
            # make sure to create un-compressed file with zip extension
            df.to_csv(path_csv, index=False)
            Path(path_zip).write_text(
                Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8"
            )

            tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df)


def test_writes_tar_gz(all_parsers):
    parser = all_parsers
    data = DataFrame(
        {
            "Country": ["Venezuela", "Venezuela"],
            "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
        }
    )
    with tm.ensure_clean("test.tar.gz") as tar_path:
        data.to_csv(tar_path, index=False)

        # test that read_csv infers .tar.gz to gzip:
        tm.assert_frame_equal(parser.read_csv(tar_path), data)

        # test that file is indeed gzipped:
        with tarfile.open(tar_path, "r:gz") as tar:
            result = parser.read_csv(
                tar.extractfile(tar.getnames()[0]), compression="infer"
            )
            tm.assert_frame_equal(result, data)
@ -0,0 +1,36 @@
import numpy as np
import pytest

from pandas.errors import DtypeWarning

import pandas._testing as tm
from pandas.core.arrays import ArrowExtensionArray

from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks


def test_concatenate_chunks_pyarrow():
    # GH#51876
    pa = pytest.importorskip("pyarrow")
    chunks = [
        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
        {0: ArrowExtensionArray(pa.array([1, 2]))},
    ]
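    # Mixed float and int chunks of the same column should concatenate into
    # a single float64-backed ArrowExtensionArray (the ints are upcast, so
    # no DtypeWarning is expected here).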
    result = _concatenate_chunks(chunks)
    expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
    tm.assert_extension_array_equal(result[0], expected)


def test_concatenate_chunks_pyarrow_strings():
    # GH#51876
    pa = pytest.importorskip("pyarrow")
    chunks = [
        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
        {0: ArrowExtensionArray(pa.array(["a", "b"]))},
    ]
    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
        result = _concatenate_chunks(chunks)
    expected = np.concatenate(
        [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
    )
    tm.assert_numpy_array_equal(result[0], expected)
@ -0,0 +1,263 @@
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

from dateutil.parser import parse
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm


def test_converters_type_must_be_dict(all_parsers):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
"""
    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), converters=0)
        return
    with pytest.raises(TypeError, match="Type converters.+"):
        parser.read_csv(StringIO(data), converters=0)


@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
    "converter", [parse, lambda x: int(x.split("/")[2])]  # Produce integer.
)
def test_converters(all_parsers, column, converter):
    parser = all_parsers
    data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), converters={column: converter})
        return

    result = parser.read_csv(StringIO(data), converters={column: converter})

    expected = parser.read_csv(StringIO(data))
    expected["D"] = expected["D"].map(converter)

    tm.assert_frame_equal(result, expected)


def test_converters_no_implicit_conv(all_parsers):
    # see gh-2184
    parser = all_parsers
    data = """000102,1.2,A\n001245,2,B"""

    converters = {0: lambda x: x.strip()}

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), header=None, converters=converters)
        return

    result = parser.read_csv(StringIO(data), header=None, converters=converters)

    # Column 0 should not be cast to numeric and should remain as object.
    expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
    tm.assert_frame_equal(result, expected)


def test_converters_euro_decimal_format(all_parsers):
    # see gh-583
    converters = {}
    parser = all_parsers

    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""
    converters["Number1"] = converters["Number2"] = converters[
        "Number3"
    ] = lambda x: float(x.replace(",", "."))

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=";", converters=converters)
        return

    result = parser.read_csv(StringIO(data), sep=";", converters=converters)
    expected = DataFrame(
        [
            [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
            [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
            [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
        ],
        columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
    )
    tm.assert_frame_equal(result, expected)


def test_converters_corner_with_nans(all_parsers):
    parser = all_parsers
    data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    # Example converters.
    def convert_days(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    def convert_days_sentinel(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    def convert_score(x):
        x = x.strip()

        if not x:
            return np.nan

        if x.find("-") > 0:
            val_min, val_max = map(int, x.split("-"))
            val = 0.5 * (val_min + val_max)
        else:
            val = float(x)

        return val

    results = []

    for day_converter in [convert_days, convert_days_sentinel]:
        if parser.engine == "pyarrow":
            msg = "The 'converters' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(
                    StringIO(data),
                    converters={"score": convert_score, "days": day_converter},
                    na_values=["", None],
                )
            continue

        result = parser.read_csv(
            StringIO(data),
            converters={"score": convert_score, "days": day_converter},
            na_values=["", None],
        )
        assert pd.isna(result["days"][1])
        results.append(result)

    if parser.engine != "pyarrow":
        tm.assert_frame_equal(results[0], results[1])


@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
    # see gh-1835 , GH#40589
    parser = all_parsers
    data = "A;B\n1;2\n3;4"
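    # The converter runs on column "A" before it becomes the index, so the
    # index keeps the converted (string) values even for the identity lambda.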

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
            )
        return

    rs = parser.read_csv(
        StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
    )

    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
    tm.assert_frame_equal(rs, xp)


def test_converter_identity_object(all_parsers):
    # GH#40589
    parser = all_parsers
    data = "A,B\n1,2\n3,4"

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), converters={"A": lambda x: x})
        return

    rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})

    xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
    tm.assert_frame_equal(rs, xp)


def test_converter_multi_index(all_parsers):
    # GH 42446
    parser = all_parsers
    data = "A,B,B\nX,Y,Z\n1,2,3"

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                header=list(range(2)),
                converters={
                    ("A", "X"): np.int32,
                    ("B", "Y"): np.int32,
                    ("B", "Z"): np.float32,
                },
            )
        return

    result = parser.read_csv(
        StringIO(data),
        header=list(range(2)),
        converters={
            ("A", "X"): np.int32,
            ("B", "Y"): np.int32,
            ("B", "Z"): np.float32,
        },
    )

    expected = DataFrame(
        {
            ("A", "X"): np.int32([1]),
            ("B", "Y"): np.int32([2]),
            ("B", "Z"): np.float32([3]),
        }
    )

    tm.assert_frame_equal(result, expected)
@ -0,0 +1,195 @@
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import csv
from io import StringIO

import pytest

from pandas.errors import ParserWarning

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture
def custom_dialect():
    dialect_name = "weird"
    dialect_kwargs = {
        "doublequote": False,
        "escapechar": "~",
        "delimiter": ":",
        "skipinitialspace": False,
        "quotechar": "`",
        "quoting": 3,
    }
    return dialect_name, dialect_kwargs


def test_dialect(all_parsers):
    parser = all_parsers
    data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

    dia = csv.excel()
    dia.quoting = csv.QUOTE_NONE
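    # With QUOTE_NONE the quotechar is read back as literal data, so the
    # stray '"' in the first data row survives into the parsed value (see
    # the patched-up exp frame below).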

    if parser.engine == "pyarrow":
        msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), dialect=dia)
        return

    df = parser.read_csv(StringIO(data), dialect=dia)

    data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
    exp = parser.read_csv(StringIO(data))
    exp.replace("a", '"a', inplace=True)
    tm.assert_frame_equal(df, exp)


def test_dialect_str(all_parsers):
    dialect_name = "mydialect"
    parser = all_parsers
    data = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
    exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})

    with tm.with_csv_dialect(dialect_name, delimiter=":"):
        if parser.engine == "pyarrow":
            msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(StringIO(data), dialect=dialect_name)
            return

        df = parser.read_csv(StringIO(data), dialect=dialect_name)
        tm.assert_frame_equal(df, exp)


def test_invalid_dialect(all_parsers):
    class InvalidDialect:
        pass

    data = "a\n1"
    parser = all_parsers
    msg = "Invalid dialect"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dialect=InvalidDialect)


@pytest.mark.parametrize(
    "arg",
    [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
)
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    warning_klass = None
    kwds = {}

    # arg=None tests when we pass in the dialect without any other arguments.
    if arg is not None:
        if value == "dialect":  # No conflict --> no warning.
            kwds[arg] = dialect_kwargs[arg]
        elif value == "default":  # Default --> no warning.
            from pandas.io.parsers.base_parser import parser_defaults

            kwds[arg] = parser_defaults[arg]
        else:  # Non-default + conflict with dialect --> warning.
            warning_klass = ParserWarning
            kwds[arg] = "blah"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        if parser.engine == "pyarrow":
            msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv_check_warnings(
                    # No warning bc we raise
                    None,
                    "Conflicting values for",
                    StringIO(data),
                    dialect=dialect_name,
                    **kwds,
                )
            return
        result = parser.read_csv_check_warnings(
            warning_klass,
            "Conflicting values for",
            StringIO(data),
            dialect=dialect_name,
            **kwds,
        )
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,warning_klass",
    [
        ({"sep": ","}, None),  # sep is default --> sep_override=True
        ({"sep": "."}, ParserWarning),  # sep isn't default --> sep_override=False
        ({"delimiter": ":"}, None),  # No conflict
        ({"delimiter": None}, None),  # Default arguments --> sep_override=True
        ({"delimiter": ","}, ParserWarning),  # Conflict
        ({"delimiter": "."}, ParserWarning),  # Conflict
    ],
    ids=[
        "sep-override-true",
        "sep-override-false",
        "delimiter-no-conflict",
        "delimiter-default-arg",
        "delimiter-conflict",
        "delimiter-conflict2",
    ],
)
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        if parser.engine == "pyarrow":
            msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv_check_warnings(
                    # no warning bc we raise
                    None,
                    "Conflicting values for 'delimiter'",
                    StringIO(data),
                    dialect=dialect_name,
                    **kwargs,
                )
            return
        result = parser.read_csv_check_warnings(
            warning_klass,
            "Conflicting values for 'delimiter'",
            StringIO(data),
            dialect=dialect_name,
            **kwargs,
        )
        tm.assert_frame_equal(result, expected)
@ -0,0 +1,337 @@
"""
Tests encoding functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import (
    BytesIO,
    TextIOWrapper,
)
import os
import tempfile
import uuid

import numpy as np
import pytest

from pandas import (
    DataFrame,
    read_csv,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_bytes_io_input(all_parsers):
    encoding = "cp1255"
    parser = all_parsers

    data = BytesIO("שלום:1234\n562:123".encode(encoding))
    result = parser.read_csv(data, sep=":", encoding=encoding)

    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_read_csv_unicode(all_parsers):
    parser = all_parsers
    data = BytesIO("\u0141aski, Jan;1".encode())

    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
    expected = DataFrame([["\u0141aski, Jan", 1]])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
    # see gh-2298
    parser = all_parsers
    data = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(
        ",", sep
    )
    path = f"__{uuid.uuid4()}__.csv"
    kwargs = {"sep": sep, "skiprows": 2}
    utf8 = "utf-8"

    with tm.ensure_clean(path) as path:
        bytes_data = data.encode(encoding)

        with open(path, "wb") as f:
            f.write(bytes_data)

        with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
            result = parser.read_csv(path, encoding=encoding, **kwargs)
            expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
        tm.assert_frame_equal(result, expected)


def test_utf16_example(all_parsers, csv_dir_path):
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers
    result = parser.read_csv(path, encoding="utf-16", sep="\t")
    assert len(result) == 50


def test_unicode_encoding(all_parsers, csv_dir_path):
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers

    result = parser.read_csv(path, header=None, encoding="latin-1")
    result = result.set_index(0)
    got = result[1][1632]

    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
    assert got == expected


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # Basic test
        ("a\n1", {}, DataFrame({"a": [1]})),
        # "Regular" quoting
        ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),
        # Test in a data row instead of header
        ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),
        # Test in empty data row with skipping
        ("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
        # Test in empty data row without skipping
        (
            "\n1",
            {"names": ["a"], "skip_blank_lines": False},
            DataFrame({"a": [np.nan, 1]}),
        ),
    ],
)
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
    # see gh-4793
    parser = all_parsers
    bom = "\ufeff"
    utf8 = "utf-8"
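    # read_csv should detect and strip the UTF-8 BOM, so each case matches
    # the BOM-less expected frame from the parametrization above.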

    def _encode_data_with_bom(_data):
        bom_data = (bom + _data).encode(utf8)
        return BytesIO(bom_data)

    if (
        parser.engine == "pyarrow"
        and data == "\n1"
        and kwargs.get("skip_blank_lines", True)
    ):
        # CSV parse error: Empty CSV file or block: cannot infer number of columns
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
    # see gh-13549
    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
    parser = all_parsers

    encoding = encoding_fmt.format(utf_value)
    data = "mb_num,multibyte\n4.8,test".encode(encoding)

    result = parser.read_csv(BytesIO(data), encoding=encoding)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "file_path,encoding",
    [
        (("io", "data", "csv", "test1.csv"), "utf-8"),
        (("io", "parser", "data", "unicode_series.csv"), "latin-1"),
        (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"),
    ],
)
def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
    # gh-23779: Python csv engine shouldn't error on files opened in binary.
    # gh-31575: Python csv engine shouldn't error on files opened in raw binary.
    parser = all_parsers

    fpath = datapath(*file_path)
    expected = parser.read_csv(fpath, encoding=encoding)

    with open(fpath, encoding=encoding) as fa:
        result = parser.read_csv(fa)
        assert not fa.closed
    tm.assert_frame_equal(expected, result)

    with open(fpath, mode="rb") as fb:
        result = parser.read_csv(fb, encoding=encoding)
        assert not fb.closed
    tm.assert_frame_equal(expected, result)

    with open(fpath, mode="rb", buffering=0) as fb:
        result = parser.read_csv(fb, encoding=encoding)
        assert not fb.closed
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
    # see gh-24130
    parser = all_parsers
    encoding = encoding_fmt.format(utf_value)

    if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]:
        # FIXME: this is bad!
        pytest.skip("These cases freeze")

    expected = DataFrame({"foo": ["bar"]})

    with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
        f.write("foo\nbar")
        f.seek(0)

        result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
        tm.assert_frame_equal(result, expected)


def test_encoding_named_temp_file(all_parsers):
    # see gh-31819
    parser = all_parsers
    encoding = "shift-jis"

    title = "てすと"
    data = "こむ"

    expected = DataFrame({title: [data]})

    with tempfile.NamedTemporaryFile() as f:
        f.write(f"{title}\n{data}".encode(encoding))

        f.seek(0)

        result = parser.read_csv(f, encoding=encoding)
        tm.assert_frame_equal(result, expected)
        assert not f.closed


@pytest.mark.parametrize(
    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
)
def test_parse_encoded_special_characters(encoding):
    # GH16218 Verify parsing of data with encoded special characters
    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
    data = "a\tb\n:foo\t0\nbar\t1\nbaz\t2"  # noqa: RUF001
    encoded_data = BytesIO(data.encode(encoding))
    result = read_csv(encoded_data, delimiter="\t", encoding=encoding)

    expected = DataFrame(
        data=[[":foo", 0], ["bar", 1], ["baz", 2]],  # noqa: RUF001
        columns=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
def test_encoding_memory_map(all_parsers, encoding):
    # GH40986
    parser = all_parsers
    expected = DataFrame(
        {
            "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
            "mask": ["red", "purple", "orange", "blue"],
            "weapon": ["sai", "bo staff", "nunchunk", "katana"],
        }
    )
    with tm.ensure_clean() as file:
        expected.to_csv(file, index=False, encoding=encoding)

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(file, encoding=encoding, memory_map=True)
            return

        df = parser.read_csv(file, encoding=encoding, memory_map=True)
    tm.assert_frame_equal(df, expected)


def test_chunk_splits_multibyte_char(all_parsers):
    """
    Chunk splits a multibyte character with memory_map=True

    GH 43540
    """
    parser = all_parsers
    # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
    df = DataFrame(data=["a" * 127] * 2048)

    # Put the two-byte UTF-8 encoded character "ą" at the end of the chunk;
    # the UTF-8 encoding of "ą" is b'\xc4\x85'
    df.iloc[2047] = "a" * 127 + "ą"
    with tm.ensure_clean("bug-gh43540.csv") as fname:
        df.to_csv(fname, index=False, header=False, encoding="utf-8")

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(fname, header=None, memory_map=True)
            return

        dfr = parser.read_csv(fname, header=None, memory_map=True)
    tm.assert_frame_equal(dfr, df)


def test_readcsv_memmap_utf8(all_parsers):
    """
    GH 43787

    Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
    """
    lines = []
    line_length = 128
    start_char = " "
    end_char = "\U00010080"
    # This for loop creates a list of 128-char strings
    # consisting of consecutive Unicode chars
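    # (code points in the surrogate range cannot be encoded as UTF-8, so
    # those lines are skipped via the try/except below)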
    for lnum in range(ord(start_char), ord(end_char), line_length):
        line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
        try:
            line.encode("utf-8")
        except UnicodeEncodeError:
            continue
        lines.append(line)
    parser = all_parsers
    df = DataFrame(lines)
    with tm.ensure_clean("utf8test.csv") as fname:
        df.to_csv(fname, index=False, header=False, encoding="utf-8")

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
            return

        dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
    tm.assert_frame_equal(df, dfr)


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("mode", ["w+b", "w+t"])
def test_not_readable(all_parsers, mode):
    # GH43439
    parser = all_parsers
    content = b"abcd"
    if "t" in mode:
        content = "abcd"
    with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
        handle.write(content)
        handle.seek(0)
        df = parser.read_csv(handle)
    expected = DataFrame([], columns=["abcd"])
    tm.assert_frame_equal(df, expected)
@ -0,0 +1,733 @@
"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""

from collections import namedtuple
from io import StringIO

import numpy as np
import pytest

from pandas.errors import ParserError

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@xfail_pyarrow  # TypeError: an integer is required
def test_read_with_bad_header(all_parsers):
    parser = all_parsers
    msg = r"but only \d+ lines in file"

    with pytest.raises(ValueError, match=msg):
        s = StringIO(",,")
        parser.read_csv(s, header=[10])


def test_negative_header(all_parsers):
    # see gh-27779
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    with pytest.raises(
        ValueError,
        match="Passing negative integer to header is invalid. "
        "For no header, use header=None instead",
    ):
        parser.read_csv(StringIO(data), header=-1)


@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
def test_negative_multi_index_header(all_parsers, header):
    # see gh-27779
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    with pytest.raises(
        ValueError, match="cannot specify multi-index header with negative integers"
    ):
        parser.read_csv(StringIO(data), header=header)


@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
    # see gh-6114
    parser = all_parsers
    data = """\
MyColumn
a
b
a
b"""
    msg = "Passing a bool to header is invalid"
    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), header=header)


@xfail_pyarrow  # AssertionError: DataFrame are different
def test_header_with_index_col(all_parsers):
    parser = all_parsers
    data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    names = ["A", "B", "C"]
    result = parser.read_csv(StringIO(data), names=names)

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result, expected)


def test_header_not_first_line(all_parsers):
    parser = all_parsers
    data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""

    result = parser.read_csv(StringIO(data), header=2, index_col=0)
    expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index(all_parsers):
    parser = all_parsers

    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
    data_gen_f = lambda r, c: f"R{r}C{c}"

    data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)]
    index = MultiIndex.from_arrays(
        [[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]],
        names=["R0", "R1"],
    )
    columns = MultiIndex.from_arrays(
        [
            [f"C_l0_g{i}" for i in range(3)],
            [f"C_l1_g{i}" for i in range(3)],
            [f"C_l2_g{i}" for i in range(3)],
            [f"C_l3_g{i}" for i in range(3)],
        ],
        names=["C0", "C1", "C2", "C3"],
    )
    expected = DataFrame(data, columns=columns, index=index)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        (
            {"index_col": ["foo", "bar"]},
            (
                "index_col must only contain "
                "row numbers when specifying "
                "a multi-index header"
            ),
        ),
        (
            {"index_col": [0, 1], "names": ["foo", "bar"]},
            ("cannot specify names when specifying a multi-index header"),
        ),
        (
            {"index_col": [0, 1], "usecols": ["foo", "bar"]},
            ("cannot specify usecols when specifying a multi-index header"),
        ),
    ],
)
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)


_TestTuple = namedtuple("_TestTuple", ["first", "second"])


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 3,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 3,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format1(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 2,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 2,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format2(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 2,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 2,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format3(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    expected = expected.reset_index(drop=True)
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index_common_format_malformed1(all_parsers):
    parser = all_parsers
    expected = DataFrame(
        np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
        index=Index([1, 7]),
        columns=MultiIndex(
            levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
            codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
            names=["a", "q"],
        ),
    )
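    # With index_col=0 the header entries above the index column ("a", "q")
    # are consumed as the names of the two column levels rather than as
    # column labels.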
data = """a,a,a,b,c,c
|
||||
q,r,s,t,u,v
|
||||
1,2,3,4,5,6
|
||||
7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_multi_index_common_format_malformed2(all_parsers):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
|
||||
index=Index([1, 7]),
|
||||
columns=MultiIndex(
|
||||
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
|
||||
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
|
||||
names=[None, "q"],
|
||||
),
|
||||
)
|
||||
|
||||
data = """,a,a,b,c,c
|
||||
q,r,s,t,u,v
|
||||
1,2,3,4,5,6
|
||||
7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_multi_index_common_format_malformed3(all_parsers):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
|
||||
index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
|
||||
columns=MultiIndex(
|
||||
levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
|
||||
codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
|
||||
names=[None, "q"],
|
||||
),
|
||||
)
|
||||
data = """,a,a,b,c,c
|
||||
q,r,s,t,u,v
|
||||
1,2,3,4,5,6
|
||||
7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_multi_index_blank_line(all_parsers):
|
||||
# GH 40442
|
||||
parser = all_parsers
|
||||
data = [[None, None], [1, 2], [3, 4]]
|
||||
columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
|
||||
expected = DataFrame(data, columns=columns)
|
||||
data = "a,b\nA,B\n,\n1,2\n3,4"
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1])
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
|
||||
)
|
||||
def test_header_names_backward_compat(all_parsers, data, header, request):
|
||||
# see gh-2539
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and header is not None:
|
||||
mark = pytest.mark.xfail(reason="DataFrame.columns are different")
|
||||
request.applymarker(mark)
|
||||
|
||||
expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block: cannot infer
|
||||
@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
|
||||
def test_read_only_header_no_rows(all_parsers, kwargs):
|
||||
# See gh-7773
|
||||
parser = all_parsers
|
||||
expected = DataFrame(columns=["a", "b", "c"])
|
||||
|
||||
result = parser.read_csv(StringIO("a,b,c"), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,names",
|
||||
[
|
||||
({}, [0, 1, 2, 3, 4]),
|
||||
(
|
||||
{"names": ["foo", "bar", "baz", "quux", "panda"]},
|
||||
["foo", "bar", "baz", "quux", "panda"],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_no_header(all_parsers, kwargs, names):
|
||||
parser = all_parsers
|
||||
data = """1,2,3,4,5
|
||||
6,7,8,9,10
|
||||
11,12,13,14,15
|
||||
"""
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
|
||||
)
|
||||
result = parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
|
||||
def test_non_int_header(all_parsers, header):
|
||||
# see gh-16338
|
||||
msg = "header must be integer or list of integers"
|
||||
data = """1,2\n3,4"""
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=header)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_singleton_header(all_parsers):
|
||||
# see gh-7757
|
||||
data = """a,b,c\n0,1,2\n1,2,3"""
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
|
||||
result = parser.read_csv(StringIO(data), header=[0])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected",
|
||||
[
|
||||
(
|
||||
"A,A,A,B\none,one,one,two\n0,40,34,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
|
||||
),
|
||||
),
|
||||
),
|
||||
(
|
||||
"A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
|
||||
),
|
||||
),
|
||||
),
|
||||
(
|
||||
"A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[
|
||||
("A", "one"),
|
||||
("A", "one.1"),
|
||||
("A", "one.1.1"),
|
||||
("B", "two"),
|
||||
("B", "two.1"),
|
||||
]
|
||||
),
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_mangles_multi_index(all_parsers, data, expected):
|
||||
# see gh-18062
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
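# Illustrative sketch (an aside, not part of the original suite; `_demo` is a
# hypothetical name): the mangling exercised above also applies within a
# single header row, so repeated labels gain ".1", ".2", ... level by level.
from io import StringIO

import pandas as pd

_demo = pd.read_csv(StringIO("A,A,B\none,one,two\n0,1,2"), header=[0, 1])
assert list(_demo.columns) == [("A", "one"), ("A", "one.1"), ("B", "two")]
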
@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize(
    "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
)
def test_multi_index_unnamed(all_parsers, index_col, columns):
    # see gh-23687
    #
    # When specifying a multi-index header, make sure that
    # we don't error just because one of the rows in our header
    # has ALL column names containing the string "Unnamed". The
    # correct condition to check is whether the row contains
    # ALL columns that did not have names (and instead were given
    # placeholder ones).
    parser = all_parsers
    header = [0, 1]

    if index_col is None:
        data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
    else:
        data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"

    result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
    exp_columns = []

    if columns is None:
        columns = ["", "", ""]

    for i, col in enumerate(columns):
        if not col:  # Unnamed.
            col = f"Unnamed: {i if index_col is None else i + 1}_level_0"

        exp_columns.append(col)

    columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
    expected = DataFrame([[2, 3], [4, 5]], columns=columns)
    tm.assert_frame_equal(result, expected)

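# Illustrative sketch (an aside, not part of the original suite; `_demo` is a
# hypothetical name): header cells left blank get "Unnamed: {position}"
# placeholders rather than raising, the behavior the comment above describes.
from io import StringIO

import pandas as pd

_demo = pd.read_csv(StringIO(",b\n1,2"), header=0)
assert list(_demo.columns) == ["Unnamed: 0", "b"]
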
@skip_pyarrow  # CSV parse error: Expected 2 columns, got 3
def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
    # GH#38453
    parser = all_parsers
    data = """a, b
1,2,3
5,6,4
"""
    result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"])
    expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_read_csv_multiindex_columns(all_parsers):
    # GH#6051
    parser = all_parsers

    s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81"
    s2 = (
        "Male, Male, Male, Female, Female\n"
        "R, R, L, R, R\n"
        ".86, .67, .88, .78, .81\n"
        ".86, .67, .88, .78, .82"
    )

    mi = MultiIndex.from_tuples(
        [
            ("Male", "R"),
            (" Male", " R"),
            (" Male", " L"),
            (" Female", " R"),
            (" Female", " R.1"),
        ]
    )
    expected = DataFrame(
        [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi
    )

    df1 = parser.read_csv(StringIO(s1), header=[0, 1])
    tm.assert_frame_equal(df1, expected.iloc[:1])
    df2 = parser.read_csv(StringIO(s2), header=[0, 1])
    tm.assert_frame_equal(df2, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_read_csv_multi_header_length_check(all_parsers):
    # GH#43102
    parser = all_parsers

    case = """row11,row12,row13
row21,row22, row23
row31,row32
"""

    with pytest.raises(
        ParserError, match="Header rows must have an equal number of columns."
    ):
        parser.read_csv(StringIO(case), header=[0, 2])


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 2
def test_header_none_and_implicit_index(all_parsers):
    # GH#22144
    parser = all_parsers
    data = "x,1,5\ny,2\nz,3\n"
    result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
    expected = DataFrame(
        {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # regex mismatch "CSV parse error: Expected 2 columns, got "
def test_header_none_and_implicit_index_in_second_row(all_parsers):
    # GH#22144
    parser = all_parsers
    data = "x,1\ny,2,5\nz,3\n"
    with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
        parser.read_csv(StringIO(data), names=["a", "b"], header=None)


def test_header_none_and_on_bad_lines_skip(all_parsers):
    # GH#22144
    parser = all_parsers
    data = "x,1\ny,2,5\nz,3\n"
    result = parser.read_csv(
        StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
    )
    expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_missing_rows(all_parsers):
    # GH#47400
    parser = all_parsers
    data = """a,b
1,2
"""
    msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=[0, 1, 2])


# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_multiple_whitespaces(all_parsers):
    # GH#54931
    parser = all_parsers
    data = """aa bb(1,1) cc(1,1)
0 2 3.5"""

    result = parser.read_csv(StringIO(data), sep=r"\s+")
    expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
    tm.assert_frame_equal(result, expected)


# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_delim_whitespace(all_parsers):
    # GH#54918
    parser = all_parsers
    data = """a,b
1,2
3,4
"""

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data), delim_whitespace=True)
    expected = DataFrame({"a,b": ["1,2", "3,4"]})
    tm.assert_frame_equal(result, expected)

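# Illustrative sketch (an aside, not part of the original suite; `_demo` is a
# hypothetical name): the deprecation exercised above pushes callers toward
# sep=r"\s+", which parses the same whitespace-delimited input without the
# FutureWarning.
from io import StringIO

import pandas as pd

_demo = pd.read_csv(StringIO("a b\n1 2\n3 4"), sep=r"\s+")
assert list(_demo.columns) == ["a", "b"]
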
def test_usecols_no_header_pyarrow(pyarrow_parser_only):
    parser = pyarrow_parser_only
    data = """
a,i,x
b,j,y
"""
    result = parser.read_csv(
        StringIO(data),
        header=None,
        usecols=[0, 1],
        dtype="string[pyarrow]",
        dtype_backend="pyarrow",
        engine="pyarrow",
    )
    expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,376 @@
"""
Tests that the specified index column (a.k.a "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
    parser = all_parsers
    no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
    header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"

    if with_header:
        data = header + no_header

        result = parser.read_csv(StringIO(data), index_col="ID")
        expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
        tm.assert_frame_equal(result, expected)
    else:
        data = no_header
        msg = "Index ID invalid"

        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), index_col="ID")


def test_index_col_named2(all_parsers):
    parser = all_parsers
    data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""

    expected = DataFrame(
        {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
        index=Index(["hello", "world", "foo"], name="message"),
    )
    names = ["a", "b", "c", "d", "message"]

    result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
    tm.assert_frame_equal(result, expected)


def test_index_col_is_true(all_parsers):
    # see gh-9798
    data = "a,b\n1,2"
    parser = all_parsers

    msg = "The value of index_col couldn't be 'True'"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), index_col=True)


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 4
def test_infer_index_col(all_parsers):
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result, expected)

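# Illustrative sketch (an aside, not part of the original suite; `_demo` is a
# hypothetical name): when the header row has one fewer field than the data
# rows, read_csv infers the extra leading column as the index, which is the
# behavior test_infer_index_col relies on.
from io import StringIO

import pandas as pd

_demo = pd.read_csv(StringIO("A,B\nx,1,2\ny,3,4"))
assert list(_demo.index) == ["x", "y"]
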
@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "index_col,kwargs",
    [
        (None, {"columns": ["x", "y", "z"]}),
        (False, {"columns": ["x", "y", "z"]}),
        (0, {"columns": ["y", "z"], "index": Index([], name="x")}),
        (1, {"columns": ["x", "z"], "index": Index([], name="y")}),
        ("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
        ("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
        (
            [0, 1],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
            },
        ),
        (
            ["x", "y"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
            },
        ),
        (
            [1, 0],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
            },
        ),
        (
            ["y", "x"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
            },
        ),
    ],
)
def test_index_col_empty_data(all_parsers, index_col, kwargs):
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=index_col)

    expected = DataFrame(**kwargs)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_index_col_false(all_parsers):
    # see gh-10413
    data = "x,y"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=False)

    expected = DataFrame(columns=["x", "y"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "index_names",
    [
        ["", ""],
        ["foo", ""],
        ["", "bar"],
        ["foo", "bar"],
        ["NotReallyUnnamed", "Unnamed: 0"],
    ],
)
def test_multi_index_naming(all_parsers, index_names, request):
    parser = all_parsers

    if parser.engine == "pyarrow" and "" in index_names:
        mark = pytest.mark.xfail(reason="One case raises, others are wrong")
        request.applymarker(mark)

    # We don't want empty index names being replaced with "Unnamed: 0"
    data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
    result = parser.read_csv(StringIO(data), index_col=[0, 1])

    expected = DataFrame(
        {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
    )
    expected.index.names = [name if name else None for name in index_names]
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_multi_index_naming_not_all_at_beginning(all_parsers):
    parser = all_parsers
    data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    result = parser.read_csv(StringIO(data), index_col=[0, 2])

    expected = DataFrame(
        {"Unnamed: 2": ["c", "d", "c", "d"]},
        index=MultiIndex(
            levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_no_multi_index_level_names_empty(all_parsers):
    # GH 10984
    parser = all_parsers
    midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
    expected = DataFrame(
        np.random.default_rng(2).standard_normal((3, 3)),
        index=midx,
        columns=["x", "y", "z"],
    )
    with tm.ensure_clean() as path:
        expected.to_csv(path)
        result = parser.read_csv(path, index_col=[0, 1, 2])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_with_index_col(all_parsers):
    # GH 33476
    parser = all_parsers
    data = """
I11,A,A
I12,B,B
I2,1,3
"""
    midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
    idx = Index(["I2"])
    expected = DataFrame([[1, 3]], index=idx, columns=midx)

    result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
    tm.assert_frame_equal(result, expected)

    col_idx = Index(["A", "A.1"])
    idx = Index(["I12", "I2"], name="I11")
    expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)

    result = parser.read_csv(StringIO(data), index_col="I11", header=0)
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
def test_index_col_large_csv(all_parsers, monkeypatch):
    # https://github.com/pandas-dev/pandas/issues/37094
    parser = all_parsers

    ARR_LEN = 100
    df = DataFrame(
        {
            "a": range(ARR_LEN + 1),
            "b": np.random.default_rng(2).standard_normal(ARR_LEN + 1),
        }
    )

    with tm.ensure_clean() as path:
        df.to_csv(path, index=False)
        with monkeypatch.context() as m:
            m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN)
            result = parser.read_csv(path, index_col=[0])

    tm.assert_frame_equal(result, df.set_index("a"))


@xfail_pyarrow  # TypeError: an integer is required
def test_index_col_multiindex_columns_no_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(
        StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0
    )
    expected = DataFrame(
        [],
        index=Index([]),
        columns=MultiIndex.from_arrays(
            [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_index_col_header_no_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0)
    expected = DataFrame(
        [],
        columns=["a1", "a2"],
        index=Index([], name="a0"),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_multiindex_columns_no_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1])
    expected = DataFrame(
        [], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]])
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_multiindex_columns_index_col_with_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(
        StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0
    )
    expected = DataFrame(
        [["data", "data"]],
        columns=MultiIndex.from_arrays(
            [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
        ),
        index=Index(["data"]),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_infer_types_boolean_sum(all_parsers):
    # GH#44079
    parser = all_parsers
    result = parser.read_csv(
        StringIO("0,1"),
        names=["a", "b"],
        index_col=["a"],
        dtype={"a": "UInt8"},
    )
    expected = DataFrame(
        data={
            "a": [
                0,
            ],
            "b": [1],
        }
    ).set_index("a")
    # Not checking index type now, because the C parser will return an
    # index column of dtype 'object', and the Python parser will return an
    # index column of dtype 'int64'.
    tm.assert_frame_equal(result, expected, check_index_type=False)


@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
    # GH#9435
    data = "a,b\n01,2"
    parser = all_parsers
    if dtype == object and parser.engine == "pyarrow":
        request.applymarker(
            pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
        )
    result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
    expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_multiindex_columns_not_leading_index_col(all_parsers):
    # GH#38549
    parser = all_parsers
    data = """a,b,c,d
e,f,g,h
x,y,1,2
"""
    result = parser.read_csv(
        StringIO(data),
        header=[0, 1],
        index_col=1,
    )
    cols = MultiIndex.from_tuples(
        [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"]
    )
    expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,182 @@
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""
from io import StringIO

import pytest

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_basic(all_parsers):
    parser = all_parsers

    data = "a,a,b,b,b\n1,2,3,4,5"
    result = parser.read_csv(StringIO(data), sep=",")

    expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
    tm.assert_frame_equal(result, expected)

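# Illustrative sketch (an aside, not part of the original suite; `_demo` is a
# hypothetical name): duplicate flat headers are de-duplicated with ".N"
# suffixes, which is the behavior test_basic checks above.
from io import StringIO

import pandas as pd

_demo = pd.read_csv(StringIO("a,a,b\n1,2,3"))
assert list(_demo.columns) == ["a", "a.1", "b"]
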
@xfail_pyarrow  # ValueError: Found non-unique column index
def test_basic_names(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "a,b,a\n0,1,2\n3,4,5"
    expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_basic_names_raise(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "0,1,2\n3,4,5"
    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=["a", "b", "a"])


@xfail_pyarrow  # ValueError: Found non-unique column index
@pytest.mark.parametrize(
    "data,expected",
    [
        ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
        (
            "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
            DataFrame(
                [[1, 2, 3, 4, 5, 6]],
                columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
            ),
        ),
        (
            "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
            DataFrame(
                [[1, 2, 3, 4, 5, 6, 7]],
                columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
            ),
        ),
    ],
)
def test_thorough_mangle_columns(all_parsers, data, expected):
    # see gh-17060
    parser = all_parsers

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,names,expected",
    [
        (
            "a,b,b\n1,2,3",
            ["a.1", "a.1", "a.1.1"],
            DataFrame(
                [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
            ),
        ),
        (
            "a,b,c,d,e,f\n1,2,3,4,5,6",
            ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
            DataFrame(
                [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
                columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
            ),
        ),
        (
            "a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
            ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
            DataFrame(
                [
                    ["a", "b", "c", "d", "e", "f", "g"],
                    ["1", "2", "3", "4", "5", "6", "7"],
                ],
                columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
            ),
        ),
    ],
)
def test_thorough_mangle_names(all_parsers, data, names, expected):
    # see gh-17095
    parser = all_parsers

    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=names)


@xfail_pyarrow  # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
    # xref gh-13017
    orig_key = "0"
    parser = all_parsers

    orig_value = [1, 2, 3]
    df = DataFrame({orig_key: orig_value})

    # This test recursively updates `df`.
    for i in range(3):
        expected = DataFrame(columns=Index([], dtype="str"))

        for j in range(i + 1):
            col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
            expected.insert(loc=0, column=col_name, value=[0, 1, 2])

        expected[orig_key] = orig_value
        df = parser.read_csv(StringIO(df.to_csv()))

        tm.assert_frame_equal(df, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists(all_parsers):
    # GH#14704
    parser = all_parsers

    data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6, 7]],
        columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
    # GH#14704
    parser = all_parsers

    data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [[1, 2, 3, 4]],
        columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")])
def test_mangle_cols_names(all_parsers, usecol, engine):
    # GH 11823
    parser = all_parsers
    data = "1,2,3"
    names = ["A", "A", "B"]
    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=names, usecols=usecol, engine=engine)
@ -0,0 +1,157 @@
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.util.version import Version

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

# We'll probably always skip these for pyarrow
# Maybe we'll add our own tests for pyarrow too
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.slow,
]


@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_multi_thread_string_io_read_csv(all_parsers, request):
    # see gh-11786
    parser = all_parsers
    if parser.engine == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        if Version(pa.__version__) < Version("16.0"):
            request.applymarker(
                pytest.mark.xfail(reason="# ValueError: Found non-unique column index")
            )
    max_row_range = 100
    num_files = 10

    bytes_to_df = (
        "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
        for _ in range(num_files)
    )

    # Read all files in many threads.
    with ExitStack() as stack:
        files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df]

        pool = stack.enter_context(ThreadPool(8))

        results = pool.map(parser.read_csv, files)
        first_result = results[0]

        for result in results:
            tm.assert_frame_equal(first_result, result)


def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
    """
    Generate a DataFrame via multi-thread.

    Parameters
    ----------
    parser : BaseParser
        The parser object to use for reading the data.
    path : str
        The location of the CSV file to read.
    num_rows : int
        The number of rows to read per task.
    num_tasks : int
        The number of tasks to use for reading this DataFrame.

    Returns
    -------
    df : DataFrame
    """

    def reader(arg):
        """
        Create a reader for part of the CSV.

        Parameters
        ----------
        arg : tuple
            A tuple of the following:

            * start : int
                The starting row to start for parsing CSV
            * nrows : int
                The number of rows to read.

        Returns
        -------
        df : DataFrame
        """
        start, nrows = arg

        if not start:
            return parser.read_csv(
                path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
            )

        return parser.read_csv(
            path,
            index_col=0,
            header=None,
            skiprows=int(start) + 1,
            nrows=nrows,
            parse_dates=[9],
        )

    tasks = [
        (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
    ]

    with ThreadPool(processes=num_tasks) as pool:
        results = pool.map(reader, tasks)

    header = results[0].columns

    for r in results[1:]:
        r.columns = header

    final_dataframe = pd.concat(results)
    return final_dataframe


@xfail_pyarrow  # ValueError: The 'nrows' option is not supported
def test_multi_thread_path_multipart_read_csv(all_parsers):
    # see gh-11786
    num_tasks = 4
    num_rows = 48

    parser = all_parsers
    file_name = "__thread_pool_reader__.csv"
    df = DataFrame(
        {
            "a": np.random.default_rng(2).random(num_rows),
            "b": np.random.default_rng(2).random(num_rows),
            "c": np.random.default_rng(2).random(num_rows),
            "d": np.random.default_rng(2).random(num_rows),
            "e": np.random.default_rng(2).random(num_rows),
            "foo": ["foo"] * num_rows,
            "bar": ["bar"] * num_rows,
            "baz": ["baz"] * num_rows,
            "date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"),
            "int": np.arange(num_rows, dtype="int64"),
        }
    )

    with tm.ensure_clean(file_name) as path:
        df.to_csv(path)

        final_dataframe = _generate_multi_thread_dataframe(
            parser, path, num_rows, num_tasks
        )
        tm.assert_frame_equal(df, final_dataframe)
@ -0,0 +1,780 @@
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas._libs.parsers import STR_NA_VALUES

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_string_nas(all_parsers):
    parser = all_parsers
    data = """A,B,C
a,b,c
d,,f
,g,h
"""
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
        columns=["A", "B", "C"],
    )
    if parser.engine == "pyarrow":
        expected.loc[2, "A"] = None
        expected.loc[1, "B"] = None
    tm.assert_frame_equal(result, expected)


def test_detect_string_na(all_parsers):
    parser = all_parsers
    data = """A,B
foo,bar
NA,baz
NaN,nan
"""
    expected = DataFrame(
        [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
    )
    if parser.engine == "pyarrow":
        expected.loc[[1, 2], "A"] = None
        expected.loc[2, "B"] = None
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "na_values",
    [
        ["-999.0", "-999"],
        [-999, -999.0],
        [-999.0, -999],
        ["-999.0"],
        ["-999"],
        [-999.0],
        [-999],
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        """A,B
-999,1.2
2,-999
3,4.5
""",
        """A,B
-999,1.200
2,-999.000
3,4.500
""",
    ],
)
def test_non_string_na_values(all_parsers, data, na_values, request):
    # see gh-3611: with an odd float format, we can't match
    # the string "999.0" exactly but still need float matching
    parser = all_parsers
    expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])

    if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values):
        msg = "The 'pyarrow' engine requires all na_values to be strings"
        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), na_values=na_values)
        return
    elif parser.engine == "pyarrow" and "-999.000" in data:
        # because the pyarrow engine does not include the float-ified version
        # of "-999" -> -999, it does not match the entry with the trailing
        # zeros, so "-999.000" is not treated as null.
        mark = pytest.mark.xfail(
            reason="pyarrow engine does not recognize equivalent floats"
        )
        request.applymarker(mark)

    result = parser.read_csv(StringIO(data), na_values=na_values)
    tm.assert_frame_equal(result, expected)


def test_default_na_values(all_parsers):
    _NA_VALUES = {
        "-1.#IND",
        "1.#QNAN",
        "1.#IND",
        "-1.#QNAN",
        "#N/A",
        "N/A",
        "n/a",
        "NA",
        "<NA>",
        "#NA",
        "NULL",
        "null",
        "NaN",
        "nan",
        "-NaN",
        "-nan",
        "#N/A N/A",
        "",
        "None",
    }
    assert _NA_VALUES == STR_NA_VALUES

    parser = all_parsers
    nv = len(_NA_VALUES)

    def f(i, v):
        if i == 0:
            buf = ""
        elif i > 0:
            buf = "".join([","] * i)

        buf = f"{buf}{v}"

        if i < nv - 1:
            joined = "".join([","] * (nv - i - 1))
            buf = f"{buf}{joined}"

        return buf

    data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))

    result = parser.read_csv(data, header=None)
    tm.assert_frame_equal(result, expected)

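# Illustrative sketch (an aside, not part of the original suite; `_demo` is a
# hypothetical name): the default sentinel set asserted above can be extended
# with na_values, or replaced outright by also passing keep_default_na=False.
from io import StringIO

import pandas as pd

_demo = pd.read_csv(StringIO("a\nmissing\nNA"), na_values=["missing"])
assert _demo["a"].isna().all()  # both "missing" and the default "NA" are null
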
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = DataFrame(
        [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "skiprows argument must be an integer when using engine='pyarrow'"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
        return

    result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
    tm.assert_frame_equal(result, expected)


def test_bool_na_values(all_parsers):
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        {
            "A": np.array([True, np.nan, False], dtype=object),
            "B": np.array([False, True, np.nan], dtype=object),
            "C": [True, False, True],
        }
    )
    if parser.engine == "pyarrow":
        expected.loc[1, "A"] = None
        expected.loc[2, "B"] = None
    tm.assert_frame_equal(result, expected)


def test_na_value_dict(all_parsers):
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "pyarrow engine doesn't support passing a dict for na_values"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
        return

    df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
    expected = DataFrame(
        {
            "A": [np.nan, "bar", np.nan, "bar"],
            "B": [np.nan, "foo", np.nan, "foo"],
            "C": [np.nan, "foo", np.nan, "foo"],
        }
    )
    tm.assert_frame_equal(df, expected)

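# Illustrative sketch (an aside, not part of the original suite; `_demo` is a
# hypothetical name): with the C or Python engine, a dict keyed by column
# restricts each sentinel to that column, so "foo" stays a normal string
# outside column A.
from io import StringIO

import pandas as pd

_demo = pd.read_csv(StringIO("A,B\nfoo,foo"), na_values={"A": ["foo"]})
assert _demo.isna().to_numpy().tolist() == [[True, False]]
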
||||
@pytest.mark.parametrize(
|
||||
"index_col,expected",
|
||||
[
|
||||
(
|
||||
[0],
|
||||
DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
|
||||
),
|
||||
(
|
||||
[0, 2],
|
||||
DataFrame(
|
||||
{"b": [np.nan], "d": [5]},
|
||||
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
|
||||
),
|
||||
),
|
||||
(
|
||||
["a", "c"],
|
||||
DataFrame(
|
||||
{"b": [np.nan], "d": [5]},
|
||||
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
|
||||
data = """\
|
||||
a,b,c,d
|
||||
0,NA,1,5
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,expected",
|
||||
[
|
||||
(
|
||||
{},
|
||||
DataFrame(
|
||||
{
|
||||
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
{"na_values": {"A": [], "C": []}, "keep_default_na": False},
|
||||
DataFrame(
|
||||
{
|
||||
"A": ["a", "b", "", "d", "e", "nan", "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", "nan", "five", "", "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
{"na_values": ["a"], "keep_default_na": False},
|
||||
DataFrame(
|
||||
{
|
||||
"A": [np.nan, "b", "", "d", "e", "nan", "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", "nan", "five", "", "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
{"na_values": {"A": [], "C": []}},
|
||||
DataFrame(
|
||||
{
|
||||
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_na_values_keep_default(
|
||||
all_parsers, kwargs, expected, request, using_infer_string
|
||||
):
|
||||
data = """\
|
||||
A,B,C
|
||||
a,1,one
|
||||
b,2,two
|
||||
,3,three
|
||||
d,4,nan
|
||||
e,5,five
|
||||
nan,6,
|
||||
g,7,seven
|
||||
"""
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
if "na_values" in kwargs and isinstance(kwargs["na_values"], dict):
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
return
|
||||
if not using_infer_string or "na_values" in kwargs:
|
||||
mark = pytest.mark.xfail()
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_na_values_no_keep_default(all_parsers):
|
||||
# see gh-4318: passing na_values=None and
|
||||
# keep_default_na=False yields 'None" as a na_value
|
||||
data = """\
|
||||
A,B,C
|
||||
a,1,None
|
||||
b,2,two
|
||||
,3,None
|
||||
d,4,nan
|
||||
e,5,five
|
||||
nan,6,
|
||||
g,7,seven
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), keep_default_na=False)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": ["a", "b", "", "d", "e", "nan", "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["None", "two", "None", "nan", "five", "", "seven"],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_keep_default_na_dict_na_values(all_parsers):
|
||||
# see gh-19227
|
||||
data = "a,b\n,2"
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
|
||||
)
|
||||
expected = DataFrame({"a": [""], "b": [np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
|
||||
# see gh-19227
|
||||
#
|
||||
# Scalar values shouldn't cause the parsing to crash or fail.
|
||||
data = "a,b\n1,2"
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
|
||||
return
|
||||
|
||||
df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
|
||||
expected = DataFrame({"a": [1], "b": [np.nan]})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
|
||||
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
|
||||
# see gh-19227
|
||||
data = """\
|
||||
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
|
||||
729639,"qwer","",asdfkj,466.681,,252.373
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: [np.nan, 729639.0],
|
||||
1: [np.nan, "qwer"],
|
||||
2: ["/blaha", np.nan],
|
||||
3: ["kjsdkj", "asdfkj"],
|
||||
4: [412.166, 466.681],
|
||||
5: ["225.874", ""],
|
||||
6: [np.nan, 252.373],
|
||||
}
|
||||
)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
keep_default_na=False,
|
||||
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
keep_default_na=False,
|
||||
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_filter,row_data",
|
||||
[
|
||||
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
|
||||
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
|
||||
],
|
||||
)
|
||||
def test_na_values_na_filter_override(
|
||||
request, all_parsers, na_filter, row_data, using_infer_string
|
||||
):
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
# mismatched dtypes in both cases, FutureWarning in the True case
|
||||
if not (using_infer_string and na_filter):
|
||||
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
|
||||
request.applymarker(mark)
|
||||
data = """\
|
||||
A,B
|
||||
1,A
|
||||
nan,B
|
||||
3,C
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
|
||||
|
||||
expected = DataFrame(row_data, columns=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 8 columns, got 5:
|
||||
def test_na_trailing_columns(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
|
||||
2012-03-14,USD,AAPL,BUY,1000
|
||||
2012-05-12,USD,SBUX,SELL,500"""
|
||||
|
||||
# Trailing columns should be all NaN.
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[
|
||||
["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
|
||||
["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
|
||||
],
|
||||
columns=[
|
||||
"Date",
|
||||
"Currency",
|
||||
"Symbol",
|
||||
"Type",
|
||||
"Units",
|
||||
"UnitPrice",
|
||||
"Cost",
|
||||
"Tax",
|
||||
],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_values,row_data",
|
||||
[
|
||||
(1, [[np.nan, 2.0], [2.0, np.nan]]),
|
||||
({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
|
||||
],
|
||||
)
|
||||
def test_na_values_scalar(all_parsers, na_values, row_data):
|
||||
# see gh-12224
|
||||
parser = all_parsers
|
||||
names = ["a", "b"]
|
||||
data = "1,2\n2,1"
|
||||
|
||||
if parser.engine == "pyarrow" and isinstance(na_values, dict):
|
||||
if isinstance(na_values, dict):
|
||||
err = ValueError
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
else:
|
||||
err = TypeError
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(err, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
elif parser.engine == "pyarrow":
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
expected = DataFrame(row_data, columns=names)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_na_values_dict_aliasing(all_parsers):
|
||||
parser = all_parsers
|
||||
na_values = {"a": 2, "b": 1}
|
||||
na_values_copy = na_values.copy()
|
||||
|
||||
names = ["a", "b"]
|
||||
data = "1,2\n2,1"
|
||||
|
||||
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_dict_equal(na_values, na_values_copy)
|
||||
|
||||
|
||||
def test_na_values_dict_col_index(all_parsers):
|
||||
# see gh-14203
|
||||
data = "a\nfoo\n1"
|
||||
parser = all_parsers
|
||||
na_values = {0: "foo"}
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_values=na_values)
|
||||
expected = DataFrame({"a": [np.nan, 1]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
(
|
||||
str(2**63) + "\n" + str(2**63 + 1),
|
||||
{"na_values": [2**63]},
|
||||
DataFrame([str(2**63), str(2**63 + 1)]),
|
||||
),
|
||||
(str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
|
||||
(str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
|
||||
],
|
||||
)
|
||||
def test_na_values_uint64(all_parsers, data, kwargs, expected, request):
|
||||
# see gh-14983
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and "na_values" in kwargs:
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
return
|
||||
elif parser.engine == "pyarrow":
|
||||
mark = pytest.mark.xfail(reason="Returns float64 instead of object")
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_na_values_no_default_with_index(all_parsers):
|
||||
# see gh-15835
|
||||
data = "a,1\nb,2"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
|
||||
)
|
||||
def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request):
|
||||
# see gh-5239
|
||||
#
|
||||
# Don't parse NA-values in index unless na_filter=True
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,,3\n4,5,6"
|
||||
|
||||
if parser.engine == "pyarrow" and na_filter is False:
|
||||
mark = pytest.mark.xfail(reason="mismatched index result")
|
||||
request.applymarker(mark)
|
||||
|
||||
expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
|
||||
result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_inf_na_values_with_int_index(all_parsers):
|
||||
# see gh-17128
|
||||
parser = all_parsers
|
||||
data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
|
||||
|
||||
# Don't fail with OverflowError with inf's and integer index column.
|
||||
out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
|
||||
expected = DataFrame(
|
||||
{"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
|
||||
)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # mismatched shape
|
||||
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
    # see gh-20377
    parser = all_parsers
    data = "a,b,c\n1,,3\n4,5,6"

    # na_filter=True --> missing value becomes NaN.
    # na_filter=False --> missing value remains empty string.
    empty = np.nan if na_filter else ""
    expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})

    result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # mismatched exception message
@pytest.mark.parametrize(
    "data, na_values",
    [
        ("false,1\n,1\ntrue", None),
        ("false,1\nnull,1\ntrue", None),
        ("false,1\nnan,1\ntrue", None),
        ("false,1\nfoo,1\ntrue", "foo"),
        ("false,1\nfoo,1\ntrue", ["foo"]),
        ("false,1\nfoo,1\ntrue", {"a": "foo"}),
    ],
)
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    parser = all_parsers
    msg = "|".join(
        [
            "Bool column has NA values in column [0a]",
            "cannot safely convert passed user dtype of "
            "bool for object dtyped data in column 0",
        ]
    )

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            StringIO(data),
            header=None,
            names=["a", "b"],
            dtype={"a": "bool"},
            na_values=na_values,
        )


# TODO: this test isn't about the na_values keyword, it is about the empty entries
# being returned with NaN entries, whereas the pyarrow engine returns "nan"
@xfail_pyarrow  # mismatched shapes
def test_str_nan_dropped(all_parsers):
    # see gh-21131
    parser = all_parsers

    data = """File: small.csv,,
10010010233,0123,654
foo,,bar
01001000155,4530,898"""

    result = parser.read_csv(
        StringIO(data),
        header=None,
        names=["col1", "col2", "col3"],
        dtype={"col1": str, "col2": str, "col3": str},
    ).dropna()

    expected = DataFrame(
        {
            "col1": ["10010010233", "01001000155"],
            "col2": ["0123", "4530"],
            "col3": ["654", "898"],
        },
        index=[1, 3],
    )

    tm.assert_frame_equal(result, expected)


def test_nan_multi_index(all_parsers):
    # GH 42446
    parser = all_parsers
    data = "A,B,B\nX,Y,Z\n1,2,inf"

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine doesn't support passing a dict for na_values"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
            )
        return

    result = parser.read_csv(
        StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
    )

    expected = DataFrame(
        {
            ("A", "X"): [1],
            ("B", "Y"): [2],
            ("B", "Z"): [np.nan],
        }
    )

    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # Failed: DID NOT RAISE <class 'ValueError'>; it casts the NaN to False
def test_bool_and_nan_to_bool(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    with pytest.raises(ValueError, match="NA values"):
        parser.read_csv(StringIO(data), dtype="bool")


def test_bool_and_nan_to_int(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    with pytest.raises(ValueError, match="convert|NoneType"):
        parser.read_csv(StringIO(data), dtype="int")


def test_bool_and_nan_to_float(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    result = parser.read_csv(StringIO(data), dtype="float")
    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,327 @@
"""
Tests the parsers' ability to read and parse non-local files;
these tests hence require a network connection to run.
"""
from io import BytesIO
import logging
import re

import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import DataFrame
import pandas._testing as tm

from pandas.io.feather_format import read_feather
from pandas.io.parsers import read_csv

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.mark.network
@pytest.mark.single_cpu
@pytest.mark.parametrize("mode", ["explicit", "infer"])
@pytest.mark.parametrize("engine", ["python", "c"])
def test_compressed_urls(
    httpserver,
    datapath,
    salaries_table,
    mode,
    engine,
    compression_only,
    compression_to_extension,
):
    # test reading compressed urls with various engines and
    # extension inference
    if compression_only == "tar":
        pytest.skip("TODO: Add tar salaries.csv to pandas/io/parsers/data")

    extension = compression_to_extension[compression_only]
    with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f:
        httpserver.serve_content(content=f.read())

    url = httpserver.url + "/salaries.csv" + extension

    if mode != "explicit":
        # reuse the compression variable to pass "infer" straight through
        compression_only = mode

    url_table = read_csv(url, sep="\t", compression=compression_only, engine=engine)
    tm.assert_frame_equal(url_table, salaries_table)
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_encoding_csv(httpserver, datapath):
    """
    read_csv should honor the requested encoding for URLs.

    GH 10424
    """
    with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f:
        httpserver.serve_content(content=f.read())
        df = read_csv(httpserver.url, encoding="latin-1", header=None)
        assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"


@pytest.fixture
def tips_df(datapath):
    """DataFrame with the tips dataset."""
    return read_csv(datapath("io", "data", "csv", "tips.csv"))


@pytest.mark.single_cpu
@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3:
    def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # more of an integration test due to the not-public contents portion
        # can probably mock this though.
        pytest.importorskip("s3fs")
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so):
        # Read public file from bucket with not-public contents
        pytest.importorskip("s3fs")
        df = read_csv(
            f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so
        )
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # Read from AWS s3 as "s3n" URL
        df = read_csv(
            f"s3n://{s3_public_bucket_with_data.name}/tips.csv",
            nrows=10,
            storage_options=s3so,
        )
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # Read from AWS s3 as "s3a" URL
        df = read_csv(
            f"s3a://{s3_public_bucket_with_data.name}/tips.csv",
            nrows=10,
            storage_options=s3so,
        )
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_nrows(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_chunked(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Read with a chunksize
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them
                    # properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_chunked_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Read with a chunksize using the Python parser
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                engine="python",
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                engine="python",
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so):
        for ext in ["", ".gz", ".bz2"]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                engine="python",
                compression="infer",
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3_bucket_nrows_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                engine="python",
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_read_s3_fails(self, s3so):
        msg = "The specified bucket does not exist"
        with pytest.raises(OSError, match=msg):
            read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)

    def test_read_s3_fails_private(self, s3_private_bucket, s3so):
        msg = "The specified bucket does not exist"
        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        with pytest.raises(OSError, match=msg):
            read_csv(f"s3://{s3_private_bucket.name}/file.csv")

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_csv_fails(self, tips_df, s3so):
        # GH 32486
        # Attempting to write to an invalid S3 path should raise
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)

        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_csv(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
            )

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_parquet_fails(self, tips_df, s3so):
        # GH 27679
        # Attempting to write to an invalid S3 path should raise
        pytest.importorskip("pyarrow")
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)

        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_parquet(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
                storage_options=s3so,
            )

    @pytest.mark.single_cpu
    def test_read_csv_handles_boto_s3_object(
        self, s3_public_bucket_with_data, tips_file
    ):
        # see gh-16135

        s3_object = s3_public_bucket_with_data.Object("tips.csv")

        with BytesIO(s3_object.get()["Body"].read()) as buffer:
            result = read_csv(buffer, encoding="utf8")
        assert isinstance(result, DataFrame)
        assert not result.empty

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.single_cpu
    def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
        # 8 MB, S3FS uses 5MB chunks
        df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
        with BytesIO(df.to_csv().encode("utf-8")) as buf:
            s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
            uri = f"{s3_public_bucket.name}/large-file.csv"
            match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
            with caplog.at_level(logging.DEBUG, logger="s3fs"):
                read_csv(
                    f"s3://{uri}",
                    nrows=5,
                    storage_options=s3so,
                )
                for log in caplog.messages:
                    if match := re.match(match_re, log):
                        # Less than 8 MB
                        assert int(match.group("stop")) < 8000000

    def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
        # GH 25945
        result = read_csv(
            f"s3://{s3_public_bucket_with_data.name}/tips#1.csv", storage_options=s3so
        )
        tm.assert_frame_equal(tips_df, result)

    def test_read_feather_s3_file_path(
        self, s3_public_bucket_with_data, feather_file, s3so
    ):
        # GH 29055
        pytest.importorskip("pyarrow")
        expected = read_feather(feather_file)
        res = read_feather(
            f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather",
            storage_options=s3so,
        )
        tm.assert_frame_equal(expected, res)
File diff suppressed because it is too large
@ -0,0 +1,566 @@
"""
Tests that apply specifically to the Python parser. Unless specifically
stated as a Python-specific issue, the goal is to move as many of these
tests as possible out of this module as soon as the C parser can accept
further arguments when parsing.
"""
from __future__ import annotations

import csv
from io import (
    BytesIO,
    StringIO,
    TextIOWrapper,
)
from typing import TYPE_CHECKING

import numpy as np
import pytest

from pandas.errors import (
    ParserError,
    ParserWarning,
)

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

if TYPE_CHECKING:
    from collections.abc import Iterator


def test_default_separator(python_parser_only):
    # see gh-17333
    #
    # csv.Sniffer in Python treats "o" as separator.
    data = "aob\n1o2\n3o4"
    parser = python_parser_only
    expected = DataFrame({"a": [1, 3], "b": [2, 4]})

    result = parser.read_csv(StringIO(data), sep=None)
    tm.assert_frame_equal(result, expected)
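

# Editorial sketch, not part of the original suite: sep=None makes the Python
# engine delegate delimiter detection to the stdlib csv.Sniffer, which is why
# "o" is picked up as the separator above.
def test_default_separator_matches_stdlib_sniffer():
    dialect = csv.Sniffer().sniff("aob\n1o2\n3o4")
    assert dialect.delimiter == "o"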


@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
    # see gh-15925 (comment)
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter must be an integer"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=skipfooter)


def test_invalid_skipfooter_negative(python_parser_only):
    # see gh-15925 (comment)
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter cannot be negative"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=-1)


@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
def test_sniff_delimiter(python_parser_only, kwargs):
    data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    parser = python_parser_only
    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


def test_sniff_delimiter_comment(python_parser_only):
    data = """# comment line
index|A|B|C
# comment line
foo|1|2|3 # ignore | this
bar|4|5|6
baz|7|8|9
"""
    parser = python_parser_only
    result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#")
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    parser = python_parser_only
    data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""

    if encoding is not None:
        data = data.encode(encoding)
        data = BytesIO(data)
        data = TextIOWrapper(data, encoding=encoding)
    else:
        data = StringIO(data)

    result = parser.read_csv(
        data, index_col=0, sep=None, skiprows=2, encoding=encoding
    )
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


def test_single_line(python_parser_only):
    # see gh-6607: sniff separator
    parser = python_parser_only
    result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)

    expected = DataFrame({"a": [1], "b": [2]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
def test_skipfooter(python_parser_only, kwargs):
    # see gh-6607
    data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
    parser = python_parser_only
    result = parser.read_csv(StringIO(data), **kwargs)

    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
)
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
    # see gh-6607
    parser = python_parser_only

    with open(csv1, "rb") as f:
        data = f.read()

    data = data.replace(b",", b"::")
    expected = parser.read_csv(csv1)

    module = pytest.importorskip(compression)
    klass = getattr(module, klass)

    with tm.ensure_clean() as path:
        with klass(path, mode="wb") as tmp:
            tmp.write(data)

        result = parser.read_csv(path, sep="::", compression=compression)
        tm.assert_frame_equal(result, expected)


def test_read_csv_buglet_4x_multi_index(python_parser_only):
    # see gh-6607
    data = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
    parser = python_parser_only

    expected = DataFrame(
        [
            [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
            [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
            [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
        ],
        columns=["A", "B", "C", "D", "E"],
        index=MultiIndex.from_tuples(
            [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
            names=["one", "two", "three", "four"],
        ),
    )
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


def test_read_csv_buglet_4x_multi_index2(python_parser_only):
    # see gh-6893
    data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
    parser = python_parser_only

    expected = DataFrame.from_records(
        [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
        columns=list("abcABC"),
        index=list("abc"),
    )
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
    # see gh-6971
    data = "1#2\n3#4"
    parser = python_parser_only
    expected = DataFrame({"a": [1.2, 3.4]})

    if add_footer:
        # The stray footer line should not mess with the
        # casting of the first two lines if we skip it.
        kwargs = {"skipfooter": 1}
        data += "\nFooter"
    else:
        kwargs = {}

    result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
    "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    # see gh-3404
    expected = DataFrame({"a": [1], "b": [2]})
    parser = python_parser_only

    data = "1" + sep + "2"
    encoded_data = data.encode(encoding)

    result = parser.read_csv(
        BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
    # see gh-13374
    kwargs = {"sep": ",,"}
    parser = python_parser_only

    data = 'a,,b\n1,,a\n2,,"2,,b"'

    if quoting == csv.QUOTE_NONE:
        msg = "Expected 2 fields in line 3, saw 3"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
    else:
        msg = "ignored when a multi-char delimiter is used"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), quoting=quoting, **kwargs)


def test_none_delimiter(python_parser_only):
    # see gh-13374 and gh-17465
    parser = python_parser_only
    data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
    expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})

    # We expect the third line in the data to be
    # skipped because it is malformed, but we do
    # not expect any errors to occur.
    with tm.assert_produces_warning(
        ParserWarning, match="Skipping line 3", check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), header=0, sep=None, on_bad_lines="warn"
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
    # see gh-13879 and gh-15910
    parser = python_parser_only
    if skipfooter:
        msg = "parsing errors in the skipped footer rows"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), skipfooter=skipfooter)
    else:
        msg = "unexpected end of data|expected after"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), skipfooter=skipfooter)


def test_malformed_skipfooter(python_parser_only):
    parser = python_parser_only
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
    msg = "Expected 3 fields in line 4, saw 5"
    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)


def test_python_engine_file_no_next(python_parser_only):
    parser = python_parser_only

    class NoNextBuffer:
        def __init__(self, csv_data) -> None:
            self.data = csv_data

        def __iter__(self) -> Iterator:
            return self.data.__iter__()

        def read(self):
            return self.data

        def readline(self):
            return self.data

    parser.read_csv(NoNextBuffer("a\n1"))


@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)
    result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
    tm.assert_frame_equal(result, expected)
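

# Editorial sketch, not part of the original suite, spelling out the callable
# contract exercised above: the function receives the mis-parsed row as a list
# of strings and returns either a replacement row or None to drop it.
def test_on_bad_lines_callable_contract_sketch(python_parser_only):
    parser = python_parser_only
    data = "a,b\n1,2\n3,4,5\n6,7\n"
    result = parser.read_csv(
        StringIO(data), on_bad_lines=lambda row: row[:2]  # truncate long rows
    )
    expected = DataFrame({"a": [1, 3, 6], "b": [2, 4, 7]})
    tm.assert_frame_equal(result, expected)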


def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)
    lst = []

    def bad_line_func(bad_line: list[str]) -> list[str]:
        lst.append(bad_line)
        return ["2", "3"]

    result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
    tm.assert_frame_equal(result, expected)
    assert lst == [["2", "3", "4", "5", "6"]]


@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
@pytest.mark.parametrize("sep", [",", "111"])
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
    # GH 5686
    # iterator=True has a separate code path from iterator=False
    parser = python_parser_only
    data = f"""
0{sep}1
hi{sep}there
foo{sep}bar{sep}baz
good{sep}bye
"""
    bad_sio = StringIO(data)
    result_iter = parser.read_csv(
        bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
    )
    expecteds = [
        {"0": "hi", "1": "there"},
        {"0": "foo", "1": "bar"},
        {"0": "good", "1": "bye"},
    ]
    for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
        expected = DataFrame(expected, index=range(i, i + 1))
        tm.assert_frame_equal(result, expected)


def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)
    msg = "This function is buggy."

    def bad_line_func(bad_line):
        raise ValueError(msg)

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(bad_sio, on_bad_lines=bad_line_func)


def test_on_bad_lines_callable_not_expected_length(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)

    result = parser.read_csv_check_warnings(
        ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
    )
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_callable_returns_none(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)

    result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
    expected = DataFrame({"a": [1, 3], "b": [2, 4]})
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_index_col_inferred(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2,3
4,5,6
"""
    bad_sio = StringIO(data)

    result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
    expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
    tm.assert_frame_equal(result, expected)


def test_index_col_false_and_header_none(python_parser_only):
    # GH#46955
    parser = python_parser_only
    data = """
0.5,0.03
0.1,0.2,0.3,2
"""
    result = parser.read_csv_check_warnings(
        ParserWarning,
        "Length of header",
        StringIO(data),
        sep=",",
        header=None,
        index_col=False,
    )
    expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]})
    tm.assert_frame_equal(result, expected)


def test_header_int_do_not_infer_multiindex_names_on_different_line(
    python_parser_only,
):
    # GH#46569
    parser = python_parser_only
    data = StringIO("a\na,b\nc,d,e\nf,g,h")
    result = parser.read_csv_check_warnings(
        ParserWarning, "Length of header", data, engine="python", index_col=False
    )
    expected = DataFrame({"a": ["a", "c", "f"]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
    # GH#50270
    parser = python_parser_only
    data = """\
a;b;c
0000.7995;16.000;0
3.03.001.00514;0;4.000
4923.600.041;23.000;131"""
    result = parser.read_csv(
        StringIO(data),
        sep=";",
        dtype=dtype,
        thousands=".",
    )
    expected = DataFrame(
        {
            "a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
            "b": [16000, 0, 23000],
            "c": [0, 4000, 131],
        }
    )
    if dtype["a"] == object:
        expected["a"] = expected["a"].astype(object)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype,expected",
    [
        (
            {"a": str, "b": np.float64, "c": np.int64},
            DataFrame(
                {
                    "b": [16000.1, 0, 23000],
                    "c": [0, 4001, 131],
                }
            ),
        ),
        (
            str,
            DataFrame(
                {
                    "b": ["16,000.1", "0", "23,000"],
                    "c": ["0", "4,001", "131"],
                }
            ),
        ),
    ],
)
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
    # GH#50270
    parser = python_parser_only
    data = """a;b;c
0000,7995;16,000.1;0
3,03,001,00514;0;4,001
4923,600,041;23,000;131
"""
    result = parser.read_csv(
        StringIO(data),
        sep=";",
        dtype=dtype,
        thousands=",",
    )
    expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,183 @@
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""

import csv
from io import StringIO

import pytest

from pandas.compat import PY311
from pandas.errors import ParserError

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'),
        (
            {"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
            "quotechar must be set if quoting enabled",
        ),
        ({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'),
    ],
)
@skip_pyarrow  # ParserError: CSV parse error: Empty CSV file or block
def test_bad_quote_char(all_parsers, kwargs, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


@pytest.mark.parametrize(
    "quoting,msg",
    [
        ("foo", '"quoting" must be an integer|Argument'),
        (10, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
    ],
)
@xfail_pyarrow  # ValueError: The 'quoting' option is not supported
def test_bad_quoting(all_parsers, quoting, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), quoting=quoting)


def test_quote_char_basic(all_parsers):
    parser = all_parsers
    data = 'a,b,c\n1,2,"cat"'
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), quotechar='"')
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    parser = all_parsers
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    data = 'a,b,c\n1,2,"cat"'
    new_data = data.replace('"', quote_char)

    result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: The 'quoting' option is not supported
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    kwargs = {"quotechar": quote_char, "quoting": quoting}
    data = "a,b,c\n1,2,3"
    parser = all_parsers

    if quoting != csv.QUOTE_NONE:
        # Sanity checking.
        msg = (
            '"quotechar" must be a 1-character string'
            if PY311 and all_parsers.engine == "python" and quote_char == ""
            else "quotechar must be set if quoting enabled"
        )

        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    elif not (PY311 and all_parsers.engine == "python"):
        # Python 3.11+ doesn't support null/blank quote chars in its csv parser
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,exp_data",
    [
        ({}, [[1, 2, "foo"]]),  # Test default.
        # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
        ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),
        # QUOTE_ALL only applies to CSV writing, so no effect on reading.
        ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),
        # QUOTE_NONE tells the reader to do no special handling
        # of quote characters and leave them alone.
        ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),
        # QUOTE_NONNUMERIC tells the reader to cast
        # all non-quoted fields to float
        ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]),
    ],
)
@xfail_pyarrow  # ValueError: The 'quoting' option is not supported
def test_quoting_various(all_parsers, kwargs, exp_data):
    data = '1,2,"foo"'
    parser = all_parsers
    columns = ["a", "b", "c"]

    result = parser.read_csv(StringIO(data), names=columns, **kwargs)
    expected = DataFrame(exp_data, columns=columns)
    tm.assert_frame_equal(result, expected)
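

# Editorial sketch, not part of the original suite: the QUOTE_NONNUMERIC read
# semantics exercised above mirror the stdlib csv module, which casts every
# unquoted field to float.
def test_quote_nonnumeric_matches_stdlib_sketch():
    rows = list(csv.reader(StringIO('1,2,"foo"'), quoting=csv.QUOTE_NONNUMERIC))
    assert rows == [[1.0, 2.0, "foo"]]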


@pytest.mark.parametrize(
    "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data, request):
    parser = all_parsers
    data = 'a,b\n3,"4 "" 5"'

    if parser.engine == "pyarrow" and not doublequote:
        mark = pytest.mark.xfail(reason="Mismatched result")
        request.applymarker(mark)

    result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote)
    expected = DataFrame(exp_data, columns=["a", "b"])
    tm.assert_frame_equal(result, expected)
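

# Editorial sketch, not part of the original suite: doublequote=True follows
# the stdlib csv convention where "" inside a quoted field collapses to a
# single quote character.
def test_doublequote_matches_stdlib_sketch():
    rows = list(csv.reader(StringIO('a,b\n3,"4 "" 5"'), doublequote=True))
    assert rows == [["a", "b"], ["3", '4 " 5']]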


@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
    # see gh-14477
    data = "a\n1"
    parser = all_parsers
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(StringIO(data), quotechar=quotechar)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced, request):
    # see gh-22789.
    parser = all_parsers
    data = 'a,b,c\n1,2,"3'

    if parser.engine == "pyarrow" and not balanced:
        mark = pytest.mark.xfail(reason="Mismatched result")
        request.applymarker(mark)

    if balanced:
        # Re-balance the quoting and read in without errors.
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data + '"'))
        tm.assert_frame_equal(result, expected)
    else:
        msg = (
            "EOF inside string starting at row 1"
            if parser.engine == "c"
            else "unexpected end of data"
        )

        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
lib/python3.11/site-packages/pandas/tests/io/parser/test_read_fwf.py
File diff suppressed because it is too large
@ -0,0 +1,334 @@
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""

from datetime import datetime
from io import StringIO

import numpy as np
import pytest

from pandas.errors import EmptyDataError

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
def test_skip_rows_bug(all_parsers, skiprows):
    # see gh-505
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    result = parser.read_csv(
        StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
    )
    index = Index(
        [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
    )

    expected = DataFrame(
        np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_deep_skip_rows(all_parsers):
    # see gh-4382
    parser = all_parsers
    data = "a,b,c\n" + "\n".join(
        [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
    )
    condensed_data = "a,b,c\n" + "\n".join(
        [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
    )

    result = parser.read_csv(StringIO(data), skiprows=[6, 8])
    condensed_result = parser.read_csv(StringIO(condensed_data))
    tm.assert_frame_equal(result, condensed_result)


@xfail_pyarrow  # AssertionError: DataFrame are different
def test_skip_rows_blank(all_parsers):
    # see gh-9832
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c

#foo,a,b,c
#foo,a,b,c

1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    data = parser.read_csv(
        StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
    )
    index = Index(
        [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
    )

    expected = DataFrame(
        np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
    )
    tm.assert_frame_equal(data, expected)


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
            {"skiprows": [1]},
            DataFrame(
                [[2, "line 21\nline 22", 2], [3, "line 31", 1]],
                columns=["id", "text", "num_lines"],
            ),
        ),
        (
            "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
            {"quotechar": "~", "skiprows": [2]},
            DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
        ),
        (
            (
                "Text,url\n~example\n "
                "sentence\n one~,url1\n~"
                "example\n sentence\n two~,url2\n~"
                "example\n sentence\n three~,url3"
            ),
            {"quotechar": "~", "skiprows": [1, 3]},
            DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
        ),
    ],
)
@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_row_with_quote(all_parsers):
    # see gh-12775 and gh-10911
    parser = all_parsers
    data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""

    exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])

    result = parser.read_csv(StringIO(data), skiprows=[1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            """id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
            [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
            [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
            [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
        ),
    ],
)
@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), skiprows=[1])

    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: The 'delim_whitespace' option is not supported
@pytest.mark.parametrize(
    "lineterminator",
    ["\n", "\r\n", "\r"],  # LF, CRLF, CR
)
def test_skiprows_lineterminator(all_parsers, lineterminator, request):
    # see gh-9079
    parser = all_parsers
    data = "\n".join(
        [
            "SMOSMANIA ThetaProbe-ML2X ",
            "2007/01/01 01:00 0.2140 U M ",
            "2007/01/01 02:00 0.2141 M O ",
            "2007/01/01 04:00 0.2142 D M ",
        ]
    )
    expected = DataFrame(
        [
            ["2007/01/01", "01:00", 0.2140, "U", "M"],
            ["2007/01/01", "02:00", 0.2141, "M", "O"],
            ["2007/01/01", "04:00", 0.2142, "D", "M"],
        ],
        columns=["date", "time", "var", "flag", "oflag"],
    )

    if parser.engine == "python" and lineterminator == "\r":
        mark = pytest.mark.xfail(reason="'CR' not respected by the Python parser yet")
        request.applymarker(mark)

    data = data.replace("\n", lineterminator)

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data),
            skiprows=1,
            delim_whitespace=True,
            names=["date", "time", "var", "flag", "oflag"],
        )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # AssertionError: DataFrame are different
def test_skiprows_infield_quote(all_parsers):
    # see gh-14459
    parser = all_parsers
    data = 'a"\nb"\na\n1'
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(StringIO(data), skiprows=2)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        ({}, DataFrame({"1": [3, 5]})),
        ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
    ],
)
def test_skip_rows_callable(all_parsers, kwargs, expected):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
    tm.assert_frame_equal(result, expected)
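

# Editorial sketch, not part of the original suite: the skiprows callable
# receives the 0-based row number with the header line counted as row 0, so
# x % 2 == 0 above also drops the original header and promotes row 1 ("1")
# to the header when no names are supplied.
@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_callable_indexing_sketch(all_parsers):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    result = parser.read_csv(StringIO(data), skiprows=lambda x: x in [0, 2])
    # Row 0 (the "a" header) and row 2 ("2") are gone; "1" becomes the header.
    expected = DataFrame({"1": [3, 4, 5]})
    tm.assert_frame_equal(result, expected)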


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_callable_not_in(all_parsers):
    parser = all_parsers
    data = "0,a\n1,b\n2,c\n3,d\n4,e"
    expected = DataFrame([[1, "b"], [3, "d"]])

    result = parser.read_csv(
        StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_skip_all(all_parsers):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"
    msg = "No columns to parse from file"

    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: True)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_bad_callable(all_parsers):
    msg = "by zero"
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    with pytest.raises(ZeroDivisionError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_and_n_rows(all_parsers):
    # GH#44021
    data = """a,b
1,a
2,b
3,c
4,d
5,e
6,f
7,g
8,h
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
    expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_skip_rows_with_chunks(all_parsers):
    # GH 55677
    data = """col_a
10
20
30
40
50
60
70
80
90
100
"""
    parser = all_parsers
    # Note: the original passed engine=parser (the fixture object) here, which
    # the fixture's read_csv overrides anyway; it has been dropped as a bug.
    reader = parser.read_csv(
        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
    )
    df1 = next(reader)
    df2 = next(reader)

    tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))
@ -0,0 +1,342 @@
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
from io import (
    BytesIO,
    StringIO,
)

import numpy as np
import pytest

import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
from pandas.errors import ParserWarning

from pandas import DataFrame
import pandas._testing as tm

from pandas.io.parsers import (
    TextFileReader,
    read_csv,
)
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs


class TestTextReader:
    @pytest.fixture
    def csv_path(self, datapath):
        return datapath("io", "data", "csv", "test1.csv")

    def test_file_handle(self, csv_path):
        with open(csv_path, "rb") as f:
            reader = TextReader(f)
            reader.read()

    def test_file_handle_mmap(self, csv_path):
        # this was never using memory_map=True
        with open(csv_path, "rb") as f:
            reader = TextReader(f, header=None)
            reader.read()

    def test_StringIO(self, csv_path):
        with open(csv_path, "rb") as f:
            text = f.read()
        src = BytesIO(text)
        reader = TextReader(src, header=None)
        reader.read()

    def test_string_factorize(self):
        # should this be optional?
        data = "a\nb\na\nb\na"
        reader = TextReader(StringIO(data), header=None)
        result = reader.read()
        assert len(set(map(id, result[0]))) == 2

    def test_skipinitialspace(self):
        data = "a, b\na, b\na, b\na, b"

        reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(
            result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
        )
        tm.assert_numpy_array_equal(
            result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
        )

    def test_parse_booleans(self):
        data = "True\nFalse\nTrue\nTrue"

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        assert result[0].dtype == np.bool_

    def test_delimit_whitespace(self):
        data = 'a b\na\t\t "b"\n"a"\t \t b'

        reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(
            result[0], np.array(["a", "a", "a"], dtype=np.object_)
        )
        tm.assert_numpy_array_equal(
            result[1], np.array(["b", "b", "b"], dtype=np.object_)
        )

    def test_embedded_newline(self):
        data = 'a\n"hello\nthere"\nthis'

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
        tm.assert_numpy_array_equal(result[0], expected)

    def test_euro_decimal(self):
        data = "12345,67\n345,678"

        reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
        result = reader.read()

        expected = np.array([12345.67, 345.678])
        tm.assert_almost_equal(result[0], expected)

    def test_integer_thousands(self):
        data = "123,456\n12,500"

        reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
        result = reader.read()

        expected = np.array([123456, 12500], dtype=np.int64)
        tm.assert_almost_equal(result[0], expected)

    def test_integer_thousands_alt(self):
        data = "123.456\n12.500"

        reader = TextFileReader(
            StringIO(data), delimiter=":", thousands=".", header=None
        )
        result = reader.read()

        expected = DataFrame([123456, 12500])
        tm.assert_frame_equal(result, expected)

    def test_skip_bad_lines(self):
        # too many lines, see #2430 for why
        data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"

        reader = TextReader(StringIO(data), delimiter=":", header=None)
        msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
        with pytest.raises(parser.ParserError, match=msg):
            reader.read()

        reader = TextReader(
            StringIO(data), delimiter=":", header=None, on_bad_lines=2  # Skip
        )
        result = reader.read()
        expected = {
            0: np.array(["a", "d", "g", "l"], dtype=object),
            1: np.array(["b", "e", "h", "m"], dtype=object),
            2: np.array(["c", "f", "i", "n"], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

        with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
            reader = TextReader(
                StringIO(data), delimiter=":", header=None, on_bad_lines=1  # Warn
            )
            reader.read()

    def test_header_not_enough_lines(self):
        data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"

        reader = TextReader(StringIO(data), delimiter=",", header=2)
        header = reader.header
        expected = [["a", "b", "c"]]
        assert header == expected

        recs = reader.read()
        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array([2, 5], dtype=np.int64),
            2: np.array([3, 6], dtype=np.int64),
        }
        assert_array_dicts_equal(recs, expected)

    def test_escapechar(self):
        data = '\\"hello world"\n\\"hello world"\n\\"hello world"'

        reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
        result = reader.read()
        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(result, expected)

    def test_eof_has_eol(self):
        # handling of new line at EOF
        pass

    def test_na_substitution(self):
        pass

    def test_numpy_string_dtype(self):
        data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

        def _make_reader(**kwds):
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", header=None, **kwds)

        reader = _make_reader(dtype="S5,i4")
        result = reader.read()

        assert result[0].dtype == "S5"

        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "i4"

        reader = _make_reader(dtype="S4")
        result = reader.read()
        assert result[0].dtype == "S4"
        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "S4"

    def test_pass_dtype(self):
        data = """\
one,two
1,a
2,b
3,c
4,d"""

        def _make_reader(**kwds):
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(dtype={"one": "u1", 1: "S1"})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "S1"

        reader = _make_reader(dtype={"one": np.uint8, 1: object})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

        reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

    def test_usecols(self):
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(usecols=(1, 2))
        result = reader.read()

        exp = _make_reader().read()
        assert len(result) == 2
        assert (result[1] == exp[1]).all()
        assert (result[2] == exp[2]).all()

    @pytest.mark.parametrize(
        "text, kwargs",
        [
            ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
            (
                "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12",
                {"delim_whitespace": True},
            ),
            ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
            (
                (
                    "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
                    "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
                    ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
                ),
                {"delimiter": ","},
            ),
            ("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}),
            ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
        ],
    )
    def test_cr_delimited(self, text, kwargs):
        nice_text = text.replace("\r", "\r\n")
        result = TextReader(StringIO(text), **kwargs).read()
        expected = TextReader(StringIO(nice_text), **kwargs).read()
        assert_array_dicts_equal(result, expected)

    def test_empty_field_eof(self):
        data = "a,b,c\n1,2,3\n4,,"

        result = TextReader(StringIO(data), delimiter=",").read()

        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array(["2", ""], dtype=object),
            2: np.array(["3", ""], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

    @pytest.mark.parametrize("repeat", range(10))
    def test_empty_field_eof_mem_access_bug(self, repeat):
        # GH5664
        a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
        c = DataFrame(
            [
                [1, 2, 3, 4],
                [6, np.nan, np.nan, np.nan],
                [8, 9, 10, 11],
                [13, 14, np.nan, np.nan],
            ],
            columns=list("abcd"),
            index=[0, 5, 7, 12],
        )

        df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
        tm.assert_frame_equal(df, a)

        df = read_csv(
            StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
        )
        tm.assert_frame_equal(df, b)

        df = read_csv(
            StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
            names=list("abcd"),
            engine="c",
        )
        tm.assert_frame_equal(df, c)

    def test_empty_csv_input(self):
        # GH14867
        with read_csv(
            StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
        ) as df:
            assert isinstance(df, TextFileReader)


def assert_array_dicts_equal(left, right):
    for k, v in left.items():
        tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))
@ -0,0 +1,226 @@
|
||||
"""
|
||||
Tests that features that are currently unsupported in
|
||||
either the Python or C parser are actually enforced
|
||||
and are clearly communicated to the user.
|
||||
|
||||
Ultimately, the goal is to remove test cases from this
|
||||
test suite as new feature support is added to the parsers.
|
||||
"""
|
||||
from io import StringIO
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.errors import ParserError
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.parsers import read_csv
|
||||
import pandas.io.parsers.readers as parsers
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
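# Illustrative sketch (an assumption about current behavior, not asserted
# verbatim by this suite): pinning an engine that lacks an option raises
# immediately, e.g.
#
#   >>> read_csv(StringIO("a,b\n1,2"), engine="c", skipfooter=1)
#   ValueError: the 'c' engine does not support skipfooter
#
# whereas leaving the engine unspecified lets pandas fall back and warn.
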
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    return request.param


class TestUnsupportedFeatures:
    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        data = "a b c\n1 2 3"

        for engine in ("c", "python"):
            with pytest.raises(TypeError, match="unexpected keyword"):
                read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)

    def test_c_engine(self):
        # see gh-6607
        data = "a b c\n1 2 3"
        msg = "does not support"

        depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

        # specify C engine with unsupported options (raise)
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match=depr_msg):
                read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep=r"\s")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)):
            read_csv(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=r"\s")
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep="\t", quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), skipfooter=1)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = "Error tokenizing data"

        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), sep="\\s+")
        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), engine="c", sep="\\s+")

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands=",,")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands="")

        msg = "Only length-1 line terminators supported"
        data = "a,b,c~~1,2,3~~4,5,6"
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator="~~")

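    # Note (illustrative): the contrast above is the point of the test --
    # with engine="c" pinned, unsupported options are hard errors; with no
    # engine given, pandas silently falls back to the python engine and
    # signals the switch with a ParserWarning.
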
    def test_python_engine(self, python_engine):
        from pandas.io.parsers.readers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in py_unsupported:
            msg = (
                f"The {repr(default)} option is not "
                f"supported with the {repr(python_engine)} engine"
            )

            kwargs = {default: object()}
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=python_engine, **kwargs)

    def test_python_engine_file_no_iter(self, python_engine):
        # see gh-16530
        class NoNextBuffer:
            def __init__(self, csv_data) -> None:
                self.data = csv_data

            def __next__(self):
                return self.data.__next__()

            def read(self):
                return self.data

            def readline(self):
                return self.data

        data = "a\n1"
        msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"

        with pytest.raises(TypeError, match=msg):
            read_csv(NoNextBuffer(data), engine=python_engine)

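    # Note (illustrative): NoNextBuffer deliberately omits __iter__; the
    # python engine iterates its input line by line, so a file-like object
    # that only implements read()/readline() is rejected with a TypeError.
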
    def test_pyarrow_engine(self):
        from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in pa_unsupported:
            msg = (
                f"The {repr(default)} option is not "
                f"supported with the 'pyarrow' engine"
            )
            kwargs = {default: object()}
            default_needs_bool = {"warn_bad_lines", "error_bad_lines"}
            if default == "dialect":
                kwargs[default] = "excel"  # test a random dialect
            elif default in default_needs_bool:
                kwargs[default] = True
            elif default == "on_bad_lines":
                kwargs[default] = "warn"

            warn = None
            depr_msg = None
            if "delim_whitespace" in kwargs:
                depr_msg = (
                    "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
                )
                warn = FutureWarning
            if "verbose" in kwargs:
                depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
                warn = FutureWarning

            with pytest.raises(ValueError, match=msg):
                with tm.assert_produces_warning(warn, match=depr_msg):
                    read_csv(StringIO(data), engine="pyarrow", **kwargs)

    def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
        # GH 5686
        # GH 54643
        sio = StringIO("a,b\n1,2")
        bad_lines_func = lambda x: x
        parser = all_parsers
        if all_parsers.engine not in ["python", "pyarrow"]:
            msg = (
                "on_bad_line can only be a callable "
                "function if engine='python' or 'pyarrow'"
            )
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(sio, on_bad_lines=bad_lines_func)
        else:
            parser.read_csv(sio, on_bad_lines=bad_lines_func)


def test_close_file_handle_on_invalid_usecols(all_parsers):
    # GH 45384
    parser = all_parsers

    error = ValueError
    if parser.engine == "pyarrow":
        # Raises pyarrow.lib.ArrowKeyError
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with tm.ensure_clean("test.csv") as fname:
        Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
        with tm.assert_produces_warning(False):
            with pytest.raises(error, match="col3"):
                parser.read_csv(fname, usecols=["col1", "col2", "col3"])
        # unlink fails on windows if file handles still point to it
        os.unlink(fname)


def test_invalid_file_inputs(request, all_parsers):
    # GH#45957
    parser = all_parsers
    if parser.engine == "python":
        request.applymarker(
            pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
        )

    with pytest.raises(ValueError, match="Invalid"):
        parser.read_csv([])


def test_invalid_dtype_backend(all_parsers):
    parser = all_parsers
    msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv("test", dtype_backend="numpy")
@ -0,0 +1,102 @@
import numpy as np
import pytest

from pandas._libs.parsers import (
    _maybe_upcast,
    na_values,
)

import pandas as pd
from pandas import NA
import pandas._testing as tm
from pandas.core.arrays import (
    ArrowStringArray,
    BooleanArray,
    FloatingArray,
    IntegerArray,
    StringArray,
)


def test_maybe_upcast(any_real_numpy_dtype):
    # GH#36712

    dtype = np.dtype(any_real_numpy_dtype)
    na_value = na_values[dtype]
    arr = np.array([1, 2, na_value], dtype=dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, True])
    if issubclass(dtype.type, np.integer):
        expected = IntegerArray(arr, mask=expected_mask)
    else:
        expected = FloatingArray(arr, mask=expected_mask)

    tm.assert_extension_array_equal(result, expected)

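# Aside (illustrative, public-API analogue): the masked arrays built here
# are the same structures a user gets from the nullable dtypes, e.g.
#
#   >>> pd.array([1, 2, None], dtype="Int64")
#   <IntegerArray>
#   [1, 2, <NA>]
#   Length: 3, dtype: Int64
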
def test_maybe_upcast_no_na(any_real_numpy_dtype):
    # GH#36712
    arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, False])
    if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer):
        expected = IntegerArray(arr, mask=expected_mask)
    else:
        expected = FloatingArray(arr, mask=expected_mask)

    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_bool():
    # GH#36712
    dtype = np.bool_
    na_value = na_values[dtype]
    arr = np.array([True, False, na_value], dtype="uint8").view(dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, True])
    expected = BooleanArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_bool_no_nan():
    # GH#36712
    dtype = np.bool_
    arr = np.array([True, False, False], dtype="uint8").view(dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, False])
    expected = BooleanArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_all_nan():
    # GH#36712
    dtype = np.int64
    na_value = na_values[dtype]
    arr = np.array([na_value, na_value], dtype=dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([True, True])
    expected = IntegerArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
    # GH#36712
    pa = pytest.importorskip("pyarrow")

    with pd.option_context("mode.string_storage", string_storage):
        arr = np.array(["a", "b", val], dtype=np.object_)
        result = _maybe_upcast(arr, use_dtype_backend=True)

        if string_storage == "python":
            exp_val = "c" if val == "c" else NA
            expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
        else:
            exp_val = "c" if val == "c" else None
            expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
        tm.assert_extension_array_equal(result, expected)
@ -0,0 +1,194 @@
"""
|
||||
Tests the usecols functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
_msg_pyarrow_requires_names = (
|
||||
"The pyarrow engine does not allow 'usecols' to be integer column "
|
||||
"positions. Pass a list of string column names instead."
|
||||
)
|
||||
|
||||
|
||||
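# Illustrative sketch (hedged; mirrors the message above): the pyarrow
# engine only accepts usecols as column *names*, so
#
#   pd.read_csv(buf, engine="pyarrow", usecols=["a", "c"])   # accepted
#   pd.read_csv(buf, engine="pyarrow", usecols=[0, 2])       # ValueError
#
# where buf is any CSV source with columns a, b, c.
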
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
    # see gh-9755
    data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
    parser = all_parsers
    parse_dates = [[1, 2]]

    depr_msg = (
        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
    )

    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])
    if parser.engine == "pyarrow":
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(
                    StringIO(data), usecols=usecols, parse_dates=parse_dates
                )
        return
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), usecols=usecols, parse_dates=parse_dates
        )
    tm.assert_frame_equal(result, expected)

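# Aside (illustrative): the nested parse_dates=[[1, 2]] form merges the two
# selected columns into a single "c_d" datetime column, and pandas places
# the merged column ahead of the rest -- hence columns=["c_d", "a"] above.
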
@skip_pyarrow  # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns
def test_usecols_with_parse_dates2(all_parsers):
    # see gh-13604
    parser = all_parsers
    data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""

    names = ["date", "values"]
    usecols = names[:]
    parse_dates = [0]

    index = Index(
        [
            Timestamp("2008-02-07 09:40"),
            Timestamp("2008-02-07 09:50"),
            Timestamp("2008-02-07 10:00"),
        ],
        name="date",
    )
    cols = {"values": [1032.43, 1042.54, 1051.65]}
    expected = DataFrame(cols, index=index)

    result = parser.read_csv(
        StringIO(data),
        parse_dates=parse_dates,
        index_col=0,
        usecols=usecols,
        header=None,
        names=names,
    )
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates3(all_parsers):
    # see gh-14792
    parser = all_parsers
    data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""

    usecols = list("abcdefghij")
    parse_dates = [0]

    cols = {
        "a": Timestamp("2016-09-21").as_unit("ns"),
        "b": [1],
        "c": [1],
        "d": [2],
        "e": [3],
        "f": [4],
        "g": [5],
        "h": [6],
        "i": [7],
        "j": [8],
    }
    expected = DataFrame(cols, columns=usecols)

    result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates4(all_parsers):
    data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    usecols = list("abcdefghij")
    parse_dates = [[0, 1]]
    parser = all_parsers

    cols = {
        "a_b": "2016/09/21 1",
        "c": [1],
        "d": [2],
        "e": [3],
        "f": [4],
        "g": [5],
        "h": [6],
        "i": [7],
        "j": [8],
    }
    expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))

    depr_msg = (
        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
    )
    with tm.assert_produces_warning(
        (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data),
            usecols=usecols,
            parse_dates=parse_dates,
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
    "names",
    [
        list("abcde"),  # Names span all columns in original data.
        list("acd"),  # Names span only the selected columns.
    ],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request):
    # see gh-9755
    s = """0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
    parse_dates = [[1, 2]]
    parser = all_parsers

    if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0):
        mark = pytest.mark.xfail(
            reason="Length mismatch in some cases, UserWarning in other"
        )
        request.applymarker(mark)

    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    depr_msg = (
        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
    )
    with tm.assert_produces_warning(
        (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
        )
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,96 @@
"""
|
||||
Tests the usecols functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
def test_usecols_with_unicode_strings(all_parsers):
|
||||
# see gh-13219
|
||||
data = """AAA,BBB,CCC,DDD
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
exp_data = {
|
||||
"AAA": {
|
||||
0: 0.056674972999999997,
|
||||
1: 2.6132309819999997,
|
||||
2: 3.5689350380000002,
|
||||
},
|
||||
"BBB": {0: 8, 1: 2, 2: 7},
|
||||
}
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_with_single_byte_unicode_strings(all_parsers):
|
||||
# see gh-13219
|
||||
data = """A,B,C,D
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
exp_data = {
|
||||
"A": {
|
||||
0: 0.056674972999999997,
|
||||
1: 2.6132309819999997,
|
||||
2: 3.5689350380000002,
|
||||
},
|
||||
"B": {0: 8, 1: 2, 2: 7},
|
||||
}
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
|
||||
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
|
||||
data = """AAA,BBB,CCC,DDD
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
_msg_validate_usecols_arg = (
|
||||
"'usecols' must either be list-like "
|
||||
"of all strings, all unicode, all "
|
||||
"integers or a callable."
|
||||
)
|
||||
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
|
||||
parser.read_csv(StringIO(data), usecols=usecols)
|
||||
|
||||
|
||||
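# Note (illustrative): mixing str and bytes labels fails usecols validation
# up front -- every entry must be the same flavor (all strings or all
# integers), or a single callable.
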
@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
    data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "あああ": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "いい": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,563 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas.errors import ParserError

from pandas import (
    DataFrame,
    Index,
    array,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning"
)

_msg_validate_usecols_arg = (
    "'usecols' must either be list-like "
    "of all strings, all unicode, all "
    "integers or a callable."
)
_msg_validate_usecols_names = (
    "Usecols do not match columns, columns expected but not found: {0}"
)
_msg_pyarrow_requires_names = (
    "The pyarrow engine does not allow 'usecols' to be integer column "
    "positions. Pass a list of string column names instead."
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")

def test_raise_on_mixed_dtype_usecols(all_parsers):
    # See gh-12678
    data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
    usecols = [0, "b", 2]
    parser = all_parsers

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols=usecols)


@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols, request):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    if parser.engine == "pyarrow" and isinstance(usecols[0], int):
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    result = parser.read_csv(StringIO(data), usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_names(all_parsers):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    names = ["foo", "bar"]

    if parser.engine == "pyarrow":
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
        return

    result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    if parser.engine == "pyarrow" and not isinstance(usecols[0], int):
        # ArrowKeyError: Column 'fb' in include_columns does not exist
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)

def test_usecols_relative_to_names2(all_parsers):
    # see gh-5766
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers

    result = parser.read_csv(
        StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
    )

    expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


# regex mismatch: "Length mismatch: Expected axis has 1 elements"
@xfail_pyarrow
def test_usecols_name_length_conflict(all_parsers):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    msg = "Number of passed names did not match number of header fields in the file"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])


def test_usecols_single_string(all_parsers):
    # see gh-20558
    parser = all_parsers
    data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols="foo")


@skip_pyarrow  # CSV parse error in one case, AttributeError in another
@pytest.mark.parametrize(
    "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
    # see gh-9082
    parser = all_parsers
    usecols = ["a", "c", "d"]
    expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})

    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
    tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request):
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    if parser.engine == "pyarrow" and isinstance(usecols[0], int):
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
        return

    expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))

    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
    tm.assert_frame_equal(result, expected)


def test_usecols_index_col_conflict2(all_parsers):
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
    expected = expected.set_index(["b", "c"])

    result = parser.read_csv(
        StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 4
def test_usecols_implicit_index_col(all_parsers):
    # see gh-2654
    parser = all_parsers
    data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

    result = parser.read_csv(StringIO(data), usecols=["a", "b"])
    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)

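# Aside (illustrative): in the test above each data row has one more field
# than the header, so pandas treats the unnamed leading field (4, 8) as an
# implicit index; usecols then selects among the *named* columns only.
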
def test_usecols_index_col_middle(all_parsers):
    # GH#9098
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
"""
    result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c")
    expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c"))
    tm.assert_frame_equal(result, expected)


def test_usecols_index_col_end(all_parsers):
    # GH#9098
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
"""
    result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d")
    expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d"))
    tm.assert_frame_equal(result, expected)


def test_usecols_regex_sep(all_parsers):
    # see gh-2733
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"

    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
        return

    result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))

    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)

def test_usecols_with_whitespace(all_parsers):
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = (
            "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(
                    StringIO(data), delim_whitespace=True, usecols=("a", "b")
                )
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), delim_whitespace=True, usecols=("a", "b")
        )
    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "usecols,expected",
    [
        # Column selection by index.
        ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
        # Column selection by name.
        (
            ["0", "1"],
            DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),
        ),
    ],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request):
    parser = all_parsers
    data = """2,0,1
1000,2000,3000
4000,5000,6000"""

    if parser.engine == "pyarrow" and isinstance(usecols[0], int):
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)

@xfail_pyarrow  # mismatched shape
def test_empty_usecols(all_parsers):
    data = "a,b,c\n1,2,3\n4,5,6"
    expected = DataFrame(columns=Index([]))
    parser = all_parsers

    result = parser.read_csv(StringIO(data), usecols=set())
    tm.assert_frame_equal(result, expected)


def test_np_array_usecols(all_parsers):
    # see gh-12546
    parser = all_parsers
    data = "a,b,c\n1,2,3"
    usecols = np.array(["a", "b"])

    expected = DataFrame([[1, 2]], columns=usecols)
    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "usecols,expected",
    [
        (
            lambda x: x.upper() in ["AAA", "BBB", "DDD"],
            DataFrame(
                {
                    "AaA": {
                        0: 0.056674972999999997,
                        1: 2.6132309819999997,
                        2: 3.5689350380000002,
                    },
                    "bBb": {0: 8, 1: 2, 2: 7},
                    "ddd": {0: "a", 1: "b", 2: "a"},
                }
            ),
        ),
        (lambda x: False, DataFrame(columns=Index([]))),
    ],
)
def test_callable_usecols(all_parsers, usecols, expected):
    # see gh-14154
    data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine does not allow 'usecols' to be a callable"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)

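# Sketch (hedged; documented read_csv behavior): a callable usecols is
# evaluated once per header name and keeps columns where it returns True,
# e.g. usecols=lambda name: name.upper() in {"AAA", "BBB"} -- handy when
# header capitalization is inconsistent, as in the test above.
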
# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file
@skip_pyarrow
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
    # see gh-6710
    data = "1,2\n1,2,3"
    parser = all_parsers
    names = ["a", "b", "c"]
    expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})

    result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 4
@pytest.mark.parametrize(
    "data,usecols,kwargs,expected",
    [
        # see gh-8985
        (
            "19,29,39\n" * 2 + "10,20,30,40",
            [0, 1, 2],
            {"header": None},
            DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
        ),
        # see gh-9549
        (
            ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
            ["A", "B", "C"],
            {},
            DataFrame(
                {
                    "A": [1, 3, 1, 1, 1, 5],
                    "B": [2, 4, 2, 2, 2, 6],
                    "C": [3, 5, 4, 3, 3, 7],
                }
            ),
        ),
    ],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
    # see gh-8985
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
    tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
    "usecols,kwargs,expected,msg",
    [
        (
            ["a", "b", "c", "d"],
            {},
            DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
            None,
        ),
        (
            ["a", "b", "c", "f"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
        (
            ["a", "b", "f", "g"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
        ),
        # see gh-14671
        (
            None,
            {"header": 0, "names": ["A", "B", "C", "D"]},
            DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
            None,
        ),
        (
            ["A", "B", "C", "f"],
            {"header": 0, "names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (
            ["A", "B", "f"],
            {"names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
    ],
)
def test_raises_on_usecols_names_mismatch(
    all_parsers, usecols, kwargs, expected, msg, request
):
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    kwargs.update(usecols=usecols)
    parser = all_parsers

    if parser.engine == "pyarrow" and not (
        usecols is not None and expected is not None
    ):
        # everything but the first case
        # ArrowKeyError: Column 'f' in include_columns does not exist in CSV file
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    if expected is None:
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    names = ["A", "B", "C", "D"]
    parser = all_parsers

    if parser.engine == "pyarrow":
        if isinstance(usecols[0], int):
            with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
                parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
            return
        # "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
    expected = DataFrame({"A": [1, 5], "C": [3, 7]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("names", [None, ["a", "b"]])
def test_usecols_indices_out_of_bounds(all_parsers, names):
    # GH#25623 & GH 41130; enforced in 2.0
    parser = all_parsers
    data = """
a,b
1,2
"""

    err = ParserError
    msg = "Defining usecols with out-of-bounds"
    if parser.engine == "pyarrow":
        err = ValueError
        msg = _msg_pyarrow_requires_names

    with pytest.raises(err, match=msg):
        parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)

def test_usecols_additional_columns(all_parsers):
    # GH#46997
    parser = all_parsers
    usecols = lambda header: header.strip() in ["a", "b", "c"]

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine does not allow 'usecols' to be a callable"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
        return
    result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
    expected = DataFrame({"a": ["x"], "b": "y"})
    tm.assert_frame_equal(result, expected)


def test_usecols_additional_columns_integer_columns(all_parsers):
    # GH#46997
    parser = all_parsers
    usecols = lambda header: header.strip() in ["0", "1"]
    if parser.engine == "pyarrow":
        msg = "The pyarrow engine does not allow 'usecols' to be a callable"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
        return
    result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
    expected = DataFrame({"0": ["x"], "1": "y"})
    tm.assert_frame_equal(result, expected)

def test_usecols_dtype(all_parsers):
    parser = all_parsers
    data = """
col1,col2,col3
a,1,x
b,2,y
"""
    result = parser.read_csv(
        StringIO(data),
        usecols=["col1", "col2"],
        dtype={"col1": "string", "col2": "uint8", "col3": "string"},
    )
    expected = DataFrame(
        {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
    )
    tm.assert_frame_equal(result, expected)
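# Note (illustrative): dtype may legitimately mention columns that usecols
# drops (here "col3"); entries for unselected columns are simply ignored
# rather than raising.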