lib/python3.11/site-packages/pandas/tests/io/conftest.py (new file, 225 lines)
@@ -0,0 +1,225 @@
import shlex
import subprocess
import time
import uuid

import pytest

from pandas.compat import (
    is_ci_environment,
    is_platform_arm,
    is_platform_mac,
    is_platform_windows,
)
import pandas.util._test_decorators as td

import pandas.io.common as icom
from pandas.io.parsers import read_csv


@pytest.fixture
def compression_to_extension():
    return {value: key for key, value in icom.extension_to_compression.items()}


@pytest.fixture
def tips_file(datapath):
    """Path to the tips dataset"""
    return datapath("io", "data", "csv", "tips.csv")


@pytest.fixture
def jsonl_file(datapath):
    """Path to a JSONL dataset"""
    return datapath("io", "parser", "data", "items.jsonl")


@pytest.fixture
def salaries_table(datapath):
    """DataFrame with the salaries dataset"""
    return read_csv(datapath("io", "parser", "data", "salaries.csv"), sep="\t")


@pytest.fixture
def feather_file(datapath):
    return datapath("io", "data", "feather", "feather-0_3_1.feather")


@pytest.fixture
def xml_file(datapath):
    return datapath("io", "data", "xml", "books.xml")


@pytest.fixture
def s3_base(worker_id, monkeypatch):
    """
    Fixture for mocking S3 interaction.

    Sets up a moto server in a separate process locally, or returns the URL
    of the motoserver/moto container service when running in CI.
    """
    pytest.importorskip("s3fs")
    pytest.importorskip("boto3")

    # temporary workaround as moto fails for botocore >= 1.11 otherwise,
    # see https://github.com/spulec/moto/issues/1924 & 1952
    monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
    monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
    if is_ci_environment():
        if is_platform_arm() or is_platform_mac() or is_platform_windows():
            # NOT RUN on Windows/macOS, only Ubuntu
            # - subprocess in CI can cause timeouts
            # - GitHub Actions do not support
            #   container services for the above OSs
            pytest.skip(
                "S3 tests do not have a corresponding service on "
                "Windows or macOS platforms"
            )
        else:
            # set in .github/workflows/unit-tests.yml
            yield "http://localhost:5000"
    else:
        requests = pytest.importorskip("requests")
        pytest.importorskip("moto")
        pytest.importorskip("flask")  # server mode needs flask too

        # Launching moto in server mode, i.e., as a separate process
        # with an S3 endpoint on localhost

        worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
        endpoint_port = f"555{worker_id}"
        endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"

        # pipe to null to avoid logging in terminal
        with subprocess.Popen(
            shlex.split(f"moto_server s3 -p {endpoint_port}"),
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        ) as proc:
            timeout = 5
            while timeout > 0:
                try:
                    # OK to go once server is accepting connections
                    r = requests.get(endpoint_uri)
                    if r.ok:
                        break
                except Exception:
                    pass
                timeout -= 0.1
                time.sleep(0.1)

            yield endpoint_uri

            proc.terminate()


@pytest.fixture
def s3so(s3_base):
    return {"client_kwargs": {"endpoint_url": s3_base}}


@pytest.fixture
def s3_resource(s3_base):
    import boto3

    s3 = boto3.resource("s3", endpoint_url=s3_base)
    return s3


@pytest.fixture
def s3_public_bucket(s3_resource):
    bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
    bucket.create()
    yield bucket
    bucket.objects.delete()
    bucket.delete()


@pytest.fixture
def s3_public_bucket_with_data(
    s3_public_bucket, tips_file, jsonl_file, feather_file, xml_file
):
    """
    The following datasets are loaded:

    - tips#1.csv
    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl
    - simple_dataset.feather
    - books.xml
    """
    test_s3_files = [
        ("tips#1.csv", tips_file),
        ("tips.csv", tips_file),
        ("tips.csv.gz", tips_file + ".gz"),
        ("tips.csv.bz2", tips_file + ".bz2"),
        ("items.jsonl", jsonl_file),
        ("simple_dataset.feather", feather_file),
        ("books.xml", xml_file),
    ]
    for s3_key, file_name in test_s3_files:
        with open(file_name, "rb") as f:
            s3_public_bucket.put_object(Key=s3_key, Body=f)
    return s3_public_bucket


@pytest.fixture
def s3_private_bucket(s3_resource):
    bucket = s3_resource.Bucket(f"cant_get_it-{uuid.uuid4()}")
    bucket.create(ACL="private")
    yield bucket
    bucket.objects.delete()
    bucket.delete()


@pytest.fixture
def s3_private_bucket_with_data(
    s3_private_bucket, tips_file, jsonl_file, feather_file, xml_file
):
    """
    The following datasets are loaded:

    - tips#1.csv
    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl
    - simple_dataset.feather
    - books.xml
    """
    test_s3_files = [
        ("tips#1.csv", tips_file),
        ("tips.csv", tips_file),
        ("tips.csv.gz", tips_file + ".gz"),
        ("tips.csv.bz2", tips_file + ".bz2"),
        ("items.jsonl", jsonl_file),
        ("simple_dataset.feather", feather_file),
        ("books.xml", xml_file),
    ]
    for s3_key, file_name in test_s3_files:
        with open(file_name, "rb") as f:
            s3_private_bucket.put_object(Key=s3_key, Body=f)
    return s3_private_bucket


_compression_formats_params = [
    (".no_compress", None),
    ("", None),
    (".gz", "gzip"),
    (".GZ", "gzip"),
    (".bz2", "bz2"),
    (".BZ2", "bz2"),
    (".zip", "zip"),
    (".ZIP", "zip"),
    (".xz", "xz"),
    (".XZ", "xz"),
    pytest.param((".zst", "zstd"), marks=td.skip_if_no("zstandard")),
    pytest.param((".ZST", "zstd"), marks=td.skip_if_no("zstandard")),
]


@pytest.fixture(params=_compression_formats_params[1:])
def compression_format(request):
    return request.param


@pytest.fixture(params=_compression_formats_params)
def compression_ext(request):
    return request.param[0]
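The fixtures above are consumed by the IO test modules that follow. A minimal sketch of how they compose, assuming only the fixture names and bucket contents defined above (illustrative only, not part of the committed file):

import pandas as pd


def test_read_tips_from_mock_s3(s3_public_bucket_with_data, s3so):
    # s3so points read_csv at the moto endpoint started by the s3_base fixture
    df = pd.read_csv(
        f"s3://{s3_public_bucket_with_data.name}/tips.csv",
        storage_options=s3so,
    )
    assert not df.empty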
@@ -0,0 +1,77 @@
import functools

import numpy as np
import pytest

from pandas.compat import is_platform_windows

import pandas as pd
import pandas._testing as tm

pytest.importorskip("odf")

if is_platform_windows():
    pytestmark = pytest.mark.single_cpu


@pytest.fixture(autouse=True)
def cd_and_set_engine(monkeypatch, datapath):
    func = functools.partial(pd.read_excel, engine="odf")
    monkeypatch.setattr(pd, "read_excel", func)
    monkeypatch.chdir(datapath("io", "data", "excel"))


def test_read_invalid_types_raises():
    # creating invalid_value_type.ods required manually editing
    # the included content.xml file
    with pytest.raises(ValueError, match="Unrecognized type awesome_new_type"):
        pd.read_excel("invalid_value_type.ods")


def test_read_writer_table():
    # Also test reading tables from a text OpenDocument (.odt) file
    index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header")
    expected = pd.DataFrame(
        [[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]],
        index=index,
        columns=["Column 1", "Unnamed: 2", "Column 3"],
    )

    result = pd.read_excel("writertable.odt", sheet_name="Table1", index_col=0)

    tm.assert_frame_equal(result, expected)


def test_read_newlines_between_xml_elements_table():
    # GH#45598
    expected = pd.DataFrame(
        [[1.0, 4.0, 7], [np.nan, np.nan, 8], [3.0, 6.0, 9]],
        columns=["Column 1", "Column 2", "Column 3"],
    )

    result = pd.read_excel("test_newlines.ods")

    tm.assert_frame_equal(result, expected)


def test_read_unempty_cells():
    expected = pd.DataFrame(
        [1, np.nan, 3, np.nan, 5],
        columns=["Column 1"],
    )

    result = pd.read_excel("test_unempty_cells.ods")

    tm.assert_frame_equal(result, expected)


def test_read_cell_annotation():
    expected = pd.DataFrame(
        ["test", np.nan, "test 3"],
        columns=["Column 1"],
    )

    result = pd.read_excel("test_cell_annotation.ods")

    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,106 @@
from datetime import (
    date,
    datetime,
)
import re

import pytest

from pandas.compat import is_platform_windows

import pandas as pd
import pandas._testing as tm

from pandas.io.excel import ExcelWriter

odf = pytest.importorskip("odf")

if is_platform_windows():
    pytestmark = pytest.mark.single_cpu


@pytest.fixture
def ext():
    return ".ods"


def test_write_append_mode_raises(ext):
    msg = "Append mode is not supported with odf!"

    with tm.ensure_clean(ext) as f:
        with pytest.raises(ValueError, match=msg):
            ExcelWriter(f, engine="odf", mode="a")


@pytest.mark.parametrize("engine_kwargs", [None, {"kwarg": 1}])
def test_engine_kwargs(ext, engine_kwargs):
    # GH 42286
    # GH 43445
    # test for error: OpenDocumentSpreadsheet does not accept any arguments
    with tm.ensure_clean(ext) as f:
        if engine_kwargs is not None:
            error = re.escape(
                "OpenDocumentSpreadsheet() got an unexpected keyword argument 'kwarg'"
            )
            with pytest.raises(
                TypeError,
                match=error,
            ):
                ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs)
        else:
            with ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) as _:
                pass


def test_book_and_sheets_consistent(ext):
    # GH#45687 - Ensure sheets is updated if user modifies book
    with tm.ensure_clean(ext) as f:
        with ExcelWriter(f) as writer:
            assert writer.sheets == {}
            table = odf.table.Table(name="test_name")
            writer.book.spreadsheet.addElement(table)
            assert writer.sheets == {"test_name": table}


@pytest.mark.parametrize(
    ["value", "cell_value_type", "cell_value_attribute", "cell_value"],
    argvalues=[
        (True, "boolean", "boolean-value", "true"),
        ("test string", "string", "string-value", "test string"),
        (1, "float", "value", "1"),
        (1.5, "float", "value", "1.5"),
        (
            datetime(2010, 10, 10, 10, 10, 10),
            "date",
            "date-value",
            "2010-10-10T10:10:10",
        ),
        (date(2010, 10, 10), "date", "date-value", "2010-10-10"),
    ],
)
def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value):
    # GH#54994 ODS: cell attributes should follow specification
    # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13
    from odf.namespaces import OFFICENS
    from odf.table import (
        TableCell,
        TableRow,
    )

    table_cell_name = TableCell().qname

    with tm.ensure_clean(ext) as f:
        pd.DataFrame([[value]]).to_excel(f, header=False, index=False)

        with pd.ExcelFile(f) as wb:
            sheet = wb._reader.get_sheet_by_index(0)
            sheet_rows = sheet.getElementsByType(TableRow)
            sheet_cells = [
                x
                for x in sheet_rows[0].childNodes
                if hasattr(x, "qname") and x.qname == table_cell_name
            ]

            cell = sheet_cells[0]
            assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type
            assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value
@@ -0,0 +1,432 @@
import contextlib
from pathlib import Path
import re

import numpy as np
import pytest

from pandas.compat import is_platform_windows

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm

from pandas.io.excel import (
    ExcelWriter,
    _OpenpyxlWriter,
)
from pandas.io.excel._openpyxl import OpenpyxlReader

openpyxl = pytest.importorskip("openpyxl")

if is_platform_windows():
    pytestmark = pytest.mark.single_cpu


@pytest.fixture
def ext():
    return ".xlsx"


def test_to_excel_styleconverter():
    from openpyxl import styles

    hstyle = {
        "font": {"color": "00FF0000", "bold": True},
        "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"},
        "alignment": {"horizontal": "center", "vertical": "top"},
        "fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}},
        "number_format": {"format_code": "0.00"},
        "protection": {"locked": True, "hidden": False},
    }

    font_color = styles.Color("00FF0000")
    font = styles.Font(bold=True, color=font_color)
    side = styles.Side(style=styles.borders.BORDER_THIN)
    border = styles.Border(top=side, right=side, bottom=side, left=side)
    alignment = styles.Alignment(horizontal="center", vertical="top")
    fill_color = styles.Color(rgb="006666FF", tint=0.3)
    fill = styles.PatternFill(patternType="solid", fgColor=fill_color)

    number_format = "0.00"

    protection = styles.Protection(locked=True, hidden=False)

    kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle)
    assert kw["font"] == font
    assert kw["border"] == border
    assert kw["alignment"] == alignment
    assert kw["fill"] == fill
    assert kw["number_format"] == number_format
    assert kw["protection"] == protection


def test_write_cells_merge_styled(ext):
    from pandas.io.formats.excel import ExcelCell

    sheet_name = "merge_styled"

    sty_b1 = {"font": {"color": "00FF0000"}}
    sty_a2 = {"font": {"color": "0000FF00"}}

    initial_cells = [
        ExcelCell(col=1, row=0, val=42, style=sty_b1),
        ExcelCell(col=0, row=1, val=99, style=sty_a2),
    ]

    sty_merged = {"font": {"color": "000000FF", "bold": True}}
    sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged)
    openpyxl_sty_merged = sty_kwargs["font"]
    merge_cells = [
        ExcelCell(
            col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=sty_merged
        )
    ]

    with tm.ensure_clean(ext) as path:
        with _OpenpyxlWriter(path) as writer:
            writer._write_cells(initial_cells, sheet_name=sheet_name)
            writer._write_cells(merge_cells, sheet_name=sheet_name)

            wks = writer.sheets[sheet_name]
        xcell_b1 = wks["B1"]
        xcell_a2 = wks["A2"]
        assert xcell_b1.font == openpyxl_sty_merged
        assert xcell_a2.font == openpyxl_sty_merged


@pytest.mark.parametrize("iso_dates", [True, False])
def test_engine_kwargs_write(ext, iso_dates):
    # GH 42286 GH 43445
    engine_kwargs = {"iso_dates": iso_dates}
    with tm.ensure_clean(ext) as f:
        with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer:
            assert writer.book.iso_dates == iso_dates
            # ExcelWriter won't allow us to close without writing something
            DataFrame().to_excel(writer)


def test_engine_kwargs_append_invalid(ext):
    # GH 43445
    # test whether an invalid engine kwarg actually raises
    with tm.ensure_clean(ext) as f:
        DataFrame(["hello", "world"]).to_excel(f)
        with pytest.raises(
            TypeError,
            match=re.escape(
                "load_workbook() got an unexpected keyword argument 'apple_banana'"
            ),
        ):
            with ExcelWriter(
                f, engine="openpyxl", mode="a", engine_kwargs={"apple_banana": "fruit"}
            ) as writer:
                # ExcelWriter needs us to write something to close properly
                DataFrame(["good"]).to_excel(writer, sheet_name="Sheet2")


@pytest.mark.parametrize("data_only, expected", [(True, 0), (False, "=1+1")])
def test_engine_kwargs_append_data_only(ext, data_only, expected):
    # GH 43445
    # tests whether the data_only engine_kwarg actually works well for
    # openpyxl's load_workbook
    with tm.ensure_clean(ext) as f:
        DataFrame(["=1+1"]).to_excel(f)
        with ExcelWriter(
            f, engine="openpyxl", mode="a", engine_kwargs={"data_only": data_only}
        ) as writer:
            assert writer.sheets["Sheet1"]["B2"].value == expected
            # ExcelWriter needs us to write something to close properly
            DataFrame().to_excel(writer, sheet_name="Sheet2")

        # ensure that data_only also works for reading
        # and that formulas/values roundtrip
        assert (
            pd.read_excel(
                f,
                sheet_name="Sheet1",
                engine="openpyxl",
                engine_kwargs={"data_only": data_only},
            ).iloc[0, 1]
            == expected
        )


@pytest.mark.parametrize("kwarg_name", ["read_only", "data_only"])
@pytest.mark.parametrize("kwarg_value", [True, False])
def test_engine_kwargs_append_reader(datapath, ext, kwarg_name, kwarg_value):
    # GH 55027
    # test that `read_only` and `data_only` can be passed to
    # `openpyxl.reader.excel.load_workbook` via `engine_kwargs`
    filename = datapath("io", "data", "excel", "test1" + ext)
    with contextlib.closing(
        OpenpyxlReader(filename, engine_kwargs={kwarg_name: kwarg_value})
    ) as reader:
        assert getattr(reader.book, kwarg_name) == kwarg_value


@pytest.mark.parametrize(
    "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])]
)
def test_write_append_mode(ext, mode, expected):
    df = DataFrame([1], columns=["baz"])

    with tm.ensure_clean(ext) as f:
        wb = openpyxl.Workbook()
        wb.worksheets[0].title = "foo"
        wb.worksheets[0]["A1"].value = "foo"
        wb.create_sheet("bar")
        wb.worksheets[1]["A1"].value = "bar"
        wb.save(f)

        with ExcelWriter(f, engine="openpyxl", mode=mode) as writer:
            df.to_excel(writer, sheet_name="baz", index=False)

        with contextlib.closing(openpyxl.load_workbook(f)) as wb2:
            result = [sheet.title for sheet in wb2.worksheets]
            assert result == expected

            for index, cell_value in enumerate(expected):
                assert wb2.worksheets[index]["A1"].value == cell_value


@pytest.mark.parametrize(
    "if_sheet_exists,num_sheets,expected",
    [
        ("new", 2, ["apple", "banana"]),
        ("replace", 1, ["pear"]),
        ("overlay", 1, ["pear", "banana"]),
    ],
)
def test_if_sheet_exists_append_modes(ext, if_sheet_exists, num_sheets, expected):
    # GH 40230
    df1 = DataFrame({"fruit": ["apple", "banana"]})
    df2 = DataFrame({"fruit": ["pear"]})

    with tm.ensure_clean(ext) as f:
        df1.to_excel(f, engine="openpyxl", sheet_name="foo", index=False)
        with ExcelWriter(
            f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists
        ) as writer:
            df2.to_excel(writer, sheet_name="foo", index=False)

        with contextlib.closing(openpyxl.load_workbook(f)) as wb:
            assert len(wb.sheetnames) == num_sheets
            assert wb.sheetnames[0] == "foo"
            result = pd.read_excel(wb, "foo", engine="openpyxl")
            assert list(result["fruit"]) == expected
            if len(wb.sheetnames) == 2:
                result = pd.read_excel(wb, wb.sheetnames[1], engine="openpyxl")
                tm.assert_frame_equal(result, df2)


@pytest.mark.parametrize(
    "startrow, startcol, greeting, goodbye",
    [
        (0, 0, ["poop", "world"], ["goodbye", "people"]),
        (0, 1, ["hello", "world"], ["poop", "people"]),
        (1, 0, ["hello", "poop"], ["goodbye", "people"]),
        (1, 1, ["hello", "world"], ["goodbye", "poop"]),
    ],
)
def test_append_overlay_startrow_startcol(ext, startrow, startcol, greeting, goodbye):
    df1 = DataFrame({"greeting": ["hello", "world"], "goodbye": ["goodbye", "people"]})
    df2 = DataFrame(["poop"])

    with tm.ensure_clean(ext) as f:
        df1.to_excel(f, engine="openpyxl", sheet_name="poo", index=False)
        with ExcelWriter(
            f, engine="openpyxl", mode="a", if_sheet_exists="overlay"
        ) as writer:
            # use startrow+1 because we don't have a header
            df2.to_excel(
                writer,
                index=False,
                header=False,
                startrow=startrow + 1,
                startcol=startcol,
                sheet_name="poo",
            )

        result = pd.read_excel(f, sheet_name="poo", engine="openpyxl")
        expected = DataFrame({"greeting": greeting, "goodbye": goodbye})
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "if_sheet_exists,msg",
    [
        (
            "invalid",
            "'invalid' is not valid for if_sheet_exists. Valid options "
            "are 'error', 'new', 'replace' and 'overlay'.",
        ),
        (
            "error",
            "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.",
        ),
        (
            None,
            "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.",
        ),
    ],
)
def test_if_sheet_exists_raises(ext, if_sheet_exists, msg):
    # GH 40230
    df = DataFrame({"fruit": ["pear"]})
    with tm.ensure_clean(ext) as f:
        with pytest.raises(ValueError, match=re.escape(msg)):
            df.to_excel(f, sheet_name="foo", engine="openpyxl")
            with ExcelWriter(
                f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists
            ) as writer:
                df.to_excel(writer, sheet_name="foo")


def test_to_excel_with_openpyxl_engine(ext):
    # GH 29854
    with tm.ensure_clean(ext) as filename:
        df1 = DataFrame({"A": np.linspace(1, 10, 10)})
        df2 = DataFrame({"B": np.linspace(1, 20, 10)})
        df = pd.concat([df1, df2], axis=1)
        styled = df.style.map(
            lambda val: f"color: {'red' if val < 0 else 'black'}"
        ).highlight_max()

        styled.to_excel(filename, engine="openpyxl")


@pytest.mark.parametrize("read_only", [True, False])
def test_read_workbook(datapath, ext, read_only):
    # GH 39528
    filename = datapath("io", "data", "excel", "test1" + ext)
    with contextlib.closing(
        openpyxl.load_workbook(filename, read_only=read_only)
    ) as wb:
        result = pd.read_excel(wb, engine="openpyxl")
    expected = pd.read_excel(filename)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "header, expected_data",
    [
        (
            0,
            {
                "Title": [np.nan, "A", 1, 2, 3],
                "Unnamed: 1": [np.nan, "B", 4, 5, 6],
                "Unnamed: 2": [np.nan, "C", 7, 8, 9],
            },
        ),
        (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}),
    ],
)
@pytest.mark.parametrize(
    "filename", ["dimension_missing", "dimension_small", "dimension_large"]
)
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_bad_dimension(
    datapath, ext, header, expected_data, filename, read_only
):
    # GH 38956, 39001 - no/incorrect dimension information
    path = datapath("io", "data", "excel", f"{filename}{ext}")
    if read_only is None:
        result = pd.read_excel(path, header=header)
    else:
        with contextlib.closing(
            openpyxl.load_workbook(path, read_only=read_only)
        ) as wb:
            result = pd.read_excel(wb, engine="openpyxl", header=header)
    expected = DataFrame(expected_data)
    tm.assert_frame_equal(result, expected)


def test_append_mode_file(ext):
    # GH 39576
    df = DataFrame()

    with tm.ensure_clean(ext) as f:
        df.to_excel(f, engine="openpyxl")

        with ExcelWriter(
            f, mode="a", engine="openpyxl", if_sheet_exists="new"
        ) as writer:
            df.to_excel(writer)

        # make sure that zip files are not concatenated by making sure that
        # "docProps/app.xml" only occurs twice in the file
        data = Path(f).read_bytes()
        first = data.find(b"docProps/app.xml")
        second = data.find(b"docProps/app.xml", first + 1)
        third = data.find(b"docProps/app.xml", second + 1)
        assert second != -1 and third == -1


# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_empty_trailing_rows(datapath, ext, read_only):
    # GH 39181
    path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
    if read_only is None:
        result = pd.read_excel(path)
    else:
        with contextlib.closing(
            openpyxl.load_workbook(path, read_only=read_only)
        ) as wb:
            result = pd.read_excel(wb, engine="openpyxl")
    expected = DataFrame(
        {
            "Title": [np.nan, "A", 1, 2, 3],
            "Unnamed: 1": [np.nan, "B", 4, 5, 6],
            "Unnamed: 2": [np.nan, "C", 7, 8, 9],
        }
    )
    tm.assert_frame_equal(result, expected)


# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_empty_with_blank_row(datapath, ext, read_only):
    # GH 39547 - empty excel file with a row that has no data
    path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}")
    if read_only is None:
        result = pd.read_excel(path)
    else:
        with contextlib.closing(
            openpyxl.load_workbook(path, read_only=read_only)
        ) as wb:
            result = pd.read_excel(wb, engine="openpyxl")
    expected = DataFrame()
    tm.assert_frame_equal(result, expected)


def test_book_and_sheets_consistent(ext):
    # GH#45687 - Ensure sheets is updated if user modifies book
    with tm.ensure_clean(ext) as f:
        with ExcelWriter(f, engine="openpyxl") as writer:
            assert writer.sheets == {}
            sheet = writer.book.create_sheet("test_name", 0)
            assert writer.sheets == {"test_name": sheet}


def test_ints_spelled_with_decimals(datapath, ext):
    # GH 46988 - openpyxl returns this sheet with floats
    path = datapath("io", "data", "excel", f"ints_spelled_with_decimals{ext}")
    result = pd.read_excel(path)
    expected = DataFrame(range(2, 12), columns=[1])
    tm.assert_frame_equal(result, expected)


def test_read_multiindex_header_no_index_names(datapath, ext):
    # GH#47487
    path = datapath("io", "data", "excel", f"multiindex_no_index_names{ext}")
    result = pd.read_excel(path, index_col=[0, 1, 2], header=[0, 1, 2])
    expected = DataFrame(
        [[np.nan, "x", "x", "x"], ["x", np.nan, np.nan, np.nan]],
        columns=pd.MultiIndex.from_tuples(
            [("X", "Y", "A1"), ("X", "Y", "A2"), ("XX", "YY", "B1"), ("XX", "YY", "B2")]
        ),
        index=pd.MultiIndex.from_tuples([("A", "AA", "AAA"), ("A", "BB", "BBB")]),
    )
    tm.assert_frame_equal(result, expected)
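The append-mode tests above revolve around ExcelWriter(mode="a") with if_sheet_exists; a short sketch of the overlay workflow they verify, with the output file name assumed for illustration:

import pandas as pd
from pandas.io.excel import ExcelWriter

df1 = pd.DataFrame({"fruit": ["apple", "banana"]})
df2 = pd.DataFrame({"fruit": ["pear"]})

df1.to_excel("fruit.xlsx", sheet_name="foo", index=False)  # initial write
with ExcelWriter(
    "fruit.xlsx", engine="openpyxl", mode="a", if_sheet_exists="overlay"
) as writer:
    # only the cells df2 touches are overwritten; the rest of "foo" is kept
    df2.to_excel(writer, sheet_name="foo", index=False)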
lib/python3.11/site-packages/pandas/tests/io/excel/test_readers.py (new file, 1735 lines; diff suppressed because it is too large)
lib/python3.11/site-packages/pandas/tests/io/excel/test_style.py (new file, 298 lines)
@@ -0,0 +1,298 @@
import contextlib
import time

import numpy as np
import pytest

from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    read_excel,
)
import pandas._testing as tm

from pandas.io.excel import ExcelWriter
from pandas.io.formats.excel import ExcelFormatter

pytest.importorskip("jinja2")
# jinja2 is currently required for Styler.__init__(). Technically Styler.to_excel
# could compute styles and render to excel without jinja2, since there is no
# 'template' file, but this needs the import error to be delayed until render time.

if is_platform_windows():
    pytestmark = pytest.mark.single_cpu


def assert_equal_cell_styles(cell1, cell2):
    # TODO: should find a better way to check equality
    assert cell1.alignment.__dict__ == cell2.alignment.__dict__
    assert cell1.border.__dict__ == cell2.border.__dict__
    assert cell1.fill.__dict__ == cell2.fill.__dict__
    assert cell1.font.__dict__ == cell2.font.__dict__
    assert cell1.number_format == cell2.number_format
    assert cell1.protection.__dict__ == cell2.protection.__dict__


@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
def test_styler_to_excel_unstyled(engine):
    # compare DataFrame.to_excel and Styler.to_excel when no styles applied
    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            df.to_excel(writer, sheet_name="dataframe")
            df.style.to_excel(writer, sheet_name="unstyled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            for col1, col2 in zip(wb["dataframe"].columns, wb["unstyled"].columns):
                assert len(col1) == len(col2)
                for cell1, cell2 in zip(col1, col2):
                    assert cell1.value == cell2.value
                    assert_equal_cell_styles(cell1, cell2)


shared_style_params = [
    (
        "background-color: #111222",
        ["fill", "fgColor", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    (
        "color: #111222",
        ["font", "color", "value"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("font-family: Arial;", ["font", "name"], "arial"),
    ("font-weight: bold;", ["font", "b"], True),
    ("font-style: italic;", ["font", "i"], True),
    ("text-decoration: underline;", ["font", "u"], "single"),
    ("number-format: $??,???.00;", ["number_format"], "$??,???.00"),
    ("text-align: left;", ["alignment", "horizontal"], "left"),
    (
        "vertical-align: bottom;",
        ["alignment", "vertical"],
        {"xlsxwriter": None, "openpyxl": "bottom"},  # xlsxwriter Fails
    ),
    ("vertical-align: middle;", ["alignment", "vertical"], "center"),
    # Border widths
    ("border-left: 2pt solid red", ["border", "left", "style"], "medium"),
    ("border-left: 1pt dotted red", ["border", "left", "style"], "dotted"),
    ("border-left: 2pt dotted red", ["border", "left", "style"], "mediumDashDotDot"),
    ("border-left: 1pt dashed red", ["border", "left", "style"], "dashed"),
    ("border-left: 2pt dashed red", ["border", "left", "style"], "mediumDashed"),
    ("border-left: 1pt solid red", ["border", "left", "style"], "thin"),
    ("border-left: 3pt solid red", ["border", "left", "style"], "thick"),
    # Border expansion
    (
        "border-left: 2pt solid #111222",
        ["border", "left", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "top", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "top", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "right", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "right", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "bottom", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "bottom", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "left", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "left", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    # Border styles
    (
        "border-left-style: hair; border-left-color: black",
        ["border", "left", "style"],
        "hair",
    ),
]


@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("css, attrs, expected", shared_style_params)
def test_styler_to_excel_basic(engine, css, attrs, expected):
    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
    styler = df.style.map(lambda x: css)

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            df.to_excel(writer, sheet_name="dataframe")
            styler.to_excel(writer, sheet_name="styled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            # test unstyled data cell does not have expected styles
            # test styled cell has expected styles
            u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2)
            for attr in attrs:
                u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr)

            if isinstance(expected, dict):
                assert u_cell is None or u_cell != expected[engine]
                assert s_cell == expected[engine]
            else:
                assert u_cell is None or u_cell != expected
                assert s_cell == expected


@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("css, attrs, expected", shared_style_params)
def test_styler_to_excel_basic_indexes(engine, css, attrs, expected):
    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))

    styler = df.style
    styler.map_index(lambda x: css, axis=0)
    styler.map_index(lambda x: css, axis=1)

    null_styler = df.style
    null_styler.map(lambda x: "null: css;")
    null_styler.map_index(lambda x: "null: css;", axis=0)
    null_styler.map_index(lambda x: "null: css;", axis=1)

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            null_styler.to_excel(writer, sheet_name="null_styled")
            styler.to_excel(writer, sheet_name="styled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            # test null styled index cells do not have expected styles
            # test styled cell has expected styles
            ui_cell, si_cell = wb["null_styled"].cell(2, 1), wb["styled"].cell(2, 1)
            uc_cell, sc_cell = wb["null_styled"].cell(1, 2), wb["styled"].cell(1, 2)
            for attr in attrs:
                ui_cell, si_cell = getattr(ui_cell, attr, None), getattr(si_cell, attr)
                uc_cell, sc_cell = getattr(uc_cell, attr, None), getattr(sc_cell, attr)

            if isinstance(expected, dict):
                assert ui_cell is None or ui_cell != expected[engine]
                assert si_cell == expected[engine]
                assert uc_cell is None or uc_cell != expected[engine]
                assert sc_cell == expected[engine]
            else:
                assert ui_cell is None or ui_cell != expected
                assert si_cell == expected
                assert uc_cell is None or uc_cell != expected
                assert sc_cell == expected


# From https://openpyxl.readthedocs.io/en/stable/api/openpyxl.styles.borders.html
# Note: Leaving behavior of "width"-type styles undefined; user should use border-width
# instead
excel_border_styles = [
    # "thin",
    "dashed",
    "mediumDashDot",
    "dashDotDot",
    "hair",
    "dotted",
    "mediumDashDotDot",
    # "medium",
    "double",
    "dashDot",
    "slantDashDot",
    # "thick",
    "mediumDashed",
]


@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("border_style", excel_border_styles)
def test_styler_to_excel_border_style(engine, border_style):
    css = f"border-left: {border_style} black thin"
    attrs = ["border", "left", "style"]
    expected = border_style

    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
    styler = df.style.map(lambda x: css)

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            df.to_excel(writer, sheet_name="dataframe")
            styler.to_excel(writer, sheet_name="styled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            # test unstyled data cell does not have expected styles
            # test styled cell has expected styles
            u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2)
            for attr in attrs:
                u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr)

            if isinstance(expected, dict):
                assert u_cell is None or u_cell != expected[engine]
                assert s_cell == expected[engine]
            else:
                assert u_cell is None or u_cell != expected
                assert s_cell == expected


def test_styler_custom_converter():
    openpyxl = pytest.importorskip("openpyxl")

    def custom_converter(css):
        return {"font": {"color": {"rgb": "111222"}}}

    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
    styler = df.style.map(lambda x: "color: #888999")
    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine="openpyxl") as writer:
            ExcelFormatter(styler, style_converter=custom_converter).write(
                writer, sheet_name="custom"
            )

        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            assert wb["custom"].cell(2, 2).font.color.value == "00111222"


@pytest.mark.single_cpu
@td.skip_if_not_us_locale
def test_styler_to_s3(s3_public_bucket, s3so):
    # GH#46381

    mock_bucket_name, target_file = s3_public_bucket.name, "test.xlsx"
    df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
    styler = df.style.set_sticky(axis="index")
    styler.to_excel(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
    timeout = 5
    while True:
        if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
            break
        time.sleep(0.1)
        timeout -= 0.1
        assert timeout > 0, "Timed out waiting for file to appear on moto"
    result = read_excel(
        f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so
    )
    tm.assert_frame_equal(result, df)
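Each row of shared_style_params above pairs a CSS declaration with the openpyxl attribute path it should land on after Styler.to_excel; a minimal sketch of one such mapping outside the parametrized harness (output file name assumed):

import pandas as pd

df = pd.DataFrame([[1.0]])
styler = df.style.map(lambda v: "font-weight: bold;")  # CSS applied per data cell
styler.to_excel("styled.xlsx", engine="openpyxl")
# Reloading styled.xlsx with openpyxl, the written data cell should report
# font.b == True, matching the ("font-weight: bold;", ["font", "b"], True) row.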
lib/python3.11/site-packages/pandas/tests/io/excel/test_writers.py (new file, 1514 lines; diff suppressed because it is too large)
@@ -0,0 +1,76 @@
import io

import numpy as np
import pytest

from pandas.compat import is_platform_windows

import pandas as pd
import pandas._testing as tm

from pandas.io.excel import ExcelFile
from pandas.io.excel._base import inspect_excel_format

xlrd = pytest.importorskip("xlrd")

if is_platform_windows():
    pytestmark = pytest.mark.single_cpu


@pytest.fixture(params=[".xls"])
def read_ext_xlrd(request):
    """
    Valid extensions for reading Excel files with xlrd.

    Similar to read_ext, but excludes .ods, .xlsb, and for xlrd>2 .xlsx, .xlsm
    """
    return request.param


def test_read_xlrd_book(read_ext_xlrd, datapath):
    engine = "xlrd"
    sheet_name = "Sheet1"
    pth = datapath("io", "data", "excel", "test1.xls")
    with xlrd.open_workbook(pth) as book:
        with ExcelFile(book, engine=engine) as xl:
            result = pd.read_excel(xl, sheet_name=sheet_name, index_col=0)

        expected = pd.read_excel(
            book, sheet_name=sheet_name, engine=engine, index_col=0
        )
        tm.assert_frame_equal(result, expected)


def test_read_xlsx_fails(datapath):
    # GH 29375
    from xlrd.biffh import XLRDError

    path = datapath("io", "data", "excel", "test1.xlsx")
    with pytest.raises(XLRDError, match="Excel xlsx file; not supported"):
        pd.read_excel(path, engine="xlrd")


def test_nan_in_xls(datapath):
    # GH 54564
    path = datapath("io", "data", "excel", "test6.xls")

    expected = pd.DataFrame({0: np.r_[0, 2].astype("int64"), 1: np.r_[1, np.nan]})

    result = pd.read_excel(path, header=None)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "file_header",
    [
        b"\x09\x00\x04\x00\x07\x00\x10\x00",
        b"\x09\x02\x06\x00\x00\x00\x10\x00",
        b"\x09\x04\x06\x00\x00\x00\x10\x00",
        b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1",
    ],
)
def test_read_old_xls_files(file_header):
    # GH 41226
    f = io.BytesIO(file_header)
    assert inspect_excel_format(f) == "xls"
@@ -0,0 +1,86 @@
import contextlib

import pytest

from pandas.compat import is_platform_windows

from pandas import DataFrame
import pandas._testing as tm

from pandas.io.excel import ExcelWriter

xlsxwriter = pytest.importorskip("xlsxwriter")

if is_platform_windows():
    pytestmark = pytest.mark.single_cpu


@pytest.fixture
def ext():
    return ".xlsx"


def test_column_format(ext):
    # Test that column formats are applied to cells. Test for issue #9167.
    # Applicable to xlsxwriter only.
    openpyxl = pytest.importorskip("openpyxl")

    with tm.ensure_clean(ext) as path:
        frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})

        with ExcelWriter(path) as writer:
            frame.to_excel(writer)

            # Add a number format to col B and ensure it is applied to cells.
            num_format = "#,##0"
            write_workbook = writer.book
            write_worksheet = write_workbook.worksheets()[0]
            col_format = write_workbook.add_format({"num_format": num_format})
            write_worksheet.set_column("B:B", None, col_format)

        with contextlib.closing(openpyxl.load_workbook(path)) as read_workbook:
            try:
                read_worksheet = read_workbook["Sheet1"]
            except TypeError:
                # compat
                read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1")

        # Get the number format from the cell.
        try:
            cell = read_worksheet["B2"]
        except TypeError:
            # compat
            cell = read_worksheet.cell("B2")

        try:
            read_num_format = cell.number_format
        except AttributeError:
            read_num_format = cell.style.number_format._format_code

        assert read_num_format == num_format


def test_write_append_mode_raises(ext):
    msg = "Append mode is not supported with xlsxwriter!"

    with tm.ensure_clean(ext) as f:
        with pytest.raises(ValueError, match=msg):
            ExcelWriter(f, engine="xlsxwriter", mode="a")


@pytest.mark.parametrize("nan_inf_to_errors", [True, False])
def test_engine_kwargs(ext, nan_inf_to_errors):
    # GH 42286
    engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}}
    with tm.ensure_clean(ext) as f:
        with ExcelWriter(f, engine="xlsxwriter", engine_kwargs=engine_kwargs) as writer:
            assert writer.book.nan_inf_to_errors == nan_inf_to_errors


def test_book_and_sheets_consistent(ext):
    # GH#45687 - Ensure sheets is updated if user modifies book
    with tm.ensure_clean(ext) as f:
        with ExcelWriter(f, engine="xlsxwriter") as writer:
            assert writer.sheets == {}
            sheet = writer.book.add_worksheet("test_name")
            assert writer.sheets == {"test_name": sheet}
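For context, the engine_kwargs dict in test_engine_kwargs above is forwarded to xlsxwriter's Workbook constructor; a hedged sketch of the roughly equivalent direct call (output file name assumed, and the exact forwarding of engine_kwargs has varied across pandas versions):

import xlsxwriter

# roughly what ExcelWriter(f, engine="xlsxwriter",
# engine_kwargs={"options": {"nan_inf_to_errors": True}}) sets up
book = xlsxwriter.Workbook("out.xlsx", {"nan_inf_to_errors": True})
book.close()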
@ -0,0 +1,359 @@
|
||||
import io
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
NA,
|
||||
DataFrame,
|
||||
read_csv,
|
||||
)
|
||||
|
||||
pytest.importorskip("jinja2")
|
||||
|
||||
|
||||
def bar_grad(a=None, b=None, c=None, d=None):
|
||||
"""Used in multiple tests to simplify formatting of expected result"""
|
||||
ret = [("width", "10em")]
|
||||
if all(x is None for x in [a, b, c, d]):
|
||||
return ret
|
||||
return ret + [
|
||||
(
|
||||
"background",
|
||||
f"linear-gradient(90deg,{','.join([x for x in [a, b, c, d] if x])})",
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def no_bar():
|
||||
return bar_grad()
|
||||
|
||||
|
||||
def bar_to(x, color="#d65f5f"):
|
||||
return bar_grad(f" {color} {x:.1f}%", f" transparent {x:.1f}%")
|
||||
|
||||
|
||||
def bar_from_to(x, y, color="#d65f5f"):
|
||||
return bar_grad(
|
||||
f" transparent {x:.1f}%",
|
||||
f" {color} {x:.1f}%",
|
||||
f" {color} {y:.1f}%",
|
||||
f" transparent {y:.1f}%",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_pos():
|
||||
return DataFrame([[1], [2], [3]])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_neg():
|
||||
return DataFrame([[-1], [-2], [-3]])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_mix():
|
||||
return DataFrame([[-3], [1], [2]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"align, exp",
|
||||
[
|
||||
("left", [no_bar(), bar_to(50), bar_to(100)]),
|
||||
("right", [bar_to(100), bar_from_to(50, 100), no_bar()]),
|
||||
("mid", [bar_to(33.33), bar_to(66.66), bar_to(100)]),
|
||||
("zero", [bar_from_to(50, 66.7), bar_from_to(50, 83.3), bar_from_to(50, 100)]),
|
||||
("mean", [bar_to(50), no_bar(), bar_from_to(50, 100)]),
|
||||
(2.0, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
|
||||
(np.median, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
|
||||
],
|
||||
)
|
||||
def test_align_positive_cases(df_pos, align, exp):
|
||||
# test different align cases for all positive values
|
||||
result = df_pos.style.bar(align=align)._compute().ctx
|
||||
expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"align, exp",
|
||||
[
|
||||
("left", [bar_to(100), bar_to(50), no_bar()]),
|
||||
("right", [no_bar(), bar_from_to(50, 100), bar_to(100)]),
|
||||
("mid", [bar_from_to(66.66, 100), bar_from_to(33.33, 100), bar_to(100)]),
|
||||
("zero", [bar_from_to(33.33, 50), bar_from_to(16.66, 50), bar_to(50)]),
|
||||
("mean", [bar_from_to(50, 100), no_bar(), bar_to(50)]),
|
||||
(-2.0, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
|
||||
(np.median, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
|
||||
],
|
||||
)
|
||||
def test_align_negative_cases(df_neg, align, exp):
|
||||
# test different align cases for all negative values
|
||||
result = df_neg.style.bar(align=align)._compute().ctx
|
||||
expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"align, exp",
|
||||
[
|
||||
("left", [no_bar(), bar_to(80), bar_to(100)]),
|
||||
("right", [bar_to(100), bar_from_to(80, 100), no_bar()]),
|
||||
("mid", [bar_to(60), bar_from_to(60, 80), bar_from_to(60, 100)]),
|
||||
("zero", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
|
||||
("mean", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
|
||||
(-0.0, [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
|
||||
(np.nanmedian, [bar_to(50), no_bar(), bar_from_to(50, 62.5)]),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("nans", [True, False])
|
||||
def test_align_mixed_cases(df_mix, align, exp, nans):
|
||||
# test different align cases for mixed positive and negative values
|
||||
# also test no impact of NaNs and no_bar
|
||||
expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
|
||||
if nans:
|
||||
df_mix.loc[3, :] = np.nan
|
||||
expected.update({(3, 0): no_bar()})
|
||||
result = df_mix.style.bar(align=align)._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"align, exp",
|
||||
[
|
||||
(
|
||||
"left",
|
||||
{
|
||||
"index": [[no_bar(), no_bar()], [bar_to(100), bar_to(100)]],
|
||||
"columns": [[no_bar(), bar_to(100)], [no_bar(), bar_to(100)]],
|
||||
"none": [[no_bar(), bar_to(33.33)], [bar_to(66.66), bar_to(100)]],
|
||||
},
|
||||
),
|
||||
(
|
||||
"mid",
|
||||
{
|
||||
"index": [[bar_to(33.33), bar_to(50)], [bar_to(100), bar_to(100)]],
|
||||
"columns": [[bar_to(50), bar_to(100)], [bar_to(75), bar_to(100)]],
|
||||
"none": [[bar_to(25), bar_to(50)], [bar_to(75), bar_to(100)]],
|
||||
},
|
||||
),
|
||||
(
|
||||
"zero",
|
||||
{
|
||||
"index": [
|
||||
[bar_from_to(50, 66.66), bar_from_to(50, 75)],
|
||||
[bar_from_to(50, 100), bar_from_to(50, 100)],
|
||||
],
|
||||
"columns": [
|
||||
[bar_from_to(50, 75), bar_from_to(50, 100)],
|
||||
[bar_from_to(50, 87.5), bar_from_to(50, 100)],
|
||||
],
|
||||
"none": [
|
||||
[bar_from_to(50, 62.5), bar_from_to(50, 75)],
|
||||
[bar_from_to(50, 87.5), bar_from_to(50, 100)],
|
||||
],
|
||||
},
|
||||
),
|
||||
(
|
||||
2,
|
||||
{
|
||||
"index": [
|
||||
[bar_to(50), no_bar()],
|
||||
[bar_from_to(50, 100), bar_from_to(50, 100)],
|
||||
],
|
||||
"columns": [
|
||||
[bar_to(50), no_bar()],
|
||||
[bar_from_to(50, 75), bar_from_to(50, 100)],
|
||||
],
|
||||
"none": [
|
||||
[bar_from_to(25, 50), no_bar()],
|
||||
[bar_from_to(50, 75), bar_from_to(50, 100)],
|
||||
],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("axis", ["index", "columns", "none"])
|
||||
def test_align_axis(align, exp, axis):
|
||||
# test all axis combinations with positive values and different aligns
|
||||
data = DataFrame([[1, 2], [3, 4]])
|
||||
result = (
|
||||
data.style.bar(align=align, axis=None if axis == "none" else axis)
|
||||
._compute()
|
||||
.ctx
|
||||
)
|
||||
expected = {
|
||||
(0, 0): exp[axis][0][0],
|
||||
(0, 1): exp[axis][0][1],
|
||||
(1, 0): exp[axis][1][0],
|
||||
(1, 1): exp[axis][1][1],
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, vmin, vmax",
|
||||
[
|
||||
("positive", 1.5, 2.5),
|
||||
("negative", -2.5, -1.5),
|
||||
("mixed", -2.5, 1.5),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately
|
||||
@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
|
||||
def test_vmin_vmax_clipping(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
|
||||
# test that clipping occurs if any vmin > data_values or vmax < data_values
|
||||
if align == "mid": # mid acts as left or right in each case
|
||||
if values == "positive":
|
||||
align = "left"
|
||||
elif values == "negative":
|
||||
align = "right"
|
||||
df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values]
|
||||
vmin = None if nullify == "vmin" else vmin
|
||||
vmax = None if nullify == "vmax" else vmax
|
||||
|
||||
clip_df = df.where(df <= (vmax if vmax else 999), other=vmax)
|
||||
clip_df = clip_df.where(clip_df >= (vmin if vmin else -999), other=vmin)
|
||||
|
||||
result = (
|
||||
df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
|
||||
._compute()
|
||||
.ctx
|
||||
)
|
||||
expected = clip_df.style.bar(align=align, color=["red", "green"])._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, vmin, vmax",
|
||||
[
|
||||
("positive", 0.5, 4.5),
|
||||
("negative", -4.5, -0.5),
|
||||
("mixed", -4.5, 4.5),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately
|
||||
@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
|
||||
def test_vmin_vmax_widening(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
|
||||
# test that widening occurs if any vmax > data_values or vmin < data_values
|
||||
if align == "mid": # mid acts as left or right in each case
|
||||
if values == "positive":
|
||||
align = "left"
|
||||
elif values == "negative":
|
||||
align = "right"
|
||||
df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values]
|
||||
vmin = None if nullify == "vmin" else vmin
|
||||
vmax = None if nullify == "vmax" else vmax
|
||||
|
||||
expand_df = df.copy()
|
||||
expand_df.loc[3, :], expand_df.loc[4, :] = vmin, vmax
|
||||
|
||||
result = (
|
||||
df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
|
||||
._compute()
|
||||
.ctx
|
||||
)
|
||||
expected = expand_df.style.bar(align=align, color=["red", "green"])._compute().ctx
|
||||
assert result.items() <= expected.items()
|
||||
|
||||
|
||||
def test_numerics():
|
||||
# test data is pre-selected for numeric values
|
||||
data = DataFrame([[1, "a"], [2, "b"]])
|
||||
result = data.style.bar()._compute().ctx
|
||||
assert (0, 1) not in result
|
||||
assert (1, 1) not in result
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"align, exp",
|
||||
[
|
||||
("left", [no_bar(), bar_to(100, "green")]),
|
||||
("right", [bar_to(100, "red"), no_bar()]),
|
||||
("mid", [bar_to(25, "red"), bar_from_to(25, 100, "green")]),
|
||||
("zero", [bar_from_to(33.33, 50, "red"), bar_from_to(50, 100, "green")]),
|
||||
],
|
||||
)
|
||||
def test_colors_mixed(align, exp):
|
||||
data = DataFrame([[-1], [3]])
|
||||
result = data.style.bar(align=align, color=["red", "green"])._compute().ctx
|
||||
assert result == {(0, 0): exp[0], (1, 0): exp[1]}
|
||||
|
||||
|
||||
def test_bar_align_height():
|
||||
# test when keyword height is used 'no-repeat center' and 'background-size' present
|
||||
data = DataFrame([[1], [2]])
|
||||
result = data.style.bar(align="left", height=50)._compute().ctx
|
||||
bg_s = "linear-gradient(90deg, #d65f5f 100.0%, transparent 100.0%) no-repeat center"
|
||||
expected = {
|
||||
(0, 0): [("width", "10em")],
|
||||
(1, 0): [
|
||||
("width", "10em"),
|
||||
("background", bg_s),
|
||||
("background-size", "100% 50.0%"),
|
||||
],
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_bar_value_error_raises():
|
||||
df = DataFrame({"A": [-100, -60, -30, -20]})
|
||||
|
||||
msg = "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]).to_html()
|
||||
|
||||
msg = r"`width` must be a value in \[0, 100\]"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(width=200).to_html()
|
||||
|
||||
msg = r"`height` must be a value in \[0, 100\]"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(height=200).to_html()
|
||||
|
||||
|
||||
def test_bar_color_and_cmap_error_raises():
|
||||
df = DataFrame({"A": [1, 2, 3, 4]})
|
||||
msg = "`color` and `cmap` cannot both be given"
|
||||
# Test that providing both color and cmap raises a ValueError
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(color="#d65f5f", cmap="viridis").to_html()
|
||||
|
||||
|
||||
def test_bar_invalid_color_type_error_raises():
|
||||
df = DataFrame({"A": [1, 2, 3, 4]})
|
||||
msg = (
|
||||
r"`color` must be string or list or tuple of 2 strings,"
|
||||
r"\(eg: color=\['#d65f5f', '#5fba7d'\]\)"
|
||||
)
|
||||
# Test that providing an invalid color type raises a ValueError
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(color=123).to_html()
|
||||
|
||||
# Test that providing a color list with more than two elements raises a ValueError
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(color=["#d65f5f", "#5fba7d", "#abcdef"]).to_html()
|
||||
|
||||
|
||||
def test_styler_bar_with_NA_values():
|
||||
df1 = DataFrame({"A": [1, 2, NA, 4]})
|
||||
df2 = DataFrame([[NA, NA], [NA, NA]])
|
||||
expected_substring = "style type="
|
||||
html_output1 = df1.style.bar(subset="A").to_html()
|
||||
html_output2 = df2.style.bar(align="left", axis=None).to_html()
|
||||
assert expected_substring in html_output1
|
||||
assert expected_substring in html_output2
|
||||
|
||||
|
||||
def test_style_bar_with_pyarrow_NA_values():
|
||||
pytest.importorskip("pyarrow")
|
||||
data = """name,age,test1,test2,teacher
|
||||
Adam,15,95.0,80,Ashby
|
||||
Bob,16,81.0,82,Ashby
|
||||
Dave,16,89.0,84,Jones
|
||||
Fred,15,,88,Jones"""
|
||||
df = read_csv(io.StringIO(data), dtype_backend="pyarrow")
|
||||
expected_substring = "style type="
|
||||
html_output = df.style.bar(subset="test1").to_html()
|
||||
assert expected_substring in html_output
|
||||
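A minimal sketch of the Styler.bar options these tests exercise, using made-up data (not part of the committed files):

import pandas as pd

df = pd.DataFrame({"A": [-2, -1, 0, 1, 2]})  # illustrative values only
styled = df.style.bar(
    align="mid",                    # bars grow left/right from the midpoint
    vmin=-3,                        # widen the mapped range below the data
    vmax=3,                         # and above it
    color=["#d65f5f", "#5fba7d"],   # negative / positive bar colors
)
html = styled.to_html()             # per-cell CSS linear-gradients end up here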
@ -0,0 +1,44 @@
|
||||
import pytest
|
||||
|
||||
jinja2 = pytest.importorskip("jinja2")
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
)
|
||||
|
||||
from pandas.io.formats.style import Styler
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df():
|
||||
return DataFrame(
|
||||
data=[[0, -0.609], [1, -1.228]],
|
||||
columns=["A", "B"],
|
||||
index=["x", "y"],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler(df):
|
||||
return Styler(df, uuid_len=0)
|
||||
|
||||
|
||||
def test_concat_bad_columns(styler):
|
||||
msg = "`other.data` must have same columns as `Styler.data"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
styler.concat(DataFrame([[1, 2]]).style)
|
||||
|
||||
|
||||
def test_concat_bad_type(styler):
|
||||
msg = "`other` must be of type `Styler`"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
styler.concat(DataFrame([[1, 2]]))
|
||||
|
||||
|
||||
def test_concat_bad_index_levels(styler, df):
|
||||
df = df.copy()
|
||||
df.index = MultiIndex.from_tuples([(0, 0), (1, 1)])
|
||||
msg = "number of index levels must be same in `other`"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
styler.concat(df.style)
|
||||
@ -0,0 +1,562 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
NA,
|
||||
DataFrame,
|
||||
IndexSlice,
|
||||
MultiIndex,
|
||||
NaT,
|
||||
Timestamp,
|
||||
option_context,
|
||||
)
|
||||
|
||||
pytest.importorskip("jinja2")
|
||||
from pandas.io.formats.style import Styler
|
||||
from pandas.io.formats.style_render import _str_escape
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df():
|
||||
return DataFrame(
|
||||
data=[[0, -0.609], [1, -1.228]],
|
||||
columns=["A", "B"],
|
||||
index=["x", "y"],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler(df):
|
||||
return Styler(df, uuid_len=0)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_multi():
|
||||
return DataFrame(
|
||||
data=np.arange(16).reshape(4, 4),
|
||||
columns=MultiIndex.from_product([["A", "B"], ["a", "b"]]),
|
||||
index=MultiIndex.from_product([["X", "Y"], ["x", "y"]]),
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler_multi(df_multi):
|
||||
return Styler(df_multi, uuid_len=0)
|
||||
|
||||
|
||||
def test_display_format(styler):
|
||||
ctx = styler.format("{:0.1f}")._translate(True, True)
|
||||
assert all(["display_value" in c for c in row] for row in ctx["body"])
|
||||
assert all([len(c["display_value"]) <= 3 for c in row[1:]] for row in ctx["body"])
|
||||
assert len(ctx["body"][0][1]["display_value"].lstrip("-")) <= 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index", [True, False])
|
||||
@pytest.mark.parametrize("columns", [True, False])
|
||||
def test_display_format_index(styler, index, columns):
|
||||
exp_index = ["x", "y"]
|
||||
if index:
|
||||
styler.format_index(lambda v: v.upper(), axis=0) # test callable
|
||||
exp_index = ["X", "Y"]
|
||||
|
||||
exp_columns = ["A", "B"]
|
||||
if columns:
|
||||
styler.format_index("*{}*", axis=1) # test string
|
||||
exp_columns = ["*A*", "*B*"]
|
||||
|
||||
ctx = styler._translate(True, True)
|
||||
|
||||
for r, row in enumerate(ctx["body"]):
|
||||
assert row[0]["display_value"] == exp_index[r]
|
||||
|
||||
for c, col in enumerate(ctx["head"][1:]):
|
||||
assert col["display_value"] == exp_columns[c]
|
||||
|
||||
|
||||
def test_format_dict(styler):
|
||||
ctx = styler.format({"A": "{:0.1f}", "B": "{0:.2%}"})._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "0.0"
|
||||
assert ctx["body"][0][2]["display_value"] == "-60.90%"
|
||||
|
||||
|
||||
def test_format_index_dict(styler):
|
||||
ctx = styler.format_index({0: lambda v: v.upper()})._translate(True, True)
|
||||
for i, val in enumerate(["X", "Y"]):
|
||||
assert ctx["body"][i][0]["display_value"] == val
|
||||
|
||||
|
||||
def test_format_string(styler):
|
||||
ctx = styler.format("{:.2f}")._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "0.00"
|
||||
assert ctx["body"][0][2]["display_value"] == "-0.61"
|
||||
assert ctx["body"][1][1]["display_value"] == "1.00"
|
||||
assert ctx["body"][1][2]["display_value"] == "-1.23"
|
||||
|
||||
|
||||
def test_format_callable(styler):
|
||||
ctx = styler.format(lambda v: "neg" if v < 0 else "pos")._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "pos"
|
||||
assert ctx["body"][0][2]["display_value"] == "neg"
|
||||
assert ctx["body"][1][1]["display_value"] == "pos"
|
||||
assert ctx["body"][1][2]["display_value"] == "neg"
|
||||
|
||||
|
||||
def test_format_with_na_rep():
|
||||
# GH 21527 28358
|
||||
df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
|
||||
|
||||
ctx = df.style.format(None, na_rep="-")._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "-"
|
||||
assert ctx["body"][0][2]["display_value"] == "-"
|
||||
|
||||
ctx = df.style.format("{:.2%}", na_rep="-")._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "-"
|
||||
assert ctx["body"][0][2]["display_value"] == "-"
|
||||
assert ctx["body"][1][1]["display_value"] == "110.00%"
|
||||
assert ctx["body"][1][2]["display_value"] == "120.00%"
|
||||
|
||||
ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate(True, True)
|
||||
assert ctx["body"][0][2]["display_value"] == "-"
|
||||
assert ctx["body"][1][2]["display_value"] == "120.00%"
|
||||
|
||||
|
||||
def test_format_index_with_na_rep():
|
||||
df = DataFrame([[1, 2, 3, 4, 5]], columns=["A", None, np.nan, NaT, NA])
|
||||
ctx = df.style.format_index(None, na_rep="--", axis=1)._translate(True, True)
|
||||
assert ctx["head"][0][1]["display_value"] == "A"
|
||||
for i in [2, 3, 4, 5]:
|
||||
assert ctx["head"][0][i]["display_value"] == "--"
|
||||
|
||||
|
||||
def test_format_non_numeric_na():
|
||||
# GH 21527 28358
|
||||
df = DataFrame(
|
||||
{
|
||||
"object": [None, np.nan, "foo"],
|
||||
"datetime": [None, NaT, Timestamp("20120101")],
|
||||
}
|
||||
)
|
||||
ctx = df.style.format(None, na_rep="-")._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "-"
|
||||
assert ctx["body"][0][2]["display_value"] == "-"
|
||||
assert ctx["body"][1][1]["display_value"] == "-"
|
||||
assert ctx["body"][1][2]["display_value"] == "-"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"func, attr, kwargs",
|
||||
[
|
||||
("format", "_display_funcs", {}),
|
||||
("format_index", "_display_funcs_index", {"axis": 0}),
|
||||
("format_index", "_display_funcs_columns", {"axis": 1}),
|
||||
],
|
||||
)
|
||||
def test_format_clear(styler, func, attr, kwargs):
|
||||
assert (0, 0) not in getattr(styler, attr) # using default
|
||||
getattr(styler, func)("{:.2f}", **kwargs)
|
||||
assert (0, 0) in getattr(styler, attr) # formatter is specified
|
||||
getattr(styler, func)(**kwargs)
|
||||
assert (0, 0) not in getattr(styler, attr) # formatter cleared to default
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"escape, exp",
|
||||
[
|
||||
("html", "<>&"%$#_{}~^\\~ ^ \\ "),
|
||||
(
|
||||
"latex",
|
||||
'<>\\&"\\%\\$\\#\\_\\{\\}\\textasciitilde \\textasciicircum '
|
||||
"\\textbackslash \\textasciitilde \\space \\textasciicircum \\space "
|
||||
"\\textbackslash \\space ",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_format_escape_html(escape, exp):
|
||||
chars = '<>&"%$#_{}~^\\~ ^ \\ '
|
||||
df = DataFrame([[chars]])
|
||||
|
||||
s = Styler(df, uuid_len=0).format("&{0}&", escape=None)
|
||||
expected = f'<td id="T__row0_col0" class="data row0 col0" >&{chars}&</td>'
|
||||
assert expected in s.to_html()
|
||||
|
||||
# only the value should be escaped before passing to the formatter
|
||||
s = Styler(df, uuid_len=0).format("&{0}&", escape=escape)
|
||||
expected = f'<td id="T__row0_col0" class="data row0 col0" >&{exp}&</td>'
|
||||
assert expected in s.to_html()
|
||||
|
||||
# also test format_index()
|
||||
styler = Styler(DataFrame(columns=[chars]), uuid_len=0)
|
||||
styler.format_index("&{0}&", escape=None, axis=1)
|
||||
assert styler._translate(True, True)["head"][0][1]["display_value"] == f"&{chars}&"
|
||||
styler.format_index("&{0}&", escape=escape, axis=1)
|
||||
assert styler._translate(True, True)["head"][0][1]["display_value"] == f"&{exp}&"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"chars, expected",
|
||||
[
|
||||
(
|
||||
r"$ \$&%#_{}~^\ $ &%#_{}~^\ $",
|
||||
"".join(
|
||||
[
|
||||
r"$ \$&%#_{}~^\ $ ",
|
||||
r"\&\%\#\_\{\}\textasciitilde \textasciicircum ",
|
||||
r"\textbackslash \space \$",
|
||||
]
|
||||
),
|
||||
),
|
||||
(
|
||||
r"\( &%#_{}~^\ \) &%#_{}~^\ \(",
|
||||
"".join(
|
||||
[
|
||||
r"\( &%#_{}~^\ \) ",
|
||||
r"\&\%\#\_\{\}\textasciitilde \textasciicircum ",
|
||||
r"\textbackslash \space \textbackslash (",
|
||||
]
|
||||
),
|
||||
),
|
||||
(
|
||||
r"$\&%#_{}^\$",
|
||||
r"\$\textbackslash \&\%\#\_\{\}\textasciicircum \textbackslash \$",
|
||||
),
|
||||
(
|
||||
r"$ \frac{1}{2} $ \( \frac{1}{2} \)",
|
||||
"".join(
|
||||
[
|
||||
r"$ \frac{1}{2} $",
|
||||
r" \textbackslash ( \textbackslash frac\{1\}\{2\} \textbackslash )",
|
||||
]
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_format_escape_latex_math(chars, expected):
|
||||
# GH 51903
|
||||
# latex-math escape works for each DataFrame cell separately. If we have
|
||||
# a combination of dollar signs and brackets, the dollar sign would apply.
|
||||
df = DataFrame([[chars]])
|
||||
s = df.style.format("{0}", escape="latex-math")
|
||||
assert s._translate(True, True)["body"][0][1]["display_value"] == expected
|
||||
|
||||
|
||||
def test_format_escape_na_rep():
|
||||
# tests the na_rep is not escaped
|
||||
df = DataFrame([['<>&"', None]])
|
||||
s = Styler(df, uuid_len=0).format("X&{0}>X", escape="html", na_rep="&")
|
||||
ex = '<td id="T__row0_col0" class="data row0 col0" >X&&lt;&gt;&amp;&#34;>X</td>'
|
||||
expected2 = '<td id="T__row0_col1" class="data row0 col1" >&</td>'
|
||||
assert ex in s.to_html()
|
||||
assert expected2 in s.to_html()
|
||||
|
||||
# also test for format_index()
|
||||
df = DataFrame(columns=['<>&"', None])
|
||||
styler = Styler(df, uuid_len=0)
|
||||
styler.format_index("X&{0}>X", escape="html", na_rep="&", axis=1)
|
||||
ctx = styler._translate(True, True)
|
||||
assert ctx["head"][0][1]["display_value"] == "X&<>&">X"
|
||||
assert ctx["head"][0][2]["display_value"] == "&"
|
||||
|
||||
|
||||
def test_format_escape_floats(styler):
|
||||
# test given formatter for number format is not impacted by escape
|
||||
s = styler.format("{:.1f}", escape="html")
|
||||
for expected in [">0.0<", ">1.0<", ">-1.2<", ">-0.6<"]:
|
||||
assert expected in s.to_html()
|
||||
# tests precision of floats is not impacted by escape
|
||||
s = styler.format(precision=1, escape="html")
|
||||
for expected in [">0<", ">1<", ">-1.2<", ">-0.6<"]:
|
||||
assert expected in s.to_html()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("formatter", [5, True, [2.0]])
|
||||
@pytest.mark.parametrize("func", ["format", "format_index"])
|
||||
def test_format_raises(styler, formatter, func):
|
||||
with pytest.raises(TypeError, match="expected str or callable"):
|
||||
getattr(styler, func)(formatter)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"precision, expected",
|
||||
[
|
||||
(1, ["1.0", "2.0", "3.2", "4.6"]),
|
||||
(2, ["1.00", "2.01", "3.21", "4.57"]),
|
||||
(3, ["1.000", "2.009", "3.212", "4.566"]),
|
||||
],
|
||||
)
|
||||
def test_format_with_precision(precision, expected):
|
||||
# Issue #13257
|
||||
df = DataFrame([[1.0, 2.0090, 3.2121, 4.566]], columns=[1.0, 2.0090, 3.2121, 4.566])
|
||||
styler = Styler(df)
|
||||
styler.format(precision=precision)
|
||||
styler.format_index(precision=precision, axis=1)
|
||||
|
||||
ctx = styler._translate(True, True)
|
||||
for col, exp in enumerate(expected):
|
||||
assert ctx["body"][0][col + 1]["display_value"] == exp # format test
|
||||
assert ctx["head"][0][col + 1]["display_value"] == exp # format_index test
|
||||
|
||||
|
||||
@pytest.mark.parametrize("axis", [0, 1])
|
||||
@pytest.mark.parametrize(
|
||||
"level, expected",
|
||||
[
|
||||
(0, ["X", "X", "_", "_"]), # level int
|
||||
("zero", ["X", "X", "_", "_"]), # level name
|
||||
(1, ["_", "_", "X", "X"]), # other level int
|
||||
("one", ["_", "_", "X", "X"]), # other level name
|
||||
([0, 1], ["X", "X", "X", "X"]), # both levels
|
||||
([0, "zero"], ["X", "X", "_", "_"]), # level int and name simultaneous
|
||||
([0, "one"], ["X", "X", "X", "X"]), # both levels as int and name
|
||||
(["one", "zero"], ["X", "X", "X", "X"]), # both level names, reversed
|
||||
],
|
||||
)
|
||||
def test_format_index_level(axis, level, expected):
|
||||
midx = MultiIndex.from_arrays([["_", "_"], ["_", "_"]], names=["zero", "one"])
|
||||
df = DataFrame([[1, 2], [3, 4]])
|
||||
if axis == 0:
|
||||
df.index = midx
|
||||
else:
|
||||
df.columns = midx
|
||||
|
||||
styler = df.style.format_index(lambda v: "X", level=level, axis=axis)
|
||||
ctx = styler._translate(True, True)
|
||||
|
||||
if axis == 0: # compare index
|
||||
result = [ctx["body"][s][0]["display_value"] for s in range(2)]
|
||||
result += [ctx["body"][s][1]["display_value"] for s in range(2)]
|
||||
else: # compare columns
|
||||
result = [ctx["head"][0][s + 1]["display_value"] for s in range(2)]
|
||||
result += [ctx["head"][1][s + 1]["display_value"] for s in range(2)]
|
||||
|
||||
assert expected == result
|
||||
|
||||
|
||||
def test_format_subset():
|
||||
df = DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"])
|
||||
ctx = df.style.format(
|
||||
{"a": "{:0.1f}", "b": "{0:.2%}"}, subset=IndexSlice[0, :]
|
||||
)._translate(True, True)
|
||||
expected = "0.1"
|
||||
raw_11 = "1.123400"
|
||||
assert ctx["body"][0][1]["display_value"] == expected
|
||||
assert ctx["body"][1][1]["display_value"] == raw_11
|
||||
assert ctx["body"][0][2]["display_value"] == "12.34%"
|
||||
|
||||
ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, :])._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == expected
|
||||
assert ctx["body"][1][1]["display_value"] == raw_11
|
||||
|
||||
ctx = df.style.format("{:0.1f}", subset=IndexSlice["a"])._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == expected
|
||||
assert ctx["body"][0][2]["display_value"] == "0.123400"
|
||||
|
||||
ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, "a"])._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == expected
|
||||
assert ctx["body"][1][1]["display_value"] == raw_11
|
||||
|
||||
ctx = df.style.format("{:0.1f}", subset=IndexSlice[[0, 1], ["a"]])._translate(
|
||||
True, True
|
||||
)
|
||||
assert ctx["body"][0][1]["display_value"] == expected
|
||||
assert ctx["body"][1][1]["display_value"] == "1.1"
|
||||
assert ctx["body"][0][2]["display_value"] == "0.123400"
|
||||
assert ctx["body"][1][2]["display_value"] == raw_11
|
||||
|
||||
|
||||
@pytest.mark.parametrize("formatter", [None, "{:,.1f}"])
|
||||
@pytest.mark.parametrize("decimal", [".", "*"])
|
||||
@pytest.mark.parametrize("precision", [None, 2])
|
||||
@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)])
|
||||
def test_format_thousands(formatter, decimal, precision, func, col):
|
||||
styler = DataFrame([[1000000.123456789]], index=[1000000.123456789]).style
|
||||
result = getattr(styler, func)( # testing float
|
||||
thousands="_", formatter=formatter, decimal=decimal, precision=precision
|
||||
)._translate(True, True)
|
||||
assert "1_000_000" in result["body"][0][col]["display_value"]
|
||||
|
||||
styler = DataFrame([[1000000]], index=[1000000]).style
|
||||
result = getattr(styler, func)( # testing int
|
||||
thousands="_", formatter=formatter, decimal=decimal, precision=precision
|
||||
)._translate(True, True)
|
||||
assert "1_000_000" in result["body"][0][col]["display_value"]
|
||||
|
||||
styler = DataFrame([[1 + 1000000.123456789j]], index=[1 + 1000000.123456789j]).style
|
||||
result = getattr(styler, func)( # testing complex
|
||||
thousands="_", formatter=formatter, decimal=decimal, precision=precision
|
||||
)._translate(True, True)
|
||||
assert "1_000_000" in result["body"][0][col]["display_value"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("formatter", [None, "{:,.4f}"])
|
||||
@pytest.mark.parametrize("thousands", [None, ",", "*"])
|
||||
@pytest.mark.parametrize("precision", [None, 4])
|
||||
@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)])
|
||||
def test_format_decimal(formatter, thousands, precision, func, col):
|
||||
styler = DataFrame([[1000000.123456789]], index=[1000000.123456789]).style
|
||||
result = getattr(styler, func)( # testing float
|
||||
decimal="_", formatter=formatter, thousands=thousands, precision=precision
|
||||
)._translate(True, True)
|
||||
assert "000_123" in result["body"][0][col]["display_value"]
|
||||
|
||||
styler = DataFrame([[1 + 1000000.123456789j]], index=[1 + 1000000.123456789j]).style
|
||||
result = getattr(styler, func)( # testing complex
|
||||
decimal="_", formatter=formatter, thousands=thousands, precision=precision
|
||||
)._translate(True, True)
|
||||
assert "000_123" in result["body"][0][col]["display_value"]
|
||||
|
||||
|
||||
def test_str_escape_error():
|
||||
msg = "`escape` only permitted in {'html', 'latex', 'latex-math'}, got "
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
_str_escape("text", "bad_escape")
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
_str_escape("text", [])
|
||||
|
||||
_str_escape(2.00, "bad_escape") # OK since dtype is float
|
||||
|
||||
|
||||
def test_long_int_formatting():
|
||||
df = DataFrame(data=[[1234567890123456789]], columns=["test"])
|
||||
styler = df.style
|
||||
ctx = styler._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "1234567890123456789"
|
||||
|
||||
styler = df.style.format(thousands="_")
|
||||
ctx = styler._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] == "1_234_567_890_123_456_789"
|
||||
|
||||
|
||||
def test_format_options():
|
||||
df = DataFrame({"int": [2000, 1], "float": [1.009, None], "str": ["&<", "&~"]})
|
||||
ctx = df.style._translate(True, True)
|
||||
|
||||
# test option: na_rep
|
||||
assert ctx["body"][1][2]["display_value"] == "nan"
|
||||
with option_context("styler.format.na_rep", "MISSING"):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][1][2]["display_value"] == "MISSING"
|
||||
|
||||
# test option: decimal and precision
|
||||
assert ctx["body"][0][2]["display_value"] == "1.009000"
|
||||
with option_context("styler.format.decimal", "_"):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][0][2]["display_value"] == "1_009000"
|
||||
with option_context("styler.format.precision", 2):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][0][2]["display_value"] == "1.01"
|
||||
|
||||
# test option: thousands
|
||||
assert ctx["body"][0][1]["display_value"] == "2000"
|
||||
with option_context("styler.format.thousands", "_"):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][0][1]["display_value"] == "2_000"
|
||||
|
||||
# test option: escape
|
||||
assert ctx["body"][0][3]["display_value"] == "&<"
|
||||
assert ctx["body"][1][3]["display_value"] == "&~"
|
||||
with option_context("styler.format.escape", "html"):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][0][3]["display_value"] == "&<"
|
||||
with option_context("styler.format.escape", "latex"):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde "
|
||||
with option_context("styler.format.escape", "latex-math"):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde "
|
||||
|
||||
# test option: formatter
|
||||
with option_context("styler.format.formatter", {"int": "{:,.2f}"}):
|
||||
ctx_with_op = df.style._translate(True, True)
|
||||
assert ctx_with_op["body"][0][1]["display_value"] == "2,000.00"
|
||||
|
||||
|
||||
def test_precision_zero(df):
|
||||
styler = Styler(df, precision=0)
|
||||
ctx = styler._translate(True, True)
|
||||
assert ctx["body"][0][2]["display_value"] == "-1"
|
||||
assert ctx["body"][1][2]["display_value"] == "-1"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"formatter, exp",
|
||||
[
|
||||
(lambda x: f"{x:.3f}", "9.000"),
|
||||
("{:.2f}", "9.00"),
|
||||
({0: "{:.1f}"}, "9.0"),
|
||||
(None, "9"),
|
||||
],
|
||||
)
|
||||
def test_formatter_options_validator(formatter, exp):
|
||||
df = DataFrame([[9]])
|
||||
with option_context("styler.format.formatter", formatter):
|
||||
assert f" {exp} " in df.style.to_latex()
|
||||
|
||||
|
||||
def test_formatter_options_raises():
|
||||
msg = "Value must be an instance of"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with option_context("styler.format.formatter", ["bad", "type"]):
|
||||
DataFrame().style.to_latex()
|
||||
|
||||
|
||||
def test_1level_multiindex():
|
||||
# GH 43383
|
||||
midx = MultiIndex.from_product([[1, 2]], names=[""])
|
||||
df = DataFrame(-1, index=midx, columns=[0, 1])
|
||||
ctx = df.style._translate(True, True)
|
||||
assert ctx["body"][0][0]["display_value"] == "1"
|
||||
assert ctx["body"][0][0]["is_visible"] is True
|
||||
assert ctx["body"][1][0]["display_value"] == "2"
|
||||
assert ctx["body"][1][0]["is_visible"] is True
|
||||
|
||||
|
||||
def test_boolean_format():
|
||||
# gh 46384: booleans do not collapse to integer representation on display
|
||||
df = DataFrame([[True, False]])
|
||||
ctx = df.style._translate(True, True)
|
||||
assert ctx["body"][0][1]["display_value"] is True
|
||||
assert ctx["body"][0][2]["display_value"] is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"hide, labels",
|
||||
[
|
||||
(False, [1, 2]),
|
||||
(True, [1, 2, 3, 4]),
|
||||
],
|
||||
)
|
||||
def test_relabel_raise_length(styler_multi, hide, labels):
|
||||
if hide:
|
||||
styler_multi.hide(axis=0, subset=[("X", "x"), ("Y", "y")])
|
||||
with pytest.raises(ValueError, match="``labels`` must be of length equal"):
|
||||
styler_multi.relabel_index(labels=labels)
|
||||
|
||||
|
||||
def test_relabel_index(styler_multi):
|
||||
labels = [(1, 2), (3, 4)]
|
||||
styler_multi.hide(axis=0, subset=[("X", "x"), ("Y", "y")])
|
||||
styler_multi.relabel_index(labels=labels)
|
||||
ctx = styler_multi._translate(True, True)
|
||||
assert {"value": "X", "display_value": 1}.items() <= ctx["body"][0][0].items()
|
||||
assert {"value": "y", "display_value": 2}.items() <= ctx["body"][0][1].items()
|
||||
assert {"value": "Y", "display_value": 3}.items() <= ctx["body"][1][0].items()
|
||||
assert {"value": "x", "display_value": 4}.items() <= ctx["body"][1][1].items()
|
||||
|
||||
|
||||
def test_relabel_columns(styler_multi):
|
||||
labels = [(1, 2), (3, 4)]
|
||||
styler_multi.hide(axis=1, subset=[("A", "a"), ("B", "b")])
|
||||
styler_multi.relabel_index(axis=1, labels=labels)
|
||||
ctx = styler_multi._translate(True, True)
|
||||
assert {"value": "A", "display_value": 1}.items() <= ctx["head"][0][3].items()
|
||||
assert {"value": "B", "display_value": 3}.items() <= ctx["head"][0][4].items()
|
||||
assert {"value": "b", "display_value": 2}.items() <= ctx["head"][1][3].items()
|
||||
assert {"value": "a", "display_value": 4}.items() <= ctx["head"][1][4].items()
|
||||
|
||||
|
||||
def test_relabel_roundtrip(styler):
|
||||
styler.relabel_index(["{}", "{}"])
|
||||
ctx = styler._translate(True, True)
|
||||
assert {"value": "x", "display_value": "x"}.items() <= ctx["body"][0][0].items()
|
||||
assert {"value": "y", "display_value": "y"}.items() <= ctx["body"][1][0].items()
|
||||
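A minimal sketch of the Styler.format / format_index options covered above, with illustrative data (not from the committed files):

import numpy as np
import pandas as pd

df = pd.DataFrame({"num": [1234.5678, np.nan], "txt": ["<b>", "ok"]})
styled = df.style.format(
    {"num": "{:,.2f}"},   # per-column format string
    na_rep="-",           # replacement for missing values
    escape="html",        # escape <, >, & and quotes in string cells
).format_index(str.upper, axis=1)   # callable formatter for column labels
html = styled.to_html()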
@ -0,0 +1,218 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
NA,
|
||||
DataFrame,
|
||||
IndexSlice,
|
||||
)
|
||||
|
||||
pytest.importorskip("jinja2")
|
||||
|
||||
from pandas.io.formats.style import Styler
|
||||
|
||||
|
||||
@pytest.fixture(params=[(None, "float64"), (NA, "Int64")])
|
||||
def df(request):
|
||||
# GH 45804
|
||||
return DataFrame(
|
||||
{"A": [0, np.nan, 10], "B": [1, request.param[0], 2]}, dtype=request.param[1]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler(df):
|
||||
return Styler(df, uuid_len=0)
|
||||
|
||||
|
||||
def test_highlight_null(styler):
|
||||
result = styler.highlight_null()._compute().ctx
|
||||
expected = {
|
||||
(1, 0): [("background-color", "red")],
|
||||
(1, 1): [("background-color", "red")],
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_highlight_null_subset(styler):
|
||||
# GH 31345
|
||||
result = (
|
||||
styler.highlight_null(color="red", subset=["A"])
|
||||
.highlight_null(color="green", subset=["B"])
|
||||
._compute()
|
||||
.ctx
|
||||
)
|
||||
expected = {
|
||||
(1, 0): [("background-color", "red")],
|
||||
(1, 1): [("background-color", "green")],
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
|
||||
def test_highlight_minmax_basic(df, f):
|
||||
expected = {
|
||||
(0, 1): [("background-color", "red")],
|
||||
# ignores NaN row,
|
||||
(2, 0): [("background-color", "red")],
|
||||
}
|
||||
if f == "highlight_min":
|
||||
df = -df
|
||||
result = getattr(df.style, f)(axis=1, color="red")._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"axis": None, "color": "red"}, # test axis
|
||||
{"axis": 0, "subset": ["A"], "color": "red"}, # test subset and ignores NaN
|
||||
{"axis": None, "props": "background-color: red"}, # test props
|
||||
],
|
||||
)
|
||||
def test_highlight_minmax_ext(df, f, kwargs):
|
||||
expected = {(2, 0): [("background-color", "red")]}
|
||||
if f == "highlight_min":
|
||||
df = -df
|
||||
result = getattr(df.style, f)(**kwargs)._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
|
||||
@pytest.mark.parametrize("axis", [None, 0, 1])
|
||||
def test_highlight_minmax_nulls(f, axis):
|
||||
# GH 42750
|
||||
expected = {
|
||||
(1, 0): [("background-color", "yellow")],
|
||||
(1, 1): [("background-color", "yellow")],
|
||||
}
|
||||
if axis == 1:
|
||||
expected.update({(2, 1): [("background-color", "yellow")]})
|
||||
|
||||
if f == "highlight_max":
|
||||
df = DataFrame({"a": [NA, 1, None], "b": [np.nan, 1, -1]})
|
||||
else:
|
||||
df = DataFrame({"a": [NA, -1, None], "b": [np.nan, -1, 1]})
|
||||
|
||||
result = getattr(df.style, f)(axis=axis)._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"left": 0, "right": 1}, # test basic range
|
||||
{"left": 0, "right": 1, "props": "background-color: yellow"}, # test props
|
||||
{"left": -100, "right": 100, "subset": IndexSlice[[0, 1], :]}, # test subset
|
||||
{"left": 0, "subset": IndexSlice[[0, 1], :]}, # test no right
|
||||
{"right": 1}, # test no left
|
||||
{"left": [0, 0, 11], "axis": 0}, # test left as sequence
|
||||
{"left": DataFrame({"A": [0, 0, 11], "B": [1, 1, 11]}), "axis": None}, # axis
|
||||
{"left": 0, "right": [0, 1], "axis": 1}, # test sequence right
|
||||
],
|
||||
)
|
||||
def test_highlight_between(styler, kwargs):
|
||||
expected = {
|
||||
(0, 0): [("background-color", "yellow")],
|
||||
(0, 1): [("background-color", "yellow")],
|
||||
}
|
||||
result = styler.highlight_between(**kwargs)._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arg, map, axis",
|
||||
[
|
||||
("left", [1, 2], 0), # 0 axis has 3 elements not 2
|
||||
("left", [1, 2, 3], 1), # 1 axis has 2 elements not 3
|
||||
("left", np.array([[1, 2], [1, 2]]), None), # df is (2,3) not (2,2)
|
||||
("right", [1, 2], 0), # same tests as above for 'right' not 'left'
|
||||
("right", [1, 2, 3], 1), # ..
|
||||
("right", np.array([[1, 2], [1, 2]]), None), # ..
|
||||
],
|
||||
)
|
||||
def test_highlight_between_raises(arg, styler, map, axis):
|
||||
msg = f"supplied '{arg}' is not correct shape"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
styler.highlight_between(**{arg: map, "axis": axis})._compute()
|
||||
|
||||
|
||||
def test_highlight_between_raises2(styler):
|
||||
msg = "values can be 'both', 'left', 'right', or 'neither'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
styler.highlight_between(inclusive="badstring")._compute()
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
styler.highlight_between(inclusive=1)._compute()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"inclusive, expected",
|
||||
[
|
||||
(
|
||||
"both",
|
||||
{
|
||||
(0, 0): [("background-color", "yellow")],
|
||||
(0, 1): [("background-color", "yellow")],
|
||||
},
|
||||
),
|
||||
("neither", {}),
|
||||
("left", {(0, 0): [("background-color", "yellow")]}),
|
||||
("right", {(0, 1): [("background-color", "yellow")]}),
|
||||
],
|
||||
)
|
||||
def test_highlight_between_inclusive(styler, inclusive, expected):
|
||||
kwargs = {"left": 0, "right": 1, "subset": IndexSlice[[0, 1], :]}
|
||||
result = styler.highlight_between(**kwargs, inclusive=inclusive)._compute()
|
||||
assert result.ctx == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"q_left": 0.5, "q_right": 1, "axis": 0}, # base case
|
||||
{"q_left": 0.5, "q_right": 1, "axis": None}, # test axis
|
||||
{"q_left": 0, "q_right": 1, "subset": IndexSlice[2, :]}, # test subset
|
||||
{"q_left": 0.5, "axis": 0}, # test no high
|
||||
{"q_right": 1, "subset": IndexSlice[2, :], "axis": 1}, # test no low
|
||||
{"q_left": 0.5, "axis": 0, "props": "background-color: yellow"}, # tst prop
|
||||
],
|
||||
)
|
||||
def test_highlight_quantile(styler, kwargs):
|
||||
expected = {
|
||||
(2, 0): [("background-color", "yellow")],
|
||||
(2, 1): [("background-color", "yellow")],
|
||||
}
|
||||
result = styler.highlight_quantile(**kwargs)._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"f,kwargs",
|
||||
[
|
||||
("highlight_min", {"axis": 1, "subset": IndexSlice[1, :]}),
|
||||
("highlight_max", {"axis": 0, "subset": [0]}),
|
||||
("highlight_quantile", {"axis": None, "q_left": 0.6, "q_right": 0.8}),
|
||||
("highlight_between", {"subset": [0]}),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"df",
|
||||
[
|
||||
DataFrame([[0, 10], [20, 30]], dtype=int),
|
||||
DataFrame([[0, 10], [20, 30]], dtype=float),
|
||||
DataFrame([[0, 10], [20, 30]], dtype="datetime64[ns]"),
|
||||
DataFrame([[0, 10], [20, 30]], dtype=str),
|
||||
DataFrame([[0, 10], [20, 30]], dtype="timedelta64[ns]"),
|
||||
],
|
||||
)
|
||||
def test_all_highlight_dtypes(f, kwargs, df):
|
||||
if f == "highlight_quantile" and isinstance(df.iloc[0, 0], (str)):
|
||||
return None # quantile incompatible with str
|
||||
if f == "highlight_between":
|
||||
kwargs["left"] = df.iloc[1, 0] # set the range low for testing
|
||||
|
||||
expected = {(1, 0): [("background-color", "yellow")]}
|
||||
result = getattr(df.style, f)(**kwargs)._compute().ctx
|
||||
assert result == expected
|
||||
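A minimal sketch of the highlighting helpers exercised above, on illustrative data (not from the committed files):

import pandas as pd

df = pd.DataFrame({"A": [0, 5, 10], "B": [1.0, None, 2.0]})
styled = (
    df.style.highlight_null(color="red")                  # missing values
    .highlight_max(axis=0, color="yellow")                # column-wise maxima
    .highlight_between(left=0, right=2, color="lightgreen", inclusive="both")
)
html = styled.to_html()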
File diff suppressed because it is too large
@ -0,0 +1,335 @@
|
||||
import gc
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
IndexSlice,
|
||||
Series,
|
||||
)
|
||||
|
||||
pytest.importorskip("matplotlib")
|
||||
pytest.importorskip("jinja2")
|
||||
|
||||
import matplotlib as mpl
|
||||
|
||||
from pandas.io.formats.style import Styler
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def mpl_cleanup():
|
||||
# matplotlib/testing/decorators.py#L24
|
||||
# 1) Resets units registry
|
||||
# 2) Resets rc_context
|
||||
# 3) Closes all figures
|
||||
mpl = pytest.importorskip("matplotlib")
|
||||
mpl_units = pytest.importorskip("matplotlib.units")
|
||||
plt = pytest.importorskip("matplotlib.pyplot")
|
||||
orig_units_registry = mpl_units.registry.copy()
|
||||
with mpl.rc_context():
|
||||
mpl.use("template")
|
||||
yield
|
||||
mpl_units.registry.clear()
|
||||
mpl_units.registry.update(orig_units_registry)
|
||||
plt.close("all")
|
||||
# https://matplotlib.org/stable/users/prev_whats_new/whats_new_3.6.0.html#garbage-collection-is-no-longer-run-on-figure-close # noqa: E501
|
||||
gc.collect(1)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df():
|
||||
return DataFrame([[1, 2], [2, 4]], columns=["A", "B"])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler(df):
|
||||
return Styler(df, uuid_len=0)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_blank():
|
||||
return DataFrame([[0, 0], [0, 0]], columns=["A", "B"], index=["X", "Y"])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler_blank(df_blank):
|
||||
return Styler(df_blank, uuid_len=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
|
||||
def test_function_gradient(styler, f):
|
||||
for c_map in [None, "YlOrRd"]:
|
||||
result = getattr(styler, f)(cmap=c_map)._compute().ctx
|
||||
assert all("#" in x[0][1] for x in result.values())
|
||||
assert result[(0, 0)] == result[(0, 1)]
|
||||
assert result[(1, 0)] == result[(1, 1)]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
|
||||
def test_background_gradient_color(styler, f):
|
||||
result = getattr(styler, f)(subset=IndexSlice[1, "A"])._compute().ctx
|
||||
if f == "background_gradient":
|
||||
assert result[(1, 0)] == [("background-color", "#fff7fb"), ("color", "#000000")]
|
||||
elif f == "text_gradient":
|
||||
assert result[(1, 0)] == [("color", "#fff7fb")]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"axis, expected",
|
||||
[
|
||||
(0, ["low", "low", "high", "high"]),
|
||||
(1, ["low", "high", "low", "high"]),
|
||||
(None, ["low", "mid", "mid", "high"]),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
|
||||
def test_background_gradient_axis(styler, axis, expected, f):
|
||||
if f == "background_gradient":
|
||||
colors = {
|
||||
"low": [("background-color", "#f7fbff"), ("color", "#000000")],
|
||||
"mid": [("background-color", "#abd0e6"), ("color", "#000000")],
|
||||
"high": [("background-color", "#08306b"), ("color", "#f1f1f1")],
|
||||
}
|
||||
elif f == "text_gradient":
|
||||
colors = {
|
||||
"low": [("color", "#f7fbff")],
|
||||
"mid": [("color", "#abd0e6")],
|
||||
"high": [("color", "#08306b")],
|
||||
}
|
||||
result = getattr(styler, f)(cmap="Blues", axis=axis)._compute().ctx
|
||||
for i, cell in enumerate([(0, 0), (0, 1), (1, 0), (1, 1)]):
|
||||
assert result[cell] == colors[expected[i]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cmap, expected",
|
||||
[
|
||||
(
|
||||
"PuBu",
|
||||
{
|
||||
(4, 5): [("background-color", "#86b0d3"), ("color", "#000000")],
|
||||
(4, 6): [("background-color", "#83afd3"), ("color", "#f1f1f1")],
|
||||
},
|
||||
),
|
||||
(
|
||||
"YlOrRd",
|
||||
{
|
||||
(4, 8): [("background-color", "#fd913e"), ("color", "#000000")],
|
||||
(4, 9): [("background-color", "#fd8f3d"), ("color", "#f1f1f1")],
|
||||
},
|
||||
),
|
||||
(
|
||||
None,
|
||||
{
|
||||
(7, 0): [("background-color", "#48c16e"), ("color", "#f1f1f1")],
|
||||
(7, 1): [("background-color", "#4cc26c"), ("color", "#000000")],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_text_color_threshold(cmap, expected):
|
||||
# GH 39888
|
||||
df = DataFrame(np.arange(100).reshape(10, 10))
|
||||
result = df.style.background_gradient(cmap=cmap, axis=None)._compute().ctx
|
||||
for k in expected.keys():
|
||||
assert result[k] == expected[k]
|
||||
|
||||
|
||||
def test_background_gradient_vmin_vmax():
|
||||
# GH 12145
|
||||
df = DataFrame(range(5))
|
||||
ctx = df.style.background_gradient(vmin=1, vmax=3)._compute().ctx
|
||||
assert ctx[(0, 0)] == ctx[(1, 0)]
|
||||
assert ctx[(4, 0)] == ctx[(3, 0)]
|
||||
|
||||
|
||||
def test_background_gradient_int64():
|
||||
# GH 28869
|
||||
df1 = Series(range(3)).to_frame()
|
||||
df2 = Series(range(3), dtype="Int64").to_frame()
|
||||
ctx1 = df1.style.background_gradient()._compute().ctx
|
||||
ctx2 = df2.style.background_gradient()._compute().ctx
|
||||
assert ctx2[(0, 0)] == ctx1[(0, 0)]
|
||||
assert ctx2[(1, 0)] == ctx1[(1, 0)]
|
||||
assert ctx2[(2, 0)] == ctx1[(2, 0)]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"axis, gmap, expected",
|
||||
[
|
||||
(
|
||||
0,
|
||||
[1, 2],
|
||||
{
|
||||
(0, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
|
||||
(1, 0): [("background-color", "#023858"), ("color", "#f1f1f1")],
|
||||
(0, 1): [("background-color", "#fff7fb"), ("color", "#000000")],
|
||||
(1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
|
||||
},
|
||||
),
|
||||
(
|
||||
1,
|
||||
[1, 2],
|
||||
{
|
||||
(0, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
|
||||
(1, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
|
||||
(0, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
|
||||
(1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
|
||||
},
|
||||
),
|
||||
(
|
||||
None,
|
||||
np.array([[2, 1], [1, 2]]),
|
||||
{
|
||||
(0, 0): [("background-color", "#023858"), ("color", "#f1f1f1")],
|
||||
(1, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
|
||||
(0, 1): [("background-color", "#fff7fb"), ("color", "#000000")],
|
||||
(1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_background_gradient_gmap_array(styler_blank, axis, gmap, expected):
|
||||
# tests when gmap is given as a sequence and converted to ndarray
|
||||
result = styler_blank.background_gradient(axis=axis, gmap=gmap)._compute().ctx
|
||||
assert result == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"gmap, axis", [([1, 2, 3], 0), ([1, 2], 1), (np.array([[1, 2], [1, 2]]), None)]
|
||||
)
|
||||
def test_background_gradient_gmap_array_raises(gmap, axis):
|
||||
# test when gmap as converted ndarray is bad shape
|
||||
df = DataFrame([[0, 0, 0], [0, 0, 0]])
|
||||
msg = "supplied 'gmap' is not correct shape"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.background_gradient(gmap=gmap, axis=axis)._compute()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"gmap",
|
||||
[
|
||||
DataFrame( # reverse the columns
|
||||
[[2, 1], [1, 2]], columns=["B", "A"], index=["X", "Y"]
|
||||
),
|
||||
DataFrame( # reverse the index
|
||||
[[2, 1], [1, 2]], columns=["A", "B"], index=["Y", "X"]
|
||||
),
|
||||
DataFrame( # reverse the index and columns
|
||||
[[1, 2], [2, 1]], columns=["B", "A"], index=["Y", "X"]
|
||||
),
|
||||
DataFrame( # add unnecessary columns
|
||||
[[1, 2, 3], [2, 1, 3]], columns=["A", "B", "C"], index=["X", "Y"]
|
||||
),
|
||||
DataFrame( # add unnecessary index
|
||||
[[1, 2], [2, 1], [3, 3]], columns=["A", "B"], index=["X", "Y", "Z"]
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"subset, exp_gmap", # exp_gmap is underlying map DataFrame should conform to
|
||||
[
|
||||
(None, [[1, 2], [2, 1]]),
|
||||
(["A"], [[1], [2]]), # slice only column "A" in data and gmap
|
||||
(["B", "A"], [[2, 1], [1, 2]]), # reverse the columns in data
|
||||
(IndexSlice["X", :], [[1, 2]]), # slice only index "X" in data and gmap
|
||||
(IndexSlice[["Y", "X"], :], [[2, 1], [1, 2]]), # reverse the index in data
|
||||
],
|
||||
)
|
||||
def test_background_gradient_gmap_dataframe_align(styler_blank, gmap, subset, exp_gmap):
|
||||
# test gmap given as DataFrame that it aligns to the data including subset
|
||||
expected = styler_blank.background_gradient(axis=None, gmap=exp_gmap, subset=subset)
|
||||
result = styler_blank.background_gradient(axis=None, gmap=gmap, subset=subset)
|
||||
assert expected._compute().ctx == result._compute().ctx
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"gmap, axis, exp_gmap",
|
||||
[
|
||||
(Series([2, 1], index=["Y", "X"]), 0, [[1, 1], [2, 2]]), # revrse the index
|
||||
(Series([2, 1], index=["B", "A"]), 1, [[1, 2], [1, 2]]), # revrse the cols
|
||||
(Series([1, 2, 3], index=["X", "Y", "Z"]), 0, [[1, 1], [2, 2]]), # add idx
|
||||
(Series([1, 2, 3], index=["A", "B", "C"]), 1, [[1, 2], [1, 2]]), # add col
|
||||
],
|
||||
)
|
||||
def test_background_gradient_gmap_series_align(styler_blank, gmap, axis, exp_gmap):
|
||||
# test gmap given as Series that it aligns to the data including subset
|
||||
expected = styler_blank.background_gradient(axis=None, gmap=exp_gmap)._compute()
|
||||
result = styler_blank.background_gradient(axis=axis, gmap=gmap)._compute()
|
||||
assert expected.ctx == result.ctx
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"gmap, axis",
|
||||
[
|
||||
(DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 1),
|
||||
(DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 0),
|
||||
],
|
||||
)
|
||||
def test_background_gradient_gmap_wrong_dataframe(styler_blank, gmap, axis):
|
||||
# test giving a gmap in DataFrame but with wrong axis
|
||||
msg = "'gmap' is a DataFrame but underlying data for operations is a Series"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
styler_blank.background_gradient(gmap=gmap, axis=axis)._compute()
|
||||
|
||||
|
||||
def test_background_gradient_gmap_wrong_series(styler_blank):
|
||||
# test giving a gmap in Series form but with wrong axis
|
||||
msg = "'gmap' is a Series but underlying data for operations is a DataFrame"
|
||||
gmap = Series([1, 2], index=["X", "Y"])
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
styler_blank.background_gradient(gmap=gmap, axis=None)._compute()
|
||||
|
||||
|
||||
def test_background_gradient_nullable_dtypes():
|
||||
# GH 50712
|
||||
df1 = DataFrame([[1], [0], [np.nan]], dtype=float)
|
||||
df2 = DataFrame([[1], [0], [None]], dtype="Int64")
|
||||
|
||||
ctx1 = df1.style.background_gradient()._compute().ctx
|
||||
ctx2 = df2.style.background_gradient()._compute().ctx
|
||||
assert ctx1 == ctx2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cmap",
|
||||
["PuBu", mpl.colormaps["PuBu"]],
|
||||
)
|
||||
def test_bar_colormap(cmap):
|
||||
data = DataFrame([[1, 2], [3, 4]])
|
||||
ctx = data.style.bar(cmap=cmap, axis=None)._compute().ctx
|
||||
pubu_colors = {
|
||||
(0, 0): "#d0d1e6",
|
||||
(1, 0): "#056faf",
|
||||
(0, 1): "#73a9cf",
|
||||
(1, 1): "#023858",
|
||||
}
|
||||
for k, v in pubu_colors.items():
|
||||
assert v in ctx[k][1][1]
|
||||
|
||||
|
||||
def test_bar_color_raises(df):
|
||||
msg = "`color` must be string or list or tuple of 2 strings"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(color={"a", "b"}).to_html()
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(color=["a", "b", "c"]).to_html()
|
||||
|
||||
msg = "`color` and `cmap` cannot both be given"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.style.bar(color="something", cmap="something else").to_html()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"plot_method",
|
||||
["scatter", "hexbin"],
|
||||
)
|
||||
def test_pass_colormap_instance(df, plot_method):
|
||||
# https://github.com/pandas-dev/pandas/issues/49374
|
||||
cmap = mpl.colors.ListedColormap([[1, 1, 1], [0, 0, 0]])
|
||||
df["c"] = df.A + df.B
|
||||
kwargs = {"x": "A", "y": "B", "c": "c", "colormap": cmap}
|
||||
if plot_method == "hexbin":
|
||||
kwargs["C"] = kwargs.pop("c")
|
||||
getattr(df.plot, plot_method)(**kwargs)
|
||||
@ -0,0 +1,140 @@
|
||||
from textwrap import dedent
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
IndexSlice,
|
||||
)
|
||||
|
||||
pytest.importorskip("jinja2")
|
||||
|
||||
from pandas.io.formats.style import Styler
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df():
|
||||
return DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["i", "j", "j"],
|
||||
columns=["c", "d", "d"],
|
||||
dtype=float,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler(df):
|
||||
return Styler(df, uuid_len=0)
|
||||
|
||||
|
||||
def test_format_non_unique(df):
|
||||
# GH 41269
|
||||
|
||||
# test dict
|
||||
html = df.style.format({"d": "{:.1f}"}).to_html()
|
||||
for val in ["1.000000<", "4.000000<", "7.000000<"]:
|
||||
assert val in html
|
||||
for val in ["2.0<", "3.0<", "5.0<", "6.0<", "8.0<", "9.0<"]:
|
||||
assert val in html
|
||||
|
||||
# test subset
|
||||
html = df.style.format(precision=1, subset=IndexSlice["j", "d"]).to_html()
|
||||
for val in ["1.000000<", "4.000000<", "7.000000<", "2.000000<", "3.000000<"]:
|
||||
assert val in html
|
||||
for val in ["5.0<", "6.0<", "8.0<", "9.0<"]:
|
||||
assert val in html
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["apply", "map"])
|
||||
def test_apply_map_non_unique_raises(df, func):
|
||||
# GH 41269
|
||||
if func == "apply":
|
||||
op = lambda s: ["color: red;"] * len(s)
|
||||
else:
|
||||
op = lambda v: "color: red;"
|
||||
|
||||
with pytest.raises(KeyError, match="`Styler.apply` and `.map` are not"):
|
||||
getattr(df.style, func)(op)._compute()
|
||||
|
||||
|
||||
def test_table_styles_dict_non_unique_index(styler):
|
||||
styles = styler.set_table_styles(
|
||||
{"j": [{"selector": "td", "props": "a: v;"}]}, axis=1
|
||||
).table_styles
|
||||
assert styles == [
|
||||
{"selector": "td.row1", "props": [("a", "v")]},
|
||||
{"selector": "td.row2", "props": [("a", "v")]},
|
||||
]
|
||||
|
||||
|
||||
def test_table_styles_dict_non_unique_columns(styler):
|
||||
styles = styler.set_table_styles(
|
||||
{"d": [{"selector": "td", "props": "a: v;"}]}, axis=0
|
||||
).table_styles
|
||||
assert styles == [
|
||||
{"selector": "td.col1", "props": [("a", "v")]},
|
||||
{"selector": "td.col2", "props": [("a", "v")]},
|
||||
]
|
||||
|
||||
|
||||
def test_tooltips_non_unique_raises(styler):
|
||||
# ttips has unique keys
|
||||
ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "b"])
|
||||
styler.set_tooltips(ttips=ttips) # OK
|
||||
|
||||
# ttips has non-unique columns
|
||||
ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "c"], index=["a", "b"])
|
||||
with pytest.raises(KeyError, match="Tooltips render only if `ttips` has unique"):
|
||||
styler.set_tooltips(ttips=ttips)
|
||||
|
||||
# ttips has non-unique index
|
||||
ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "a"])
|
||||
with pytest.raises(KeyError, match="Tooltips render only if `ttips` has unique"):
|
||||
styler.set_tooltips(ttips=ttips)
|
||||
|
||||
|
||||
def test_set_td_classes_non_unique_raises(styler):
|
||||
# classes has unique keys
|
||||
classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "b"])
|
||||
styler.set_td_classes(classes=classes) # OK
|
||||
|
||||
# classes has non-unique columns
|
||||
classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "c"], index=["a", "b"])
|
||||
with pytest.raises(KeyError, match="Classes render only if `classes` has unique"):
|
||||
styler.set_td_classes(classes=classes)
|
||||
|
||||
# classes has non-unique index
|
||||
classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "a"])
|
||||
with pytest.raises(KeyError, match="Classes render only if `classes` has unique"):
|
||||
styler.set_td_classes(classes=classes)
|
||||
|
||||
|
||||
def test_hide_columns_non_unique(styler):
|
||||
ctx = styler.hide(["d"], axis="columns")._translate(True, True)
|
||||
|
||||
assert ctx["head"][0][1]["display_value"] == "c"
|
||||
assert ctx["head"][0][1]["is_visible"] is True
|
||||
|
||||
assert ctx["head"][0][2]["display_value"] == "d"
|
||||
assert ctx["head"][0][2]["is_visible"] is False
|
||||
|
||||
assert ctx["head"][0][3]["display_value"] == "d"
|
||||
assert ctx["head"][0][3]["is_visible"] is False
|
||||
|
||||
assert ctx["body"][0][1]["is_visible"] is True
|
||||
assert ctx["body"][0][2]["is_visible"] is False
|
||||
assert ctx["body"][0][3]["is_visible"] is False
|
||||
|
||||
|
||||
def test_latex_non_unique(styler):
|
||||
result = styler.to_latex()
|
||||
assert result == dedent(
|
||||
"""\
|
||||
\\begin{tabular}{lrrr}
|
||||
& c & d & d \\\\
|
||||
i & 1.000000 & 2.000000 & 3.000000 \\\\
|
||||
j & 4.000000 & 5.000000 & 6.000000 \\\\
|
||||
j & 7.000000 & 8.000000 & 9.000000 \\\\
|
||||
\\end{tabular}
|
||||
"""
|
||||
)
|
||||
File diff suppressed because it is too large
@ -0,0 +1,96 @@
|
||||
from textwrap import dedent
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
pytest.importorskip("jinja2")
|
||||
from pandas.io.formats.style import Styler
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df():
|
||||
return DataFrame(
|
||||
{"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler(df):
|
||||
return Styler(df, uuid_len=0, precision=2)
|
||||
|
||||
|
||||
def test_basic_string(styler):
|
||||
result = styler.to_string()
|
||||
expected = dedent(
|
||||
"""\
|
||||
A B C
|
||||
0 0 -0.61 ab
|
||||
1 1 -1.22 cd
|
||||
"""
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_string_delimiter(styler):
|
||||
result = styler.to_string(delimiter=";")
|
||||
expected = dedent(
|
||||
"""\
|
||||
;A;B;C
|
||||
0;0;-0.61;ab
|
||||
1;1;-1.22;cd
|
||||
"""
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_concat(styler):
|
||||
result = styler.concat(styler.data.agg(["sum"]).style).to_string()
|
||||
expected = dedent(
|
||||
"""\
|
||||
A B C
|
||||
0 0 -0.61 ab
|
||||
1 1 -1.22 cd
|
||||
sum 1 -1.830000 abcd
|
||||
"""
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_concat_recursion(styler):
|
||||
df = styler.data
|
||||
styler1 = styler
|
||||
styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3)
|
||||
styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4)
|
||||
result = styler1.concat(styler2.concat(styler3)).to_string()
|
||||
expected = dedent(
|
||||
"""\
|
||||
A B C
|
||||
0 0 -0.61 ab
|
||||
1 1 -1.22 cd
|
||||
sum 1 -1.830 abcd
|
||||
sum 1 -1.8300 abcd
|
||||
"""
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_concat_chain(styler):
|
||||
df = styler.data
|
||||
styler1 = styler
|
||||
styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3)
|
||||
styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4)
|
||||
result = styler1.concat(styler2).concat(styler3).to_string()
|
||||
expected = dedent(
|
||||
"""\
|
||||
A B C
|
||||
0 0 -0.61 ab
|
||||
1 1 -1.22 cd
|
||||
sum 1 -1.830 abcd
|
||||
sum 1 -1.8300 abcd
|
||||
"""
|
||||
)
|
||||
assert result == expected
|
||||
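A minimal sketch of the concat/to_string combination tested above, with an illustrative frame (not from the committed files):

import pandas as pd

df = pd.DataFrame({"A": [0, 1], "B": [-0.61, -1.22]})
total = df.agg(["sum"])                  # one-row summary to append
styled = df.style.concat(total.style)    # rendered below the body rows
text = styled.to_string(delimiter=" ")   # plain-text output, as in the tests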
@ -0,0 +1,85 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
pytest.importorskip("jinja2")
|
||||
from pandas.io.formats.style import Styler
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df():
|
||||
return DataFrame(
|
||||
data=[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
|
||||
columns=["A", "B", "C"],
|
||||
index=["x", "y", "z"],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def styler(df):
|
||||
return Styler(df, uuid_len=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ttips",
|
||||
[
|
||||
DataFrame( # Test basic reindex and ignoring blank
|
||||
data=[["Min", "Max"], [np.nan, ""]],
|
||||
columns=["A", "C"],
|
||||
index=["x", "y"],
|
||||
),
|
||||
DataFrame( # Test non-referenced columns, reversed col names, short index
|
||||
data=[["Max", "Min", "Bad-Col"]], columns=["C", "A", "D"], index=["x"]
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_tooltip_render(ttips, styler):
|
||||
# GH 21266
|
||||
result = styler.set_tooltips(ttips).to_html()
|
||||
|
||||
# test tooltip table level class
|
||||
assert "#T_ .pd-t {\n visibility: hidden;\n" in result
|
||||
|
||||
# test 'Min' tooltip added
|
||||
assert "#T_ #T__row0_col0:hover .pd-t {\n visibility: visible;\n}" in result
|
||||
assert '#T_ #T__row0_col0 .pd-t::after {\n content: "Min";\n}' in result
|
||||
assert 'class="data row0 col0" >0<span class="pd-t"></span></td>' in result
|
||||
|
||||
# test 'Max' tooltip added
|
||||
assert "#T_ #T__row0_col2:hover .pd-t {\n visibility: visible;\n}" in result
|
||||
assert '#T_ #T__row0_col2 .pd-t::after {\n content: "Max";\n}' in result
|
||||
assert 'class="data row0 col2" >2<span class="pd-t"></span></td>' in result
|
||||
|
||||
# test Nan, empty string and bad column ignored
|
||||
assert "#T_ #T__row1_col0:hover .pd-t {\n visibility: visible;\n}" not in result
|
||||
assert "#T_ #T__row1_col1:hover .pd-t {\n visibility: visible;\n}" not in result
|
||||
assert "#T_ #T__row0_col1:hover .pd-t {\n visibility: visible;\n}" not in result
|
||||
assert "#T_ #T__row1_col2:hover .pd-t {\n visibility: visible;\n}" not in result
|
||||
assert "Bad-Col" not in result
|
||||
|
||||
|
||||
def test_tooltip_ignored(styler):
|
||||
# GH 21266
|
||||
result = styler.to_html() # no set_tooltips() creates no <span>
|
||||
assert '<style type="text/css">\n</style>' in result
|
||||
assert '<span class="pd-t"></span>' not in result
|
||||
|
||||
|
||||
def test_tooltip_css_class(styler):
|
||||
# GH 21266
|
||||
result = styler.set_tooltips(
|
||||
DataFrame([["tooltip"]], index=["x"], columns=["A"]),
|
||||
css_class="other-class",
|
||||
props=[("color", "green")],
|
||||
).to_html()
|
||||
assert "#T_ .other-class {\n color: green;\n" in result
|
||||
assert '#T_ #T__row0_col0 .other-class::after {\n content: "tooltip";\n' in result
|
||||
|
||||
# GH 39563
|
||||
result = styler.set_tooltips( # set_tooltips overwrites previous
|
||||
DataFrame([["tooltip"]], index=["x"], columns=["A"]),
|
||||
css_class="another-class",
|
||||
props="color:green;color:red;",
|
||||
).to_html()
|
||||
assert "#T_ .another-class {\n color: green;\n color: red;\n}" in result
|
||||
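A minimal sketch of Styler.set_tooltips as exercised above, with illustrative tooltip text (not from the committed files):

import pandas as pd

df = pd.DataFrame({"A": [0, 1]}, index=["x", "y"])
tips = pd.DataFrame({"A": ["lowest", None]}, index=["x", "y"])  # NaN -> no tooltip
styled = df.style.set_tooltips(
    tips,
    css_class="pd-t",                  # class targeted by the generated CSS
    props=[("visibility", "hidden")],  # base style for the tooltip span
)
html = styled.to_html()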
@ -0,0 +1,72 @@
import locale

import pytest

from pandas._config import detect_console_encoding


class MockEncoding:
    """
    Used to add a side effect when accessing the 'encoding' property. If the
    side effect is a str, the value will be returned. Otherwise, the side
    effect should be an exception that will be raised.

    A short illustrative usage sketch follows the class definition.
    """

    def __init__(self, encoding) -> None:
        super().__init__()
        self.val = encoding

    @property
    def encoding(self):
        return self.raise_or_return(self.val)

    @staticmethod
    def raise_or_return(val):
        if isinstance(val, str):
            return val
        else:
            raise val


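# Minimal illustrative sketch (not in the original file), referenced from the
# MockEncoding docstring: a string is echoed back, anything else is raised.
# This is how the tests below simulate broken sys.stdout/sys.stdin objects.
def _mock_encoding_sketch():
    assert MockEncoding("utf-8").encoding == "utf-8"
    try:
        _ = MockEncoding(AttributeError).encoding
    except AttributeError:
        pass  # the stored exception is raised on attribute access

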
@pytest.mark.parametrize("empty,filled", [["stdin", "stdout"], ["stdout", "stdin"]])
def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled):
    # Ensures that sys.stdout.encoding or sys.stdin.encoding is used when
    # they have values filled.
    # GH 21552
    with monkeypatch.context() as context:
        context.setattr(f"sys.{empty}", MockEncoding(""))
        context.setattr(f"sys.{filled}", MockEncoding(filled))
        assert detect_console_encoding() == filled


@pytest.mark.parametrize("encoding", [AttributeError, OSError, "ascii"])
def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding):
    # GH 21552
    with monkeypatch.context() as context:
        context.setattr("locale.getpreferredencoding", lambda: "foo")
        context.setattr("sys.stdout", MockEncoding(encoding))
        assert detect_console_encoding() == "foo"


@pytest.mark.parametrize(
    "std,locale",
    [
        ["ascii", "ascii"],
        ["ascii", locale.Error],
        [AttributeError, "ascii"],
        [AttributeError, locale.Error],
        [OSError, "ascii"],
        [OSError, locale.Error],
    ],
)
def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale):
    # When both the stdout/stdin encoding and locale preferred encoding checks
    # fail (or return 'ascii'), we should default to the sys default encoding.
    # GH 21552
    with monkeypatch.context() as context:
        context.setattr(
            "locale.getpreferredencoding", lambda: MockEncoding.raise_or_return(locale)
        )
        context.setattr("sys.stdout", MockEncoding(std))
        context.setattr("sys.getdefaultencoding", lambda: "sysDefaultEncoding")
        assert detect_console_encoding() == "sysDefaultEncoding"
289
lib/python3.11/site-packages/pandas/tests/io/formats/test_css.py
Normal file
289
lib/python3.11/site-packages/pandas/tests/io/formats/test_css.py
Normal file
@ -0,0 +1,289 @@
|
||||
import pytest
|
||||
|
||||
from pandas.errors import CSSWarning
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.formats.css import CSSResolver
|
||||
|
||||
|
||||
def assert_resolves(css, props, inherited=None):
|
||||
resolve = CSSResolver()
|
||||
actual = resolve(css, inherited=inherited)
|
||||
assert props == actual
|
||||
|
||||
|
||||
def assert_same_resolution(css1, css2, inherited=None):
|
||||
resolve = CSSResolver()
|
||||
resolved1 = resolve(css1, inherited=inherited)
|
||||
resolved2 = resolve(css2, inherited=inherited)
|
||||
assert resolved1 == resolved2
|
||||
|
||||
|
||||
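
# Illustrative sketch, not part of the original tests: CSSResolver is a callable
# that expands shorthand declarations into atomic properties, which is what the
# assert_resolves/assert_same_resolution helpers above compare against. The
# helper name is an example only.
def _css_resolver_sketch():
    resolve = CSSResolver()
    props = resolve("margin: 1pt 2pt")
    # two-value shorthand: first value is top/bottom, second is right/left
    assert props["margin-top"] == "1pt"
    assert props["margin-right"] == "2pt"
    assert props["margin-bottom"] == "1pt"
    assert props["margin-left"] == "2pt"
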
@pytest.mark.parametrize(
|
||||
"name,norm,abnorm",
|
||||
[
|
||||
(
|
||||
"whitespace",
|
||||
"hello: world; foo: bar",
|
||||
" \t hello \t :\n world \n ; \n foo: \tbar\n\n",
|
||||
),
|
||||
("case", "hello: world; foo: bar", "Hello: WORLD; foO: bar"),
|
||||
("empty-decl", "hello: world; foo: bar", "; hello: world;; foo: bar;\n; ;"),
|
||||
("empty-list", "", ";"),
|
||||
],
|
||||
)
|
||||
def test_css_parse_normalisation(name, norm, abnorm):
|
||||
assert_same_resolution(norm, abnorm)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"invalid_css,remainder",
|
||||
[
|
||||
# No colon
|
||||
("hello-world", ""),
|
||||
("border-style: solid; hello-world", "border-style: solid"),
|
||||
(
|
||||
"border-style: solid; hello-world; font-weight: bold",
|
||||
"border-style: solid; font-weight: bold",
|
||||
),
|
||||
# Unclosed string fail
|
||||
# Invalid size
|
||||
("font-size: blah", "font-size: 1em"),
|
||||
("font-size: 1a2b", "font-size: 1em"),
|
||||
("font-size: 1e5pt", "font-size: 1em"),
|
||||
("font-size: 1+6pt", "font-size: 1em"),
|
||||
("font-size: 1unknownunit", "font-size: 1em"),
|
||||
("font-size: 10", "font-size: 1em"),
|
||||
("font-size: 10 pt", "font-size: 1em"),
|
||||
# Too many args
|
||||
("border-top: 1pt solid red green", "border-top: 1pt solid green"),
|
||||
],
|
||||
)
|
||||
def test_css_parse_invalid(invalid_css, remainder):
|
||||
with tm.assert_produces_warning(CSSWarning):
|
||||
assert_same_resolution(invalid_css, remainder)
|
||||
|
||||
|
||||
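
# Illustrative sketch, not in the original file: invalid declarations are dropped
# with a CSSWarning instead of raising, and an unparseable font-size falls back
# to 1em (12pt), mirroring the parametrized cases above.
def _css_invalid_declaration_sketch():
    with tm.assert_produces_warning(CSSWarning):
        props = CSSResolver()("font-size: blah")
    assert props["font-size"] == "12pt"  # "blah" resolves like "font-size: 1em"
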
@pytest.mark.parametrize(
|
||||
"shorthand,expansions",
|
||||
[
|
||||
("margin", ["margin-top", "margin-right", "margin-bottom", "margin-left"]),
|
||||
("padding", ["padding-top", "padding-right", "padding-bottom", "padding-left"]),
|
||||
(
|
||||
"border-width",
|
||||
[
|
||||
"border-top-width",
|
||||
"border-right-width",
|
||||
"border-bottom-width",
|
||||
"border-left-width",
|
||||
],
|
||||
),
|
||||
(
|
||||
"border-color",
|
||||
[
|
||||
"border-top-color",
|
||||
"border-right-color",
|
||||
"border-bottom-color",
|
||||
"border-left-color",
|
||||
],
|
||||
),
|
||||
(
|
||||
"border-style",
|
||||
[
|
||||
"border-top-style",
|
||||
"border-right-style",
|
||||
"border-bottom-style",
|
||||
"border-left-style",
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_css_side_shorthands(shorthand, expansions):
|
||||
top, right, bottom, left = expansions
|
||||
|
||||
assert_resolves(
|
||||
f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}
|
||||
)
|
||||
|
||||
assert_resolves(
|
||||
f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}
|
||||
)
|
||||
|
||||
assert_resolves(
|
||||
f"{shorthand}: 1pt 4pt 2pt",
|
||||
{top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"},
|
||||
)
|
||||
|
||||
assert_resolves(
|
||||
f"{shorthand}: 1pt 4pt 2pt 0pt",
|
||||
{top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"},
|
||||
)
|
||||
|
||||
with tm.assert_produces_warning(CSSWarning):
|
||||
assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {})
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"shorthand,sides",
|
||||
[
|
||||
("border-top", ["top"]),
|
||||
("border-right", ["right"]),
|
||||
("border-bottom", ["bottom"]),
|
||||
("border-left", ["left"]),
|
||||
("border", ["top", "right", "bottom", "left"]),
|
||||
],
|
||||
)
|
||||
def test_css_border_shorthand_sides(shorthand, sides):
|
||||
def create_border_dict(sides, color=None, style=None, width=None):
|
||||
resolved = {}
|
||||
for side in sides:
|
||||
if color:
|
||||
resolved[f"border-{side}-color"] = color
|
||||
if style:
|
||||
resolved[f"border-{side}-style"] = style
|
||||
if width:
|
||||
resolved[f"border-{side}-width"] = width
|
||||
return resolved
|
||||
|
||||
assert_resolves(
|
||||
f"{shorthand}: 1pt red solid", create_border_dict(sides, "red", "solid", "1pt")
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prop, expected",
|
||||
[
|
||||
("1pt red solid", ("red", "solid", "1pt")),
|
||||
("red 1pt solid", ("red", "solid", "1pt")),
|
||||
("red solid 1pt", ("red", "solid", "1pt")),
|
||||
("solid 1pt red", ("red", "solid", "1pt")),
|
||||
("red solid", ("red", "solid", "1.500000pt")),
|
||||
# Note: color=black is not CSS conforming
|
||||
# (See https://drafts.csswg.org/css-backgrounds/#border-shorthands)
|
||||
("1pt solid", ("black", "solid", "1pt")),
|
||||
("1pt red", ("red", "none", "1pt")),
|
||||
("red", ("red", "none", "1.500000pt")),
|
||||
("1pt", ("black", "none", "1pt")),
|
||||
("solid", ("black", "solid", "1.500000pt")),
|
||||
# Sizes
|
||||
("1em", ("black", "none", "12pt")),
|
||||
],
|
||||
)
|
||||
def test_css_border_shorthands(prop, expected):
|
||||
color, style, width = expected
|
||||
|
||||
assert_resolves(
|
||||
f"border-left: {prop}",
|
||||
{
|
||||
"border-left-color": color,
|
||||
"border-left-style": style,
|
||||
"border-left-width": width,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"style,inherited,equiv",
|
||||
[
|
||||
("margin: 1px; margin: 2px", "", "margin: 2px"),
|
||||
("margin: 1px", "margin: 2px", "margin: 1px"),
|
||||
("margin: 1px; margin: inherit", "margin: 2px", "margin: 2px"),
|
||||
(
|
||||
"margin: 1px; margin-top: 2px",
|
||||
"",
|
||||
"margin-left: 1px; margin-right: 1px; "
|
||||
"margin-bottom: 1px; margin-top: 2px",
|
||||
),
|
||||
("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"),
|
||||
("margin: 1px", "margin-top: 2px", "margin: 1px"),
|
||||
(
|
||||
"margin: 1px; margin-top: inherit",
|
||||
"margin: 2px",
|
||||
"margin: 1px; margin-top: 2px",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_css_precedence(style, inherited, equiv):
|
||||
resolve = CSSResolver()
|
||||
inherited_props = resolve(inherited)
|
||||
style_props = resolve(style, inherited=inherited_props)
|
||||
equiv_props = resolve(equiv)
|
||||
assert style_props == equiv_props
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"style,equiv",
|
||||
[
|
||||
(
|
||||
"margin: 1px; margin-top: inherit",
|
||||
"margin-bottom: 1px; margin-right: 1px; margin-left: 1px",
|
||||
),
|
||||
("margin-top: inherit", ""),
|
||||
("margin-top: initial", ""),
|
||||
],
|
||||
)
|
||||
def test_css_none_absent(style, equiv):
|
||||
assert_same_resolution(style, equiv)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"size,resolved",
|
||||
[
|
||||
("xx-small", "6pt"),
|
||||
("x-small", f"{7.5:f}pt"),
|
||||
("small", f"{9.6:f}pt"),
|
||||
("medium", "12pt"),
|
||||
("large", f"{13.5:f}pt"),
|
||||
("x-large", "18pt"),
|
||||
("xx-large", "24pt"),
|
||||
("8px", "6pt"),
|
||||
("1.25pc", "15pt"),
|
||||
(".25in", "18pt"),
|
||||
("02.54cm", "72pt"),
|
||||
("25.4mm", "72pt"),
|
||||
("101.6q", "72pt"),
|
||||
("101.6q", "72pt"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("relative_to", [None, "16pt"]) # invariant to inherited size
|
||||
def test_css_absolute_font_size(size, relative_to, resolved):
|
||||
if relative_to is None:
|
||||
inherited = None
|
||||
else:
|
||||
inherited = {"font-size": relative_to}
|
||||
assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"size,relative_to,resolved",
|
||||
[
|
||||
("1em", None, "12pt"),
|
||||
("1.0em", None, "12pt"),
|
||||
("1.25em", None, "15pt"),
|
||||
("1em", "16pt", "16pt"),
|
||||
("1.0em", "16pt", "16pt"),
|
||||
("1.25em", "16pt", "20pt"),
|
||||
("1rem", "16pt", "12pt"),
|
||||
("1.0rem", "16pt", "12pt"),
|
||||
("1.25rem", "16pt", "15pt"),
|
||||
("100%", None, "12pt"),
|
||||
("125%", None, "15pt"),
|
||||
("100%", "16pt", "16pt"),
|
||||
("125%", "16pt", "20pt"),
|
||||
("2ex", None, "12pt"),
|
||||
("2.0ex", None, "12pt"),
|
||||
("2.50ex", None, "15pt"),
|
||||
("inherit", "16pt", "16pt"),
|
||||
("smaller", None, "10pt"),
|
||||
("smaller", "18pt", "15pt"),
|
||||
("larger", None, f"{14.4:f}pt"),
|
||||
("larger", "15pt", "18pt"),
|
||||
],
|
||||
)
|
||||
def test_css_relative_font_size(size, relative_to, resolved):
|
||||
if relative_to is None:
|
||||
inherited = None
|
||||
else:
|
||||
inherited = {"font-size": relative_to}
|
||||
assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited)
|
||||
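
# Illustrative sketch, not part of the original tests: relative units resolve
# against the inherited font-size and fall back to the 12pt default when nothing
# is inherited, as the parametrized cases above spell out.
def _relative_font_size_sketch():
    resolve = CSSResolver()
    assert resolve("font-size: 125%")["font-size"] == "15pt"  # 125% of the 12pt default
    inherited = {"font-size": "16pt"}
    assert resolve("font-size: 1em", inherited=inherited)["font-size"] == "16pt"
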
@ -0,0 +1,254 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
reset_option,
|
||||
set_eng_float_format,
|
||||
)
|
||||
|
||||
from pandas.io.formats.format import EngFormatter
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_float_format():
|
||||
yield
|
||||
reset_option("display.float_format")
|
||||
|
||||
|
||||
class TestEngFormatter:
|
||||
def test_eng_float_formatter2(self, float_frame):
|
||||
df = float_frame
|
||||
df.loc[5] = 0
|
||||
|
||||
set_eng_float_format()
|
||||
repr(df)
|
||||
|
||||
set_eng_float_format(use_eng_prefix=True)
|
||||
repr(df)
|
||||
|
||||
set_eng_float_format(accuracy=0)
|
||||
repr(df)
|
||||
|
||||
def test_eng_float_formatter(self):
|
||||
df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]})
|
||||
|
||||
set_eng_float_format()
|
||||
result = df.to_string()
|
||||
expected = (
|
||||
" A\n"
|
||||
"0 1.410E+00\n"
|
||||
"1 141.000E+00\n"
|
||||
"2 14.100E+03\n"
|
||||
"3 1.410E+06"
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
set_eng_float_format(use_eng_prefix=True)
|
||||
result = df.to_string()
|
||||
expected = " A\n0 1.410\n1 141.000\n2 14.100k\n3 1.410M"
|
||||
assert result == expected
|
||||
|
||||
set_eng_float_format(accuracy=0)
|
||||
result = df.to_string()
|
||||
expected = " A\n0 1E+00\n1 141E+00\n2 14E+03\n3 1E+06"
|
||||
assert result == expected
|
||||
|
||||
def compare(self, formatter, input, output):
|
||||
formatted_input = formatter(input)
|
||||
assert formatted_input == output
|
||||
|
||||
def compare_all(self, formatter, in_out):
|
||||
"""
|
||||
Parameters:
|
||||
-----------
|
||||
formatter: EngFormatter under test
|
||||
in_out: list of tuples. Each tuple = (number, expected_formatting)
|
||||
|
||||
It is tested if 'formatter(number) == expected_formatting'.
|
||||
*number* should be >= 0 because formatter(-number) == fmt is also
|
||||
tested. *fmt* is derived from *expected_formatting*
|
||||
"""
|
||||
for input, output in in_out:
|
||||
self.compare(formatter, input, output)
|
||||
self.compare(formatter, -input, "-" + output[1:])
|
||||
|
||||
def test_exponents_with_eng_prefix(self):
|
||||
formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
|
||||
f = np.sqrt(2)
|
||||
in_out = [
|
||||
(f * 10**-24, " 1.414y"),
|
||||
(f * 10**-23, " 14.142y"),
|
||||
(f * 10**-22, " 141.421y"),
|
||||
(f * 10**-21, " 1.414z"),
|
||||
(f * 10**-20, " 14.142z"),
|
||||
(f * 10**-19, " 141.421z"),
|
||||
(f * 10**-18, " 1.414a"),
|
||||
(f * 10**-17, " 14.142a"),
|
||||
(f * 10**-16, " 141.421a"),
|
||||
(f * 10**-15, " 1.414f"),
|
||||
(f * 10**-14, " 14.142f"),
|
||||
(f * 10**-13, " 141.421f"),
|
||||
(f * 10**-12, " 1.414p"),
|
||||
(f * 10**-11, " 14.142p"),
|
||||
(f * 10**-10, " 141.421p"),
|
||||
(f * 10**-9, " 1.414n"),
|
||||
(f * 10**-8, " 14.142n"),
|
||||
(f * 10**-7, " 141.421n"),
|
||||
(f * 10**-6, " 1.414u"),
|
||||
(f * 10**-5, " 14.142u"),
|
||||
(f * 10**-4, " 141.421u"),
|
||||
(f * 10**-3, " 1.414m"),
|
||||
(f * 10**-2, " 14.142m"),
|
||||
(f * 10**-1, " 141.421m"),
|
||||
(f * 10**0, " 1.414"),
|
||||
(f * 10**1, " 14.142"),
|
||||
(f * 10**2, " 141.421"),
|
||||
(f * 10**3, " 1.414k"),
|
||||
(f * 10**4, " 14.142k"),
|
||||
(f * 10**5, " 141.421k"),
|
||||
(f * 10**6, " 1.414M"),
|
||||
(f * 10**7, " 14.142M"),
|
||||
(f * 10**8, " 141.421M"),
|
||||
(f * 10**9, " 1.414G"),
|
||||
(f * 10**10, " 14.142G"),
|
||||
(f * 10**11, " 141.421G"),
|
||||
(f * 10**12, " 1.414T"),
|
||||
(f * 10**13, " 14.142T"),
|
||||
(f * 10**14, " 141.421T"),
|
||||
(f * 10**15, " 1.414P"),
|
||||
(f * 10**16, " 14.142P"),
|
||||
(f * 10**17, " 141.421P"),
|
||||
(f * 10**18, " 1.414E"),
|
||||
(f * 10**19, " 14.142E"),
|
||||
(f * 10**20, " 141.421E"),
|
||||
(f * 10**21, " 1.414Z"),
|
||||
(f * 10**22, " 14.142Z"),
|
||||
(f * 10**23, " 141.421Z"),
|
||||
(f * 10**24, " 1.414Y"),
|
||||
(f * 10**25, " 14.142Y"),
|
||||
(f * 10**26, " 141.421Y"),
|
||||
]
|
||||
self.compare_all(formatter, in_out)
|
||||
|
||||
def test_exponents_without_eng_prefix(self):
|
||||
formatter = EngFormatter(accuracy=4, use_eng_prefix=False)
|
||||
f = np.pi
|
||||
in_out = [
|
||||
(f * 10**-24, " 3.1416E-24"),
|
||||
(f * 10**-23, " 31.4159E-24"),
|
||||
(f * 10**-22, " 314.1593E-24"),
|
||||
(f * 10**-21, " 3.1416E-21"),
|
||||
(f * 10**-20, " 31.4159E-21"),
|
||||
(f * 10**-19, " 314.1593E-21"),
|
||||
(f * 10**-18, " 3.1416E-18"),
|
||||
(f * 10**-17, " 31.4159E-18"),
|
||||
(f * 10**-16, " 314.1593E-18"),
|
||||
(f * 10**-15, " 3.1416E-15"),
|
||||
(f * 10**-14, " 31.4159E-15"),
|
||||
(f * 10**-13, " 314.1593E-15"),
|
||||
(f * 10**-12, " 3.1416E-12"),
|
||||
(f * 10**-11, " 31.4159E-12"),
|
||||
(f * 10**-10, " 314.1593E-12"),
|
||||
(f * 10**-9, " 3.1416E-09"),
|
||||
(f * 10**-8, " 31.4159E-09"),
|
||||
(f * 10**-7, " 314.1593E-09"),
|
||||
(f * 10**-6, " 3.1416E-06"),
|
||||
(f * 10**-5, " 31.4159E-06"),
|
||||
(f * 10**-4, " 314.1593E-06"),
|
||||
(f * 10**-3, " 3.1416E-03"),
|
||||
(f * 10**-2, " 31.4159E-03"),
|
||||
(f * 10**-1, " 314.1593E-03"),
|
||||
(f * 10**0, " 3.1416E+00"),
|
||||
(f * 10**1, " 31.4159E+00"),
|
||||
(f * 10**2, " 314.1593E+00"),
|
||||
(f * 10**3, " 3.1416E+03"),
|
||||
(f * 10**4, " 31.4159E+03"),
|
||||
(f * 10**5, " 314.1593E+03"),
|
||||
(f * 10**6, " 3.1416E+06"),
|
||||
(f * 10**7, " 31.4159E+06"),
|
||||
(f * 10**8, " 314.1593E+06"),
|
||||
(f * 10**9, " 3.1416E+09"),
|
||||
(f * 10**10, " 31.4159E+09"),
|
||||
(f * 10**11, " 314.1593E+09"),
|
||||
(f * 10**12, " 3.1416E+12"),
|
||||
(f * 10**13, " 31.4159E+12"),
|
||||
(f * 10**14, " 314.1593E+12"),
|
||||
(f * 10**15, " 3.1416E+15"),
|
||||
(f * 10**16, " 31.4159E+15"),
|
||||
(f * 10**17, " 314.1593E+15"),
|
||||
(f * 10**18, " 3.1416E+18"),
|
||||
(f * 10**19, " 31.4159E+18"),
|
||||
(f * 10**20, " 314.1593E+18"),
|
||||
(f * 10**21, " 3.1416E+21"),
|
||||
(f * 10**22, " 31.4159E+21"),
|
||||
(f * 10**23, " 314.1593E+21"),
|
||||
(f * 10**24, " 3.1416E+24"),
|
||||
(f * 10**25, " 31.4159E+24"),
|
||||
(f * 10**26, " 314.1593E+24"),
|
||||
]
|
||||
self.compare_all(formatter, in_out)
|
||||
|
||||
def test_rounding(self):
|
||||
formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
|
||||
in_out = [
|
||||
(5.55555, " 5.556"),
|
||||
(55.5555, " 55.556"),
|
||||
(555.555, " 555.555"),
|
||||
(5555.55, " 5.556k"),
|
||||
(55555.5, " 55.556k"),
|
||||
(555555, " 555.555k"),
|
||||
]
|
||||
self.compare_all(formatter, in_out)
|
||||
|
||||
formatter = EngFormatter(accuracy=1, use_eng_prefix=True)
|
||||
in_out = [
|
||||
(5.55555, " 5.6"),
|
||||
(55.5555, " 55.6"),
|
||||
(555.555, " 555.6"),
|
||||
(5555.55, " 5.6k"),
|
||||
(55555.5, " 55.6k"),
|
||||
(555555, " 555.6k"),
|
||||
]
|
||||
self.compare_all(formatter, in_out)
|
||||
|
||||
formatter = EngFormatter(accuracy=0, use_eng_prefix=True)
|
||||
in_out = [
|
||||
(5.55555, " 6"),
|
||||
(55.5555, " 56"),
|
||||
(555.555, " 556"),
|
||||
(5555.55, " 6k"),
|
||||
(55555.5, " 56k"),
|
||||
(555555, " 556k"),
|
||||
]
|
||||
self.compare_all(formatter, in_out)
|
||||
|
||||
formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
|
||||
result = formatter(0)
|
||||
assert result == " 0.000"
|
||||
|
||||
def test_nan(self):
|
||||
# Issue #11981
|
||||
|
||||
formatter = EngFormatter(accuracy=1, use_eng_prefix=True)
|
||||
result = formatter(np.nan)
|
||||
assert result == "NaN"
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [1.5, 10.3, 20.5],
|
||||
"b": [50.3, 60.67, 70.12],
|
||||
"c": [100.2, 101.33, 120.33],
|
||||
}
|
||||
)
|
||||
pt = df.pivot_table(values="a", index="b", columns="c")
|
||||
set_eng_float_format(accuracy=1)
|
||||
result = pt.to_string()
|
||||
assert "NaN" in result
|
||||
|
||||
def test_inf(self):
|
||||
# Issue #11981
|
||||
|
||||
formatter = EngFormatter(accuracy=1, use_eng_prefix=True)
|
||||
result = formatter(np.inf)
|
||||
assert result == "inf"
|
||||
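
# Illustrative sketch, not part of the original suite: EngFormatter renders a
# float either with an SI prefix or with an exponent that is a multiple of 3,
# which is what the exhaustive tables above enumerate. The helper name is an
# example only.
def _eng_formatter_sketch():
    with_prefix = EngFormatter(accuracy=3, use_eng_prefix=True)
    without_prefix = EngFormatter(accuracy=3, use_eng_prefix=False)
    assert with_prefix(1_414_000) == " 1.414M"
    assert without_prefix(1_414_000) == " 1.414E+06"
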
2289
lib/python3.11/site-packages/pandas/tests/io/formats/test_format.py
Normal file
2289
lib/python3.11/site-packages/pandas/tests/io/formats/test_format.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,90 @@
import numpy as np

import pandas._config.config as cf

from pandas import (
    DataFrame,
    MultiIndex,
)


class TestTableSchemaRepr:
    def test_publishes(self, ip):
        ipython = ip.instance(config=ip.config)
        df = DataFrame({"A": [1, 2]})
        objects = [df["A"], df]  # dataframe / series
        expected_keys = [
            {"text/plain", "application/vnd.dataresource+json"},
            {"text/plain", "text/html", "application/vnd.dataresource+json"},
        ]

        opt = cf.option_context("display.html.table_schema", True)
        last_obj = None
        for obj, expected in zip(objects, expected_keys):
            last_obj = obj
            with opt:
                formatted = ipython.display_formatter.format(obj)
            assert set(formatted[0].keys()) == expected

        with_latex = cf.option_context("styler.render.repr", "latex")

        with opt, with_latex:
            formatted = ipython.display_formatter.format(last_obj)

        expected = {
            "text/plain",
            "text/html",
            "text/latex",
            "application/vnd.dataresource+json",
        }
        assert set(formatted[0].keys()) == expected

    def test_publishes_not_implemented(self, ip):
        # column MultiIndex
        # GH#15996
        midx = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]])
        df = DataFrame(
            np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx
        )

        opt = cf.option_context("display.html.table_schema", True)

        with opt:
            formatted = ip.instance(config=ip.config).display_formatter.format(df)

        expected = {"text/plain", "text/html"}
        assert set(formatted[0].keys()) == expected

    def test_config_on(self):
        df = DataFrame({"A": [1, 2]})
        with cf.option_context("display.html.table_schema", True):
            result = df._repr_data_resource_()

        assert result is not None

    def test_config_default_off(self):
        df = DataFrame({"A": [1, 2]})
        with cf.option_context("display.html.table_schema", False):
            result = df._repr_data_resource_()

        assert result is None

    def test_enable_data_resource_formatter(self, ip):
        # GH#10491
        formatters = ip.instance(config=ip.config).display_formatter.formatters
        mimetype = "application/vnd.dataresource+json"

        with cf.option_context("display.html.table_schema", True):
            assert "application/vnd.dataresource+json" in formatters
            assert formatters[mimetype].enabled

        # still there, just disabled
        assert "application/vnd.dataresource+json" in formatters
        assert not formatters[mimetype].enabled

        # able to re-set
        with cf.option_context("display.html.table_schema", True):
            assert "application/vnd.dataresource+json" in formatters
            assert formatters[mimetype].enabled
            # smoke test that it works
            ip.instance(config=ip.config).display_formatter.format(cf)
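
# Illustrative sketch, not in the original file: the Table Schema repr is opt-in.
# Turning on "display.html.table_schema" makes DataFrame produce a data-resource
# payload for notebook frontends, which the config tests above rely on. The
# helper name is an example only.
def _table_schema_option_sketch():
    df = DataFrame({"A": [1, 2]})
    with cf.option_context("display.html.table_schema", True):
        payload = df._repr_data_resource_()
    assert payload is not None
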
@ -0,0 +1,129 @@
|
||||
# Note! This file is aimed specifically at pandas.io.formats.printing utility
|
||||
# functions, not the general printing of pandas objects.
|
||||
import string
|
||||
|
||||
import pandas._config.config as cf
|
||||
|
||||
from pandas.io.formats import printing
|
||||
|
||||
|
||||
def test_adjoin():
|
||||
data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]]
|
||||
expected = "a dd ggg\nb ee hhh\nc ff iii"
|
||||
|
||||
adjoined = printing.adjoin(2, *data)
|
||||
|
||||
assert adjoined == expected
|
||||
|
||||
|
||||
class TestPPrintThing:
|
||||
def test_repr_binary_type(self):
|
||||
letters = string.ascii_letters
|
||||
try:
|
||||
raw = bytes(letters, encoding=cf.get_option("display.encoding"))
|
||||
except TypeError:
|
||||
raw = bytes(letters)
|
||||
b = str(raw.decode("utf-8"))
|
||||
res = printing.pprint_thing(b, quote_strings=True)
|
||||
assert res == repr(b)
|
||||
res = printing.pprint_thing(b, quote_strings=False)
|
||||
assert res == b
|
||||
|
||||
def test_repr_obeys_max_seq_limit(self):
|
||||
with cf.option_context("display.max_seq_items", 2000):
|
||||
assert len(printing.pprint_thing(list(range(1000)))) > 1000
|
||||
|
||||
with cf.option_context("display.max_seq_items", 5):
|
||||
assert len(printing.pprint_thing(list(range(1000)))) < 100
|
||||
|
||||
with cf.option_context("display.max_seq_items", 1):
|
||||
assert len(printing.pprint_thing(list(range(1000)))) < 9
|
||||
|
||||
def test_repr_set(self):
|
||||
assert printing.pprint_thing({1}) == "{1}"
|
||||
|
||||
|
||||
class TestFormatBase:
|
||||
def test_adjoin(self):
|
||||
data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]]
|
||||
expected = "a dd ggg\nb ee hhh\nc ff iii"
|
||||
|
||||
adjoined = printing.adjoin(2, *data)
|
||||
|
||||
assert adjoined == expected
|
||||
|
||||
def test_adjoin_unicode(self):
|
||||
data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "hhh", "いいい"]]
|
||||
expected = "あ dd ggg\nb ええ hhh\nc ff いいい"
|
||||
adjoined = printing.adjoin(2, *data)
|
||||
assert adjoined == expected
|
||||
|
||||
adj = printing._EastAsianTextAdjustment()
|
||||
|
||||
expected = """あ dd ggg
|
||||
b ええ hhh
|
||||
c ff いいい"""
|
||||
|
||||
adjoined = adj.adjoin(2, *data)
|
||||
assert adjoined == expected
|
||||
cols = adjoined.split("\n")
|
||||
assert adj.len(cols[0]) == 13
|
||||
assert adj.len(cols[1]) == 13
|
||||
assert adj.len(cols[2]) == 16
|
||||
|
||||
expected = """あ dd ggg
|
||||
b ええ hhh
|
||||
c ff いいい"""
|
||||
|
||||
adjoined = adj.adjoin(7, *data)
|
||||
assert adjoined == expected
|
||||
cols = adjoined.split("\n")
|
||||
assert adj.len(cols[0]) == 23
|
||||
assert adj.len(cols[1]) == 23
|
||||
assert adj.len(cols[2]) == 26
|
||||
|
||||
def test_justify(self):
|
||||
adj = printing._EastAsianTextAdjustment()
|
||||
|
||||
def just(x, *args, **kwargs):
|
||||
# wrapper to test single str
|
||||
return adj.justify([x], *args, **kwargs)[0]
|
||||
|
||||
assert just("abc", 5, mode="left") == "abc "
|
||||
assert just("abc", 5, mode="center") == " abc "
|
||||
assert just("abc", 5, mode="right") == " abc"
|
||||
assert just("abc", 5, mode="left") == "abc "
|
||||
assert just("abc", 5, mode="center") == " abc "
|
||||
assert just("abc", 5, mode="right") == " abc"
|
||||
|
||||
assert just("パンダ", 5, mode="left") == "パンダ"
|
||||
assert just("パンダ", 5, mode="center") == "パンダ"
|
||||
assert just("パンダ", 5, mode="right") == "パンダ"
|
||||
|
||||
assert just("パンダ", 10, mode="left") == "パンダ "
|
||||
assert just("パンダ", 10, mode="center") == " パンダ "
|
||||
assert just("パンダ", 10, mode="right") == " パンダ"
|
||||
|
||||
def test_east_asian_len(self):
|
||||
adj = printing._EastAsianTextAdjustment()
|
||||
|
||||
assert adj.len("abc") == 3
|
||||
assert adj.len("abc") == 3
|
||||
|
||||
assert adj.len("パンダ") == 6
|
||||
assert adj.len("パンダ") == 5
|
||||
assert adj.len("パンダpanda") == 11
|
||||
assert adj.len("パンダpanda") == 10
|
||||
|
||||
def test_ambiguous_width(self):
|
||||
adj = printing._EastAsianTextAdjustment()
|
||||
assert adj.len("¡¡ab") == 4
|
||||
|
||||
with cf.option_context("display.unicode.ambiguous_as_wide", True):
|
||||
adj = printing._EastAsianTextAdjustment()
|
||||
assert adj.len("¡¡ab") == 6
|
||||
|
||||
data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]]
|
||||
expected = "あ dd ggg \nb ええ ¡¡ab\nc ff いいい"
|
||||
adjoined = adj.adjoin(2, *data)
|
||||
assert adjoined == expected
|
||||
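
# Illustrative sketch, not part of the original tests: adjoin pads every column
# to its widest entry and joins the rows with the requested spacing; only loose
# shape properties are asserted here to keep the example implementation-agnostic.
def _adjoin_sketch():
    out = printing.adjoin(2, ["a", "bb"], ["ccc", "d"])
    first, second = out.split("\n")
    assert first.startswith("a") and "ccc" in first
    assert second.startswith("bb") and "d" in second
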
@ -0,0 +1,758 @@
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
from zipfile import ZipFile
|
||||
|
||||
from _csv import Error
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
compat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestToCSV:
|
||||
def test_to_csv_with_single_column(self):
|
||||
# see gh-18676, https://bugs.python.org/issue32255
|
||||
#
|
||||
# Python's CSV library adds an extraneous '""'
|
||||
# before the newline when the NaN-value is in
|
||||
# the first row. Otherwise, only the newline
|
||||
# character is added. This behavior is inconsistent
|
||||
# and was patched in https://bugs.python.org/pull_request4672.
|
||||
df1 = DataFrame([None, 1])
|
||||
expected1 = """\
|
||||
""
|
||||
1.0
|
||||
"""
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
df1.to_csv(path, header=None, index=None)
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected1
|
||||
|
||||
df2 = DataFrame([1, None])
|
||||
expected2 = """\
|
||||
1.0
|
||||
""
|
||||
"""
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
df2.to_csv(path, header=None, index=None)
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected2
|
||||
|
||||
def test_to_csv_default_encoding(self):
|
||||
# GH17097
|
||||
df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]})
|
||||
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
# the default to_csv encoding is utf-8.
|
||||
df.to_csv(path)
|
||||
tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
|
||||
|
||||
def test_to_csv_quotechar(self):
|
||||
df = DataFrame({"col": [1, 2]})
|
||||
expected = """\
|
||||
"","col"
|
||||
"0","1"
|
||||
"1","2"
|
||||
"""
|
||||
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
df.to_csv(path, quoting=1) # 1=QUOTE_ALL
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected
|
||||
|
||||
expected = """\
|
||||
$$,$col$
|
||||
$0$,$1$
|
||||
$1$,$2$
|
||||
"""
|
||||
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
df.to_csv(path, quoting=1, quotechar="$")
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected
|
||||
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
with pytest.raises(TypeError, match="quotechar"):
|
||||
df.to_csv(path, quoting=1, quotechar=None)
|
||||
|
||||
def test_to_csv_doublequote(self):
|
||||
df = DataFrame({"col": ['a"a', '"bb"']})
|
||||
expected = '''\
|
||||
"","col"
|
||||
"0","a""a"
|
||||
"1","""bb"""
|
||||
'''
|
||||
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected
|
||||
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
with pytest.raises(Error, match="escapechar"):
|
||||
df.to_csv(path, doublequote=False) # no escapechar set
|
||||
|
||||
def test_to_csv_escapechar(self):
|
||||
df = DataFrame({"col": ['a"a', '"bb"']})
|
||||
expected = """\
|
||||
"","col"
|
||||
"0","a\\"a"
|
||||
"1","\\"bb\\""
|
||||
"""
|
||||
|
||||
with tm.ensure_clean("test.csv") as path: # QUOTE_ALL
|
||||
df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected
|
||||
|
||||
df = DataFrame({"col": ["a,a", ",bb,"]})
|
||||
expected = """\
|
||||
,col
|
||||
0,a\\,a
|
||||
1,\\,bb\\,
|
||||
"""
|
||||
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected
|
||||
|
||||
def test_csv_to_string(self):
|
||||
df = DataFrame({"col": [1, 2]})
|
||||
expected_rows = [",col", "0,1", "1,2"]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df.to_csv() == expected
|
||||
|
||||
def test_to_csv_decimal(self):
|
||||
# see gh-781
|
||||
df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]})
|
||||
|
||||
expected_rows = [",col1,col2,col3", "0,1,a,10.1"]
|
||||
expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df.to_csv() == expected_default
|
||||
|
||||
expected_rows = [";col1;col2;col3", "0;1;a;10,1"]
|
||||
expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df.to_csv(decimal=",", sep=";") == expected_european_excel
|
||||
|
||||
expected_rows = [",col1,col2,col3", "0,1,a,10.10"]
|
||||
expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df.to_csv(float_format="%.2f") == expected_float_format_default
|
||||
|
||||
expected_rows = [";col1;col2;col3", "0;1;a;10,10"]
|
||||
expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert (
|
||||
df.to_csv(decimal=",", sep=";", float_format="%.2f")
|
||||
== expected_float_format
|
||||
)
|
||||
|
||||
# see gh-11553: testing if decimal is taken into account for '0.0'
|
||||
df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1})
|
||||
|
||||
expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df.to_csv(index=False, decimal="^") == expected
|
||||
|
||||
# same but for an index
|
||||
assert df.set_index("a").to_csv(decimal="^") == expected
|
||||
|
||||
# same for a multi-index
|
||||
assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected
|
||||
|
||||
def test_to_csv_float_format(self):
|
||||
# testing if float_format is taken into account for the index
|
||||
# GH 11553
|
||||
df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1})
|
||||
|
||||
expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df.set_index("a").to_csv(float_format="%.2f") == expected
|
||||
|
||||
# same for a multi-index
|
||||
assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected
|
||||
|
||||
def test_to_csv_na_rep(self):
|
||||
# see gh-11553
|
||||
#
|
||||
# Testing if NaN values are correctly represented in the index.
|
||||
df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]})
|
||||
expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
|
||||
assert df.set_index("a").to_csv(na_rep="_") == expected
|
||||
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
|
||||
|
||||
# now with an index containing only NaNs
|
||||
df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]})
|
||||
expected_rows = ["a,b,c", "_,0,2", "_,1,3"]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
|
||||
assert df.set_index("a").to_csv(na_rep="_") == expected
|
||||
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
|
||||
|
||||
# check if na_rep parameter does not break anything when no NaN
|
||||
df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]})
|
||||
expected_rows = ["a,b,c", "0,0,2", "0,1,3"]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
|
||||
assert df.set_index("a").to_csv(na_rep="_") == expected
|
||||
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
|
||||
|
||||
csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
|
||||
expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
|
||||
assert expected == csv
|
||||
|
||||
def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype):
|
||||
# GH 29975
|
||||
# Make sure full na_rep shows up when a dtype is provided
|
||||
expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
|
||||
csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv(
|
||||
na_rep="ZZZZZ"
|
||||
)
|
||||
assert expected == csv
|
||||
|
||||
def test_to_csv_date_format(self):
|
||||
# GH 10209
|
||||
df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")})
|
||||
df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")})
|
||||
|
||||
expected_rows = [
|
||||
",A",
|
||||
"0,2013-01-01 00:00:00",
|
||||
"1,2013-01-01 00:00:01",
|
||||
"2,2013-01-01 00:00:02",
|
||||
"3,2013-01-01 00:00:03",
|
||||
"4,2013-01-01 00:00:04",
|
||||
]
|
||||
expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df_sec.to_csv() == expected_default_sec
|
||||
|
||||
expected_rows = [
|
||||
",A",
|
||||
"0,2013-01-01 00:00:00",
|
||||
"1,2013-01-02 00:00:00",
|
||||
"2,2013-01-03 00:00:00",
|
||||
"3,2013-01-04 00:00:00",
|
||||
"4,2013-01-05 00:00:00",
|
||||
]
|
||||
expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day
|
||||
|
||||
expected_rows = [
|
||||
",A",
|
||||
"0,2013-01-01",
|
||||
"1,2013-01-01",
|
||||
"2,2013-01-01",
|
||||
"3,2013-01-01",
|
||||
"4,2013-01-01",
|
||||
]
|
||||
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
|
||||
|
||||
expected_rows = [
|
||||
",A",
|
||||
"0,2013-01-01",
|
||||
"1,2013-01-02",
|
||||
"2,2013-01-03",
|
||||
"3,2013-01-04",
|
||||
"4,2013-01-05",
|
||||
]
|
||||
expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df_day.to_csv() == expected_default_day
|
||||
assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day
|
||||
|
||||
# see gh-7791
|
||||
#
|
||||
# Testing if date_format parameter is taken into account
|
||||
# for multi-indexed DataFrames.
|
||||
df_sec["B"] = 0
|
||||
df_sec["C"] = 1
|
||||
|
||||
expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
|
||||
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
|
||||
df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
|
||||
assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
|
||||
|
||||
def test_to_csv_different_datetime_formats(self):
|
||||
# GH#21734
|
||||
df = DataFrame(
|
||||
{
|
||||
"date": pd.to_datetime("1970-01-01"),
|
||||
"datetime": pd.date_range("1970-01-01", periods=2, freq="h"),
|
||||
}
|
||||
)
|
||||
expected_rows = [
|
||||
"date,datetime",
|
||||
"1970-01-01,1970-01-01 00:00:00",
|
||||
"1970-01-01,1970-01-01 01:00:00",
|
||||
]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert df.to_csv(index=False) == expected
|
||||
|
||||
def test_to_csv_date_format_in_categorical(self):
|
||||
# GH#40754
|
||||
ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d"))
|
||||
ser = ser.astype("category")
|
||||
expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
|
||||
assert ser.to_csv(index=False) == expected
|
||||
|
||||
ser = pd.Series(
|
||||
pd.date_range(
|
||||
start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin"
|
||||
).append(pd.DatetimeIndex([pd.NaT]))
|
||||
)
|
||||
ser = ser.astype("category")
|
||||
assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected
|
||||
|
||||
def test_to_csv_float_ea_float_format(self):
|
||||
# GH#45991
|
||||
df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
|
||||
df["a"] = df["a"].astype("Float64")
|
||||
result = df.to_csv(index=False, float_format="%.5f")
|
||||
expected = tm.convert_rows_list_to_csv_str(
|
||||
["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"]
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
def test_to_csv_float_ea_no_float_format(self):
|
||||
# GH#45991
|
||||
df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
|
||||
df["a"] = df["a"].astype("Float64")
|
||||
result = df.to_csv(index=False)
|
||||
expected = tm.convert_rows_list_to_csv_str(
|
||||
["a,b", "1.1,c", "2.02,c", ",c", "6.000006,c"]
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
def test_to_csv_multi_index(self):
|
||||
# see gh-6618
|
||||
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
|
||||
|
||||
exp_rows = [",1", ",2", "0,1"]
|
||||
exp = tm.convert_rows_list_to_csv_str(exp_rows)
|
||||
assert df.to_csv() == exp
|
||||
|
||||
exp_rows = ["1", "2", "1"]
|
||||
exp = tm.convert_rows_list_to_csv_str(exp_rows)
|
||||
assert df.to_csv(index=False) == exp
|
||||
|
||||
df = DataFrame(
|
||||
[1],
|
||||
columns=pd.MultiIndex.from_arrays([[1], [2]]),
|
||||
index=pd.MultiIndex.from_arrays([[1], [2]]),
|
||||
)
|
||||
|
||||
exp_rows = [",,1", ",,2", "1,2,1"]
|
||||
exp = tm.convert_rows_list_to_csv_str(exp_rows)
|
||||
assert df.to_csv() == exp
|
||||
|
||||
exp_rows = ["1", "2", "1"]
|
||||
exp = tm.convert_rows_list_to_csv_str(exp_rows)
|
||||
assert df.to_csv(index=False) == exp
|
||||
|
||||
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]]))
|
||||
|
||||
exp_rows = [",foo", ",bar", "0,1"]
|
||||
exp = tm.convert_rows_list_to_csv_str(exp_rows)
|
||||
assert df.to_csv() == exp
|
||||
|
||||
exp_rows = ["foo", "bar", "1"]
|
||||
exp = tm.convert_rows_list_to_csv_str(exp_rows)
|
||||
assert df.to_csv(index=False) == exp
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ind,expected",
|
||||
[
|
||||
(
|
||||
pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
|
||||
"x,data\n1.0,1\n",
|
||||
),
|
||||
(
|
||||
pd.MultiIndex(
|
||||
levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"]
|
||||
),
|
||||
"x,y,data\n1.0,2.0,1\n",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series):
|
||||
# see gh-19589
|
||||
obj = frame_or_series(pd.Series([1], ind, name="data"))
|
||||
|
||||
result = obj.to_csv(lineterminator="\n", header=True)
|
||||
assert result == expected
|
||||
|
||||
def test_to_csv_string_array_ascii(self):
|
||||
# GH 10813
|
||||
str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
|
||||
df = DataFrame(str_array)
|
||||
expected_ascii = """\
|
||||
,names
|
||||
0,"['foo', 'bar']"
|
||||
1,"['baz', 'qux']"
|
||||
"""
|
||||
with tm.ensure_clean("str_test.csv") as path:
|
||||
df.to_csv(path, encoding="ascii")
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected_ascii
|
||||
|
||||
def test_to_csv_string_array_utf8(self):
|
||||
# GH 10813
|
||||
str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
|
||||
df = DataFrame(str_array)
|
||||
expected_utf8 = """\
|
||||
,names
|
||||
0,"['foo', 'bar']"
|
||||
1,"['baz', 'qux']"
|
||||
"""
|
||||
with tm.ensure_clean("unicode_test.csv") as path:
|
||||
df.to_csv(path, encoding="utf-8")
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected_utf8
|
||||
|
||||
def test_to_csv_string_with_lf(self):
|
||||
# GH 20353
|
||||
data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]}
|
||||
df = DataFrame(data)
|
||||
with tm.ensure_clean("lf_test.csv") as path:
|
||||
# case 1: The default line terminator(=os.linesep)(PR 21406)
|
||||
os_linesep = os.linesep.encode("utf-8")
|
||||
expected_noarg = (
|
||||
b"int,str_lf"
|
||||
+ os_linesep
|
||||
+ b"1,abc"
|
||||
+ os_linesep
|
||||
+ b'2,"d\nef"'
|
||||
+ os_linesep
|
||||
+ b'3,"g\nh\n\ni"'
|
||||
+ os_linesep
|
||||
)
|
||||
df.to_csv(path, index=False)
|
||||
with open(path, "rb") as f:
|
||||
assert f.read() == expected_noarg
|
||||
with tm.ensure_clean("lf_test.csv") as path:
|
||||
# case 2: LF as line terminator
|
||||
expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n'
|
||||
df.to_csv(path, lineterminator="\n", index=False)
|
||||
with open(path, "rb") as f:
|
||||
assert f.read() == expected_lf
|
||||
with tm.ensure_clean("lf_test.csv") as path:
|
||||
# case 3: CRLF as line terminator
|
||||
# 'lineterminator' should not change inner element
|
||||
expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n'
|
||||
df.to_csv(path, lineterminator="\r\n", index=False)
|
||||
with open(path, "rb") as f:
|
||||
assert f.read() == expected_crlf
|
||||
|
||||
def test_to_csv_string_with_crlf(self):
|
||||
# GH 20353
|
||||
data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]}
|
||||
df = DataFrame(data)
|
||||
with tm.ensure_clean("crlf_test.csv") as path:
|
||||
# case 1: The default line terminator(=os.linesep)(PR 21406)
|
||||
os_linesep = os.linesep.encode("utf-8")
|
||||
expected_noarg = (
|
||||
b"int,str_crlf"
|
||||
+ os_linesep
|
||||
+ b"1,abc"
|
||||
+ os_linesep
|
||||
+ b'2,"d\r\nef"'
|
||||
+ os_linesep
|
||||
+ b'3,"g\r\nh\r\n\r\ni"'
|
||||
+ os_linesep
|
||||
)
|
||||
df.to_csv(path, index=False)
|
||||
with open(path, "rb") as f:
|
||||
assert f.read() == expected_noarg
|
||||
with tm.ensure_clean("crlf_test.csv") as path:
|
||||
# case 2: LF as line terminator
|
||||
expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n'
|
||||
df.to_csv(path, lineterminator="\n", index=False)
|
||||
with open(path, "rb") as f:
|
||||
assert f.read() == expected_lf
|
||||
with tm.ensure_clean("crlf_test.csv") as path:
|
||||
# case 3: CRLF as line terminator
|
||||
# 'lineterminator' should not change inner element
|
||||
expected_crlf = (
|
||||
b"int,str_crlf\r\n"
|
||||
b"1,abc\r\n"
|
||||
b'2,"d\r\nef"\r\n'
|
||||
b'3,"g\r\nh\r\n\r\ni"\r\n'
|
||||
)
|
||||
df.to_csv(path, lineterminator="\r\n", index=False)
|
||||
with open(path, "rb") as f:
|
||||
assert f.read() == expected_crlf
|
||||
|
||||
def test_to_csv_stdout_file(self, capsys):
|
||||
# GH 21561
|
||||
df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"])
|
||||
expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"]
|
||||
expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
|
||||
df.to_csv(sys.stdout, encoding="ascii")
|
||||
captured = capsys.readouterr()
|
||||
|
||||
assert captured.out == expected_ascii
|
||||
assert not sys.stdout.closed
|
||||
|
||||
@pytest.mark.xfail(
|
||||
compat.is_platform_windows(),
|
||||
reason=(
|
||||
"Especially in Windows, file stream should not be passed"
|
||||
"to csv writer without newline='' option."
|
||||
"(https://docs.python.org/3/library/csv.html#csv.writer)"
|
||||
),
|
||||
)
|
||||
def test_to_csv_write_to_open_file(self):
|
||||
# GH 21696
|
||||
df = DataFrame({"a": ["x", "y", "z"]})
|
||||
expected = """\
|
||||
manual header
|
||||
x
|
||||
y
|
||||
z
|
||||
"""
|
||||
with tm.ensure_clean("test.txt") as path:
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write("manual header\n")
|
||||
df.to_csv(f, header=None, index=None)
|
||||
with open(path, encoding="utf-8") as f:
|
||||
assert f.read() == expected
|
||||
|
||||
def test_to_csv_write_to_open_file_with_newline_py3(self):
|
||||
# see gh-21696
|
||||
# see gh-20353
|
||||
df = DataFrame({"a": ["x", "y", "z"]})
|
||||
expected_rows = ["x", "y", "z"]
|
||||
expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
with tm.ensure_clean("test.txt") as path:
|
||||
with open(path, "w", newline="", encoding="utf-8") as f:
|
||||
f.write("manual header\n")
|
||||
df.to_csv(f, header=None, index=None)
|
||||
|
||||
with open(path, "rb") as f:
|
||||
assert f.read() == bytes(expected, "utf-8")
|
||||
|
||||
@pytest.mark.parametrize("to_infer", [True, False])
|
||||
@pytest.mark.parametrize("read_infer", [True, False])
|
||||
def test_to_csv_compression(
|
||||
self, compression_only, read_infer, to_infer, compression_to_extension
|
||||
):
|
||||
# see gh-15008
|
||||
compression = compression_only
|
||||
|
||||
# We'll complete file extension subsequently.
|
||||
filename = "test."
|
||||
filename += compression_to_extension[compression]
|
||||
|
||||
df = DataFrame({"A": [1]})
|
||||
|
||||
to_compression = "infer" if to_infer else compression
|
||||
read_compression = "infer" if read_infer else compression
|
||||
|
||||
with tm.ensure_clean(filename) as path:
|
||||
df.to_csv(path, compression=to_compression)
|
||||
result = pd.read_csv(path, index_col=0, compression=read_compression)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
def test_to_csv_compression_dict(self, compression_only):
|
||||
# GH 26023
|
||||
method = compression_only
|
||||
df = DataFrame({"ABC": [1]})
|
||||
filename = "to_csv_compress_as_dict."
|
||||
extension = {
|
||||
"gzip": "gz",
|
||||
"zstd": "zst",
|
||||
}.get(method, method)
|
||||
filename += extension
|
||||
with tm.ensure_clean(filename) as path:
|
||||
df.to_csv(path, compression={"method": method})
|
||||
read_df = pd.read_csv(path, index_col=0)
|
||||
tm.assert_frame_equal(read_df, df)
|
||||
|
||||
def test_to_csv_compression_dict_no_method_raises(self):
|
||||
# GH 26023
|
||||
df = DataFrame({"ABC": [1]})
|
||||
compression = {"some_option": True}
|
||||
msg = "must have key 'method'"
|
||||
|
||||
with tm.ensure_clean("out.zip") as path:
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_csv(path, compression=compression)
|
||||
|
||||
@pytest.mark.parametrize("compression", ["zip", "infer"])
|
||||
@pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"])
|
||||
def test_to_csv_zip_arguments(self, compression, archive_name):
|
||||
# GH 26023
|
||||
df = DataFrame({"ABC": [1]})
|
||||
with tm.ensure_clean("to_csv_archive_name.zip") as path:
|
||||
df.to_csv(
|
||||
path, compression={"method": compression, "archive_name": archive_name}
|
||||
)
|
||||
with ZipFile(path) as zp:
|
||||
assert len(zp.filelist) == 1
|
||||
archived_file = zp.filelist[0].filename
|
||||
assert archived_file == archive_name
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename,expected_arcname",
|
||||
[
|
||||
("archive.csv", "archive.csv"),
|
||||
("archive.tsv", "archive.tsv"),
|
||||
("archive.csv.zip", "archive.csv"),
|
||||
("archive.tsv.zip", "archive.tsv"),
|
||||
("archive.zip", "archive"),
|
||||
],
|
||||
)
|
||||
def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname):
|
||||
# GH 39465
|
||||
df = DataFrame({"ABC": [1]})
|
||||
path = tmp_path / filename
|
||||
df.to_csv(path, compression="zip")
|
||||
with ZipFile(path) as zp:
|
||||
assert len(zp.filelist) == 1
|
||||
archived_file = zp.filelist[0].filename
|
||||
assert archived_file == expected_arcname
|
||||
|
||||
@pytest.mark.parametrize("df_new_type", ["Int64"])
|
||||
def test_to_csv_na_rep_long_string(self, df_new_type):
|
||||
# see gh-25099
|
||||
df = DataFrame({"c": [float("nan")] * 3})
|
||||
df = df.astype(df_new_type)
|
||||
expected_rows = ["c", "mynull", "mynull", "mynull"]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
|
||||
result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")
|
||||
|
||||
assert expected == result
|
||||
|
||||
def test_to_csv_timedelta_precision(self):
|
||||
# GH 6783
|
||||
s = pd.Series([1, 1]).astype("timedelta64[ns]")
|
||||
buf = io.StringIO()
|
||||
s.to_csv(buf)
|
||||
result = buf.getvalue()
|
||||
expected_rows = [
|
||||
",0",
|
||||
"0,0 days 00:00:00.000000001",
|
||||
"1,0 days 00:00:00.000000001",
|
||||
]
|
||||
expected = tm.convert_rows_list_to_csv_str(expected_rows)
|
||||
assert result == expected
|
||||
|
||||
def test_na_rep_truncated(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/31447
|
||||
result = pd.Series(range(8, 12)).to_csv(na_rep="-")
|
||||
expected = tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"])
|
||||
assert result == expected
|
||||
|
||||
result = pd.Series([True, False]).to_csv(na_rep="nan")
|
||||
expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"])
|
||||
assert result == expected
|
||||
|
||||
result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
|
||||
expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"])
|
||||
def test_to_csv_errors(self, errors):
|
||||
# GH 22610
|
||||
data = ["\ud800foo"]
|
||||
ser = pd.Series(data, index=Index(data, dtype=object), dtype=object)
|
||||
with tm.ensure_clean("test.csv") as path:
|
||||
ser.to_csv(path, errors=errors)
|
||||
# No use in reading back the data as it is not the same anymore
|
||||
# due to the error handling
|
||||
|
||||
@pytest.mark.parametrize("mode", ["wb", "w"])
|
||||
def test_to_csv_binary_handle(self, mode):
|
||||
"""
|
||||
Binary file objects should work (if 'mode' contains a 'b') or even without
|
||||
it in most cases.
|
||||
|
||||
GH 35058 and GH 19827
|
||||
"""
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD")),
|
||||
index=Index([f"i-{i}" for i in range(30)]),
|
||||
)
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, mode="w+b") as handle:
|
||||
df.to_csv(handle, mode=mode)
|
||||
tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
|
||||
|
||||
@pytest.mark.parametrize("mode", ["wb", "w"])
|
||||
def test_to_csv_encoding_binary_handle(self, mode):
|
||||
"""
|
||||
Binary file objects should honor a specified encoding.
|
||||
|
||||
GH 23854 and GH 13068 with binary handles
|
||||
"""
|
||||
# example from GH 23854
|
||||
content = "a, b, 🐟".encode("utf-8-sig")
|
||||
buffer = io.BytesIO(content)
|
||||
df = pd.read_csv(buffer, encoding="utf-8-sig")
|
||||
|
||||
buffer = io.BytesIO()
|
||||
df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False)
|
||||
buffer.seek(0) # tests whether file handle wasn't closed
|
||||
assert buffer.getvalue().startswith(content)
|
||||
|
||||
# example from GH 13068
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "w+b") as handle:
|
||||
DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig")
|
||||
|
||||
handle.seek(0)
|
||||
assert handle.read().startswith(b'\xef\xbb\xbf""')
|
||||
|
||||
|
||||
def test_to_csv_iterative_compression_name(compression):
|
||||
# GH 38714
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD")),
|
||||
index=Index([f"i-{i}" for i in range(30)]),
|
||||
)
|
||||
with tm.ensure_clean() as path:
|
||||
df.to_csv(path, compression=compression, chunksize=1)
|
||||
tm.assert_frame_equal(
|
||||
pd.read_csv(path, compression=compression, index_col=0), df
|
||||
)
|
||||
|
||||
|
||||
def test_to_csv_iterative_compression_buffer(compression):
|
||||
# GH 38714
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD")),
|
||||
index=Index([f"i-{i}" for i in range(30)]),
|
||||
)
|
||||
with io.BytesIO() as buffer:
|
||||
df.to_csv(buffer, compression=compression, chunksize=1)
|
||||
buffer.seek(0)
|
||||
tm.assert_frame_equal(
|
||||
pd.read_csv(buffer, compression=compression, index_col=0), df
|
||||
)
|
||||
assert not buffer.closed
|
||||
|
||||
|
||||
def test_to_csv_pos_args_deprecation():
|
||||
# GH-54229
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
msg = (
|
||||
r"Starting with pandas version 3.0 all arguments of to_csv except for the "
|
||||
r"argument 'path_or_buf' will be keyword-only."
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
buffer = io.BytesIO()
|
||||
df.to_csv(buffer, ";")
|
||||
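
# Illustrative sketch, not part of the original suite: the quoting behaviour the
# class above checks through temp files, shown here with an in-memory string.
# The helper name is an example only.
def _to_csv_quoting_sketch():
    import csv

    df = DataFrame({"col": ['a"a', "b,b"]})
    text = df.to_csv(index=False, quoting=csv.QUOTE_ALL)
    assert text.splitlines()[0] == '"col"'
    assert '"a""a"' in text  # embedded quotes are doubled under QUOTE_ALL
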
@ -0,0 +1,429 @@
|
||||
"""Tests formatting as writer-agnostic ExcelCells
|
||||
|
||||
ExcelFormatter is tested implicitly in pandas/tests/io/excel
|
||||
"""
|
||||
import string
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.errors import CSSWarning
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.formats.excel import (
|
||||
CssExcelCell,
|
||||
CSSToExcelConverter,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"css,expected",
|
||||
[
|
||||
# FONT
|
||||
# - name
|
||||
("font-family: foo,bar", {"font": {"name": "foo"}}),
|
||||
('font-family: "foo bar",baz', {"font": {"name": "foo bar"}}),
|
||||
("font-family: foo,\nbar", {"font": {"name": "foo"}}),
|
||||
("font-family: foo, bar, baz", {"font": {"name": "foo"}}),
|
||||
("font-family: bar, foo", {"font": {"name": "bar"}}),
|
||||
("font-family: 'foo bar', baz", {"font": {"name": "foo bar"}}),
|
||||
("font-family: 'foo \\'bar', baz", {"font": {"name": "foo 'bar"}}),
|
||||
('font-family: "foo \\"bar", baz', {"font": {"name": 'foo "bar'}}),
|
||||
('font-family: "foo ,bar", baz', {"font": {"name": "foo ,bar"}}),
|
||||
# - family
|
||||
("font-family: serif", {"font": {"name": "serif", "family": 1}}),
|
||||
("font-family: Serif", {"font": {"name": "serif", "family": 1}}),
|
||||
("font-family: roman, serif", {"font": {"name": "roman", "family": 1}}),
|
||||
("font-family: roman, sans-serif", {"font": {"name": "roman", "family": 2}}),
|
||||
("font-family: roman, sans serif", {"font": {"name": "roman"}}),
|
||||
("font-family: roman, sansserif", {"font": {"name": "roman"}}),
|
||||
("font-family: roman, cursive", {"font": {"name": "roman", "family": 4}}),
|
||||
("font-family: roman, fantasy", {"font": {"name": "roman", "family": 5}}),
|
||||
# - size
|
||||
("font-size: 1em", {"font": {"size": 12}}),
|
||||
("font-size: xx-small", {"font": {"size": 6}}),
|
||||
("font-size: x-small", {"font": {"size": 7.5}}),
|
||||
("font-size: small", {"font": {"size": 9.6}}),
|
||||
("font-size: medium", {"font": {"size": 12}}),
|
||||
("font-size: large", {"font": {"size": 13.5}}),
|
||||
("font-size: x-large", {"font": {"size": 18}}),
|
||||
("font-size: xx-large", {"font": {"size": 24}}),
|
||||
("font-size: 50%", {"font": {"size": 6}}),
|
||||
# - bold
|
||||
("font-weight: 100", {"font": {"bold": False}}),
|
||||
("font-weight: 200", {"font": {"bold": False}}),
|
||||
("font-weight: 300", {"font": {"bold": False}}),
|
||||
("font-weight: 400", {"font": {"bold": False}}),
|
||||
("font-weight: normal", {"font": {"bold": False}}),
|
||||
("font-weight: lighter", {"font": {"bold": False}}),
|
||||
("font-weight: bold", {"font": {"bold": True}}),
|
||||
("font-weight: bolder", {"font": {"bold": True}}),
|
||||
("font-weight: 700", {"font": {"bold": True}}),
|
||||
("font-weight: 800", {"font": {"bold": True}}),
|
||||
("font-weight: 900", {"font": {"bold": True}}),
|
||||
# - italic
|
||||
("font-style: italic", {"font": {"italic": True}}),
|
||||
("font-style: oblique", {"font": {"italic": True}}),
|
||||
# - underline
|
||||
("text-decoration: underline", {"font": {"underline": "single"}}),
|
||||
("text-decoration: overline", {}),
|
||||
("text-decoration: none", {}),
|
||||
# - strike
|
||||
("text-decoration: line-through", {"font": {"strike": True}}),
|
||||
(
|
||||
"text-decoration: underline line-through",
|
||||
{"font": {"strike": True, "underline": "single"}},
|
||||
),
|
||||
(
|
||||
"text-decoration: underline; text-decoration: line-through",
|
||||
{"font": {"strike": True}},
|
||||
),
|
||||
# - color
|
||||
("color: red", {"font": {"color": "FF0000"}}),
|
||||
("color: #ff0000", {"font": {"color": "FF0000"}}),
|
||||
("color: #f0a", {"font": {"color": "FF00AA"}}),
|
||||
# - shadow
|
||||
("text-shadow: none", {"font": {"shadow": False}}),
|
||||
("text-shadow: 0px -0em 0px #CCC", {"font": {"shadow": False}}),
|
||||
("text-shadow: 0px -0em 0px #999", {"font": {"shadow": False}}),
|
||||
("text-shadow: 0px -0em 0px", {"font": {"shadow": False}}),
|
||||
("text-shadow: 2px -0em 0px #CCC", {"font": {"shadow": True}}),
|
||||
("text-shadow: 0px -2em 0px #CCC", {"font": {"shadow": True}}),
|
||||
("text-shadow: 0px -0em 2px #CCC", {"font": {"shadow": True}}),
|
||||
("text-shadow: 0px -0em 2px", {"font": {"shadow": True}}),
|
||||
("text-shadow: 0px -2em", {"font": {"shadow": True}}),
|
||||
# FILL
|
||||
# - color, fillType
|
||||
(
|
||||
"background-color: red",
|
||||
{"fill": {"fgColor": "FF0000", "patternType": "solid"}},
|
||||
),
|
||||
(
|
||||
"background-color: #ff0000",
|
||||
{"fill": {"fgColor": "FF0000", "patternType": "solid"}},
|
||||
),
|
||||
(
|
||||
"background-color: #f0a",
|
||||
{"fill": {"fgColor": "FF00AA", "patternType": "solid"}},
|
||||
),
|
||||
# BORDER
|
||||
# - style
|
||||
(
|
||||
"border-style: solid",
|
||||
{
|
||||
"border": {
|
||||
"top": {"style": "medium"},
|
||||
"bottom": {"style": "medium"},
|
||||
"left": {"style": "medium"},
|
||||
"right": {"style": "medium"},
|
||||
}
|
||||
},
|
||||
),
|
||||
(
|
||||
"border-style: solid; border-width: thin",
|
||||
{
|
||||
"border": {
|
||||
"top": {"style": "thin"},
|
||||
"bottom": {"style": "thin"},
|
||||
"left": {"style": "thin"},
|
||||
"right": {"style": "thin"},
|
||||
}
|
||||
},
|
||||
),
|
||||
(
|
||||
"border-top-style: solid; border-top-width: thin",
|
||||
{"border": {"top": {"style": "thin"}}},
|
||||
),
|
||||
(
|
||||
"border-top-style: solid; border-top-width: 1pt",
|
||||
{"border": {"top": {"style": "thin"}}},
|
||||
),
|
||||
("border-top-style: solid", {"border": {"top": {"style": "medium"}}}),
|
||||
(
|
||||
"border-top-style: solid; border-top-width: medium",
|
||||
{"border": {"top": {"style": "medium"}}},
|
||||
),
|
||||
(
|
||||
"border-top-style: solid; border-top-width: 2pt",
|
||||
{"border": {"top": {"style": "medium"}}},
|
||||
),
|
||||
(
|
||||
"border-top-style: solid; border-top-width: thick",
|
||||
{"border": {"top": {"style": "thick"}}},
|
||||
),
|
||||
(
|
||||
"border-top-style: solid; border-top-width: 4pt",
|
||||
{"border": {"top": {"style": "thick"}}},
|
||||
),
|
||||
(
|
||||
"border-top-style: dotted",
|
||||
{"border": {"top": {"style": "mediumDashDotDot"}}},
|
||||
),
|
||||
(
|
||||
"border-top-style: dotted; border-top-width: thin",
|
||||
{"border": {"top": {"style": "dotted"}}},
|
||||
),
|
||||
("border-top-style: dashed", {"border": {"top": {"style": "mediumDashed"}}}),
|
||||
(
|
||||
"border-top-style: dashed; border-top-width: thin",
|
||||
{"border": {"top": {"style": "dashed"}}},
|
||||
),
|
||||
("border-top-style: double", {"border": {"top": {"style": "double"}}}),
|
||||
# - color
|
||||
(
|
||||
"border-style: solid; border-color: #0000ff",
|
||||
{
|
||||
"border": {
|
||||
"top": {"style": "medium", "color": "0000FF"},
|
||||
"right": {"style": "medium", "color": "0000FF"},
|
||||
"bottom": {"style": "medium", "color": "0000FF"},
|
||||
"left": {"style": "medium", "color": "0000FF"},
|
||||
}
|
||||
},
|
||||
),
|
||||
(
|
||||
"border-top-style: double; border-top-color: blue",
|
||||
{"border": {"top": {"style": "double", "color": "0000FF"}}},
|
||||
),
|
||||
(
|
||||
"border-top-style: solid; border-top-color: #06c",
|
||||
{"border": {"top": {"style": "medium", "color": "0066CC"}}},
|
||||
),
|
||||
(
|
||||
"border-top-color: blue",
|
||||
{"border": {"top": {"color": "0000FF", "style": "none"}}},
|
||||
),
|
||||
# ALIGNMENT
|
||||
# - horizontal
|
||||
("text-align: center", {"alignment": {"horizontal": "center"}}),
|
||||
("text-align: left", {"alignment": {"horizontal": "left"}}),
|
||||
("text-align: right", {"alignment": {"horizontal": "right"}}),
|
||||
("text-align: justify", {"alignment": {"horizontal": "justify"}}),
|
||||
# - vertical
|
||||
("vertical-align: top", {"alignment": {"vertical": "top"}}),
|
||||
("vertical-align: text-top", {"alignment": {"vertical": "top"}}),
|
||||
("vertical-align: middle", {"alignment": {"vertical": "center"}}),
|
||||
("vertical-align: bottom", {"alignment": {"vertical": "bottom"}}),
|
||||
("vertical-align: text-bottom", {"alignment": {"vertical": "bottom"}}),
|
||||
# - wrap_text
|
||||
("white-space: nowrap", {"alignment": {"wrap_text": False}}),
|
||||
("white-space: pre", {"alignment": {"wrap_text": False}}),
|
||||
("white-space: pre-line", {"alignment": {"wrap_text": False}}),
|
||||
("white-space: normal", {"alignment": {"wrap_text": True}}),
|
||||
# NUMBER FORMAT
|
||||
("number-format: 0%", {"number_format": {"format_code": "0%"}}),
|
||||
(
|
||||
"number-format: 0§[Red](0)§-§@;",
|
||||
{"number_format": {"format_code": "0;[red](0);-;@"}}, # GH 46152
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_css_to_excel(css, expected):
|
||||
convert = CSSToExcelConverter()
|
||||
assert expected == convert(css)
|
||||
|
||||
|
||||
def test_css_to_excel_multiple():
|
||||
convert = CSSToExcelConverter()
|
||||
actual = convert(
|
||||
"""
|
||||
font-weight: bold;
|
||||
text-decoration: underline;
|
||||
color: red;
|
||||
border-width: thin;
|
||||
text-align: center;
|
||||
vertical-align: top;
|
||||
unused: something;
|
||||
"""
|
||||
)
|
||||
assert {
|
||||
"font": {"bold": True, "underline": "single", "color": "FF0000"},
|
||||
"border": {
|
||||
"top": {"style": "thin"},
|
||||
"right": {"style": "thin"},
|
||||
"bottom": {"style": "thin"},
|
||||
"left": {"style": "thin"},
|
||||
},
|
||||
"alignment": {"horizontal": "center", "vertical": "top"},
|
||||
} == actual
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"css,inherited,expected",
|
||||
[
|
||||
("font-weight: bold", "", {"font": {"bold": True}}),
|
||||
("", "font-weight: bold", {"font": {"bold": True}}),
|
||||
(
|
||||
"font-weight: bold",
|
||||
"font-style: italic",
|
||||
{"font": {"bold": True, "italic": True}},
|
||||
),
|
||||
("font-style: normal", "font-style: italic", {"font": {"italic": False}}),
|
||||
("font-style: inherit", "", {}),
|
||||
(
|
||||
"font-style: normal; font-style: inherit",
|
||||
"font-style: italic",
|
||||
{"font": {"italic": True}},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_css_to_excel_inherited(css, inherited, expected):
|
||||
convert = CSSToExcelConverter(inherited)
|
||||
assert expected == convert(css)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_color,output_color",
|
||||
(
|
||||
list(CSSToExcelConverter.NAMED_COLORS.items())
|
||||
+ [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()]
|
||||
+ [("#F0F", "FF00FF"), ("#ABC", "AABBCC")]
|
||||
),
|
||||
)
|
||||
def test_css_to_excel_good_colors(input_color, output_color):
|
||||
# see gh-18392
|
||||
css = (
|
||||
f"border-top-color: {input_color}; "
|
||||
f"border-right-color: {input_color}; "
|
||||
f"border-bottom-color: {input_color}; "
|
||||
f"border-left-color: {input_color}; "
|
||||
f"background-color: {input_color}; "
|
||||
f"color: {input_color}"
|
||||
)
|
||||
|
||||
expected = {}
|
||||
|
||||
expected["fill"] = {"patternType": "solid", "fgColor": output_color}
|
||||
|
||||
expected["font"] = {"color": output_color}
|
||||
|
||||
expected["border"] = {
|
||||
k: {"color": output_color, "style": "none"}
|
||||
for k in ("top", "right", "bottom", "left")
|
||||
}
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
convert = CSSToExcelConverter()
|
||||
assert expected == convert(css)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_color", [None, "not-a-color"])
|
||||
def test_css_to_excel_bad_colors(input_color):
|
||||
# see gh-18392
|
||||
css = (
|
||||
f"border-top-color: {input_color}; "
|
||||
f"border-right-color: {input_color}; "
|
||||
f"border-bottom-color: {input_color}; "
|
||||
f"border-left-color: {input_color}; "
|
||||
f"background-color: {input_color}; "
|
||||
f"color: {input_color}"
|
||||
)
|
||||
|
||||
expected = {}
|
||||
|
||||
if input_color is not None:
|
||||
expected["fill"] = {"patternType": "solid"}
|
||||
|
||||
with tm.assert_produces_warning(CSSWarning):
|
||||
convert = CSSToExcelConverter()
|
||||
assert expected == convert(css)
|
||||
|
||||
|
||||
def tests_css_named_colors_valid():
|
||||
upper_hexs = set(map(str.upper, string.hexdigits))
|
||||
for color in CSSToExcelConverter.NAMED_COLORS.values():
|
||||
assert len(color) == 6 and all(c in upper_hexs for c in color)
|
||||
|
||||
|
||||
def test_css_named_colors_from_mpl_present():
|
||||
mpl_colors = pytest.importorskip("matplotlib.colors")
|
||||
|
||||
pd_colors = CSSToExcelConverter.NAMED_COLORS
|
||||
for name, color in mpl_colors.CSS4_COLORS.items():
|
||||
assert name in pd_colors and pd_colors[name] == color[1:]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"styles,expected",
|
||||
[
|
||||
([("color", "green"), ("color", "red")], "color: red;"),
|
||||
([("font-weight", "bold"), ("font-weight", "normal")], "font-weight: normal;"),
|
||||
([("text-align", "center"), ("TEXT-ALIGN", "right")], "text-align: right;"),
|
||||
],
|
||||
)
|
||||
def test_css_excel_cell_precedence(styles, expected):
|
||||
"""It applies favors latter declarations over former declarations"""
|
||||
# See GH 47371
|
||||
converter = CSSToExcelConverter()
|
||||
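# _call_cached exposes functools.lru_cache's cache_clear/cache_info; clearing it here keeps cached styles from other parametrized runs out of this test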
converter._call_cached.cache_clear()
|
||||
css_styles = {(0, 0): styles}
|
||||
cell = CssExcelCell(
|
||||
row=0,
|
||||
col=0,
|
||||
val="",
|
||||
style=None,
|
||||
css_styles=css_styles,
|
||||
css_row=0,
|
||||
css_col=0,
|
||||
css_converter=converter,
|
||||
)
|
||||
converter._call_cached.cache_clear()
|
||||
|
||||
assert cell.style == converter(expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"styles,cache_hits,cache_misses",
|
||||
[
|
||||
([[("color", "green"), ("color", "red"), ("color", "green")]], 0, 1),
|
||||
(
|
||||
[
|
||||
[("font-weight", "bold")],
|
||||
[("font-weight", "normal"), ("font-weight", "bold")],
|
||||
],
|
||||
1,
|
||||
1,
|
||||
),
|
||||
([[("text-align", "center")], [("TEXT-ALIGN", "center")]], 1, 1),
|
||||
(
|
||||
[
|
||||
[("font-weight", "bold"), ("text-align", "center")],
|
||||
[("font-weight", "bold"), ("text-align", "left")],
|
||||
],
|
||||
0,
|
||||
2,
|
||||
),
|
||||
(
|
||||
[
|
||||
[("font-weight", "bold"), ("text-align", "center")],
|
||||
[("font-weight", "bold"), ("text-align", "left")],
|
||||
[("font-weight", "bold"), ("text-align", "center")],
|
||||
],
|
||||
1,
|
||||
2,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_css_excel_cell_cache(styles, cache_hits, cache_misses):
|
||||
"""It caches unique cell styles"""
|
||||
# See GH 47371
|
||||
converter = CSSToExcelConverter()
|
||||
converter._call_cached.cache_clear()
|
||||
|
||||
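# one cell per style list, all placed in row 0; identical declaration lists should be served from the converter's cache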
css_styles = {(0, i): _style for i, _style in enumerate(styles)}
|
||||
for css_row, css_col in css_styles:
|
||||
CssExcelCell(
|
||||
row=0,
|
||||
col=0,
|
||||
val="",
|
||||
style=None,
|
||||
css_styles=css_styles,
|
||||
css_row=css_row,
|
||||
css_col=css_col,
|
||||
css_converter=converter,
|
||||
)
|
||||
cache_info = converter._call_cached.cache_info()
|
||||
converter._call_cached.cache_clear()
|
||||
|
||||
assert cache_info.hits == cache_hits
|
||||
assert cache_info.misses == cache_misses
|
||||
1177
lib/python3.11/site-packages/pandas/tests/io/formats/test_to_html.py
Normal file
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,106 @@
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
)
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
pytest.importorskip("tabulate")
|
||||
|
||||
|
||||
def test_simple():
|
||||
buf = StringIO()
|
||||
df = pd.DataFrame([1, 2, 3])
|
||||
df.to_markdown(buf=buf)
|
||||
result = buf.getvalue()
|
||||
assert (
|
||||
result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
|
||||
)
|
||||
|
||||
|
||||
def test_empty_frame():
|
||||
buf = StringIO()
|
||||
df = pd.DataFrame({"id": [], "first_name": [], "last_name": []}).set_index("id")
|
||||
df.to_markdown(buf=buf)
|
||||
result = buf.getvalue()
|
||||
assert result == (
|
||||
"| id | first_name | last_name |\n"
|
||||
"|------|--------------|-------------|"
|
||||
)
|
||||
|
||||
|
||||
def test_other_tablefmt():
|
||||
buf = StringIO()
|
||||
df = pd.DataFrame([1, 2, 3])
|
||||
df.to_markdown(buf=buf, tablefmt="jira")
|
||||
result = buf.getvalue()
|
||||
assert result == "|| || 0 ||\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
|
||||
|
||||
|
||||
def test_other_headers():
|
||||
buf = StringIO()
|
||||
df = pd.DataFrame([1, 2, 3])
|
||||
df.to_markdown(buf=buf, headers=["foo", "bar"])
|
||||
result = buf.getvalue()
|
||||
assert result == (
|
||||
"| foo | bar |\n|------:|------:|\n| 0 "
|
||||
"| 1 |\n| 1 | 2 |\n| 2 | 3 |"
|
||||
)
|
||||
|
||||
|
||||
def test_series():
|
||||
buf = StringIO()
|
||||
s = pd.Series([1, 2, 3], name="foo")
|
||||
s.to_markdown(buf=buf)
|
||||
result = buf.getvalue()
|
||||
assert result == (
|
||||
"| | foo |\n|---:|------:|\n| 0 | 1 "
|
||||
"|\n| 1 | 2 |\n| 2 | 3 |"
|
||||
)
|
||||
|
||||
|
||||
def test_no_buf():
|
||||
df = pd.DataFrame([1, 2, 3])
|
||||
result = df.to_markdown()
|
||||
assert (
|
||||
result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index", [True, False])
|
||||
def test_index(index):
|
||||
# GH 32667
|
||||
|
||||
df = pd.DataFrame([1, 2, 3])
|
||||
|
||||
result = df.to_markdown(index=index)
|
||||
|
||||
if index:
|
||||
expected = (
|
||||
"| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
|
||||
)
|
||||
else:
|
||||
expected = "| 0 |\n|----:|\n| 1 |\n| 2 |\n| 3 |"
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_showindex_disallowed_in_kwargs():
|
||||
# GH 32667; disallowing showindex in kwargs enforced in 2.0
|
||||
df = pd.DataFrame([1, 2, 3])
|
||||
with pytest.raises(ValueError, match="Pass 'index' instead of 'showindex"):
|
||||
df.to_markdown(index=True, showindex=True)
|
||||
|
||||
|
||||
def test_markdown_pos_args_deprecatation():
|
||||
# GH-54229
|
||||
df = pd.DataFrame({"a": [1, 2, 3]})
|
||||
msg = (
|
||||
r"Starting with pandas version 3.0 all arguments of to_markdown except for the "
|
||||
r"argument 'buf' will be keyword-only."
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
buffer = BytesIO()
|
||||
df.to_markdown(buffer, "grid")
|
||||
File diff suppressed because it is too large
@ -0,0 +1,350 @@
|
||||
"""
|
||||
Self-contained script to write legacy storage (pickle) files.
|
||||
|
||||
To use this script, create an environment where you want to
|
||||
generate pickles, say it's for 0.20.3, with your pandas clone
|
||||
in ~/pandas
|
||||
|
||||
. activate pandas_0.20.3
|
||||
cd ~/pandas/pandas
|
||||
|
||||
$ python -m tests.io.generate_legacy_storage_files \
|
||||
tests/io/data/legacy_pickle/0.20.3/ pickle
|
||||
|
||||
This script generates a storage file for the current arch, system,
|
||||
and python version
|
||||
pandas version: 0.20.3
|
||||
output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/
|
||||
storage format: pickle
|
||||
created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle
|
||||
|
||||
The idea here is you are using the *current* version of the
|
||||
generate_legacy_storage_files with an *older* version of pandas to
|
||||
generate a pickle file. We will then check this file into a current
|
||||
branch, and test using test_pickle.py. This will load the *older*
|
||||
pickles and test versus the current data that is generated
|
||||
(with main). These are then compared.
|
||||
|
||||
If we have cases where we changed the signature (e.g. we renamed
|
||||
offset -> freq in Timestamp), then we have to conditionally execute
|
||||
code in generate_legacy_storage_files.py to make it
|
||||
run under the older AND the newer version.
|
||||
|
||||
"""
|
||||
|
||||
from datetime import timedelta
|
||||
import os
|
||||
import pickle
|
||||
import platform as pl
|
||||
import sys
|
||||
|
||||
# Remove script directory from path, otherwise Python will try to
|
||||
# import the JSON test directory as the json module
|
||||
sys.path.pop(0)
|
||||
|
||||
import numpy as np
|
||||
|
||||
import pandas
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
NaT,
|
||||
Period,
|
||||
RangeIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
bdate_range,
|
||||
date_range,
|
||||
interval_range,
|
||||
period_range,
|
||||
timedelta_range,
|
||||
)
|
||||
from pandas.arrays import SparseArray
|
||||
|
||||
from pandas.tseries.offsets import (
|
||||
FY5253,
|
||||
BusinessDay,
|
||||
BusinessHour,
|
||||
CustomBusinessDay,
|
||||
DateOffset,
|
||||
Day,
|
||||
Easter,
|
||||
Hour,
|
||||
LastWeekOfMonth,
|
||||
Minute,
|
||||
MonthBegin,
|
||||
MonthEnd,
|
||||
QuarterBegin,
|
||||
QuarterEnd,
|
||||
SemiMonthBegin,
|
||||
SemiMonthEnd,
|
||||
Week,
|
||||
WeekOfMonth,
|
||||
YearBegin,
|
||||
YearEnd,
|
||||
)
|
||||
|
||||
|
||||
def _create_sp_series():
|
||||
nan = np.nan
|
||||
|
||||
# nan-based
|
||||
arr = np.arange(15, dtype=np.float64)
|
||||
arr[7:12] = nan
|
||||
arr[-1:] = nan
|
||||
|
||||
bseries = Series(SparseArray(arr, kind="block"))
|
||||
bseries.name = "bseries"
|
||||
return bseries
|
||||
|
||||
|
||||
def _create_sp_tsseries():
|
||||
nan = np.nan
|
||||
|
||||
# nan-based
|
||||
arr = np.arange(15, dtype=np.float64)
|
||||
arr[7:12] = nan
|
||||
arr[-1:] = nan
|
||||
|
||||
date_index = bdate_range("1/1/2011", periods=len(arr))
|
||||
bseries = Series(SparseArray(arr, kind="block"), index=date_index)
|
||||
bseries.name = "btsseries"
|
||||
return bseries
|
||||
|
||||
|
||||
def _create_sp_frame():
|
||||
nan = np.nan
|
||||
|
||||
data = {
|
||||
"A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
|
||||
"B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
|
||||
"C": np.arange(10).astype(np.int64),
|
||||
"D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
|
||||
}
|
||||
|
||||
dates = bdate_range("1/1/2011", periods=10)
|
||||
return DataFrame(data, index=dates).apply(SparseArray)
|
||||
|
||||
|
||||
def create_pickle_data():
|
||||
"""create the pickle data"""
|
||||
data = {
|
||||
"A": [0.0, 1.0, 2.0, 3.0, np.nan],
|
||||
"B": [0, 1, 0, 1, 0],
|
||||
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
|
||||
"D": date_range("1/1/2009", periods=5),
|
||||
"E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
|
||||
}
|
||||
|
||||
scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")}
|
||||
|
||||
index = {
|
||||
"int": Index(np.arange(10)),
|
||||
"date": date_range("20130101", periods=10),
|
||||
"period": period_range("2013-01-01", freq="M", periods=10),
|
||||
"float": Index(np.arange(10, dtype=np.float64)),
|
||||
"uint": Index(np.arange(10, dtype=np.uint64)),
|
||||
"timedelta": timedelta_range("00:00:00", freq="30min", periods=10),
|
||||
"string": Index(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
|
||||
}
|
||||
|
||||
index["range"] = RangeIndex(10)
|
||||
|
||||
index["interval"] = interval_range(0, periods=10)
|
||||
|
||||
mi = {
|
||||
"reg2": MultiIndex.from_tuples(
|
||||
tuple(
|
||||
zip(
|
||||
*[
|
||||
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
|
||||
["one", "two", "one", "two", "one", "two", "one", "two"],
|
||||
]
|
||||
)
|
||||
),
|
||||
names=["first", "second"],
|
||||
)
|
||||
}
|
||||
|
||||
series = {
|
||||
"float": Series(data["A"]),
|
||||
"int": Series(data["B"]),
|
||||
"mixed": Series(data["E"]),
|
||||
"ts": Series(
|
||||
np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
|
||||
),
|
||||
"mi": Series(
|
||||
np.arange(5).astype(np.float64),
|
||||
index=MultiIndex.from_tuples(
|
||||
tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
|
||||
),
|
||||
),
|
||||
"dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
|
||||
"cat": Series(Categorical(["foo", "bar", "baz"])),
|
||||
"dt": Series(date_range("20130101", periods=5)),
|
||||
"dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
|
||||
"period": Series([Period("2000Q1")] * 5),
|
||||
"string": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
|
||||
}
|
||||
|
||||
mixed_dup_df = DataFrame(data)
|
||||
mixed_dup_df.columns = list("ABCDA")
|
||||
frame = {
|
||||
"float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
|
||||
"int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
|
||||
"mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
|
||||
"mi": DataFrame(
|
||||
{"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
|
||||
index=MultiIndex.from_tuples(
|
||||
tuple(
|
||||
zip(
|
||||
*[
|
||||
["bar", "bar", "baz", "baz", "baz"],
|
||||
["one", "two", "one", "two", "three"],
|
||||
]
|
||||
)
|
||||
),
|
||||
names=["first", "second"],
|
||||
),
|
||||
),
|
||||
"dup": DataFrame(
|
||||
np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
|
||||
),
|
||||
"cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
|
||||
"cat_and_float": DataFrame(
|
||||
{
|
||||
"A": Categorical(["foo", "bar", "baz"]),
|
||||
"B": np.arange(3).astype(np.int64),
|
||||
}
|
||||
),
|
||||
"mixed_dup": mixed_dup_df,
|
||||
"dt_mixed_tzs": DataFrame(
|
||||
{
|
||||
"A": Timestamp("20130102", tz="US/Eastern"),
|
||||
"B": Timestamp("20130603", tz="CET"),
|
||||
},
|
||||
index=range(5),
|
||||
),
|
||||
"dt_mixed2_tzs": DataFrame(
|
||||
{
|
||||
"A": Timestamp("20130102", tz="US/Eastern"),
|
||||
"B": Timestamp("20130603", tz="CET"),
|
||||
"C": Timestamp("20130603", tz="UTC"),
|
||||
},
|
||||
index=range(5),
|
||||
),
|
||||
"string": DataFrame(
|
||||
{
|
||||
"A": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
|
||||
"B": Series(["one", "two", "one", "two", "three"], dtype="string"),
|
||||
}
|
||||
),
|
||||
}
|
||||
|
||||
cat = {
|
||||
"int8": Categorical(list("abcdefg")),
|
||||
"int16": Categorical(np.arange(1000)),
|
||||
"int32": Categorical(np.arange(10000)),
|
||||
}
|
||||
|
||||
timestamp = {
|
||||
"normal": Timestamp("2011-01-01"),
|
||||
"nat": NaT,
|
||||
"tz": Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
}
|
||||
|
||||
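# a spread of DateOffset subclasses; pickling these helps catch signature or keyword changes between the pandas version that wrote the file and the one reading it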
off = {
|
||||
"DateOffset": DateOffset(years=1),
|
||||
"DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
|
||||
"BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
|
||||
"BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
|
||||
"CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
|
||||
"SemiMonthBegin": SemiMonthBegin(day_of_month=9),
|
||||
"SemiMonthEnd": SemiMonthEnd(day_of_month=24),
|
||||
"MonthBegin": MonthBegin(1),
|
||||
"MonthEnd": MonthEnd(1),
|
||||
"QuarterBegin": QuarterBegin(1),
|
||||
"QuarterEnd": QuarterEnd(1),
|
||||
"Day": Day(1),
|
||||
"YearBegin": YearBegin(1),
|
||||
"YearEnd": YearEnd(1),
|
||||
"Week": Week(1),
|
||||
"Week_Tues": Week(2, normalize=False, weekday=1),
|
||||
"WeekOfMonth": WeekOfMonth(week=3, weekday=4),
|
||||
"LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
|
||||
"FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
|
||||
"Easter": Easter(),
|
||||
"Hour": Hour(1),
|
||||
"Minute": Minute(1),
|
||||
}
|
||||
|
||||
return {
|
||||
"series": series,
|
||||
"frame": frame,
|
||||
"index": index,
|
||||
"scalars": scalars,
|
||||
"mi": mi,
|
||||
"sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()},
|
||||
"sp_frame": {"float": _create_sp_frame()},
|
||||
"cat": cat,
|
||||
"timestamp": timestamp,
|
||||
"offsets": off,
|
||||
}
|
||||
|
||||
|
||||
def platform_name():
|
||||
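# e.g. "0.20.3_x86_64_darwin_3.5.2", matching the legacy pickle filenames under tests/io/data/legacy_pickle/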
return "_".join(
|
||||
[
|
||||
str(pandas.__version__),
|
||||
str(pl.machine()),
|
||||
str(pl.system().lower()),
|
||||
str(pl.python_version()),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def write_legacy_pickles(output_dir):
|
||||
version = pandas.__version__
|
||||
|
||||
print(
|
||||
"This script generates a storage file for the current arch, system, "
|
||||
"and python version"
|
||||
)
|
||||
print(f" pandas version: {version}")
|
||||
print(f" output dir : {output_dir}")
|
||||
print(" storage format: pickle")
|
||||
|
||||
pth = f"{platform_name()}.pickle"
|
||||
|
||||
with open(os.path.join(output_dir, pth), "wb") as fh:
|
||||
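# written with the running (older) interpreter's DEFAULT_PROTOCOL; later Pythons can still read files written with older protocols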
pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
|
||||
|
||||
print(f"created pickle file: {pth}")
|
||||
|
||||
|
||||
def write_legacy_file():
|
||||
# force our cwd to be the first searched
|
||||
sys.path.insert(0, "")
|
||||
|
||||
if not 3 <= len(sys.argv) <= 4:
|
||||
sys.exit(
|
||||
"Specify output directory and storage type: generate_legacy_"
|
||||
"storage_files.py <output_dir> <storage_type> "
|
||||
)
|
||||
|
||||
output_dir = str(sys.argv[1])
|
||||
storage_type = str(sys.argv[2])
|
||||
|
||||
if not os.path.exists(output_dir):
|
||||
os.mkdir(output_dir)
|
||||
|
||||
if storage_type == "pickle":
|
||||
write_legacy_pickles(output_dir=output_dir)
|
||||
else:
|
||||
sys.exit("storage_type must be one of {'pickle'}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
write_legacy_file()
|
||||
@ -0,0 +1,9 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(params=["split", "records", "index", "columns", "values"])
|
||||
def orient(request):
|
||||
"""
|
||||
Fixture for orients excluding the table format.
|
||||
"""
|
||||
return request.param
|
||||
@ -0,0 +1,130 @@
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
)
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_compression_roundtrip(compression):
|
||||
df = pd.DataFrame(
|
||||
[[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
|
||||
index=["A", "B"],
|
||||
columns=["X", "Y", "Z"],
|
||||
)
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
df.to_json(path, compression=compression)
|
||||
tm.assert_frame_equal(df, pd.read_json(path, compression=compression))
|
||||
|
||||
# explicitly ensure file was compressed.
|
||||
with tm.decompress_file(path, compression) as fh:
|
||||
result = fh.read().decode("utf8")
|
||||
data = StringIO(result)
|
||||
tm.assert_frame_equal(df, pd.read_json(data))
|
||||
|
||||
|
||||
def test_read_zipped_json(datapath):
|
||||
uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
|
||||
uncompressed_df = pd.read_json(uncompressed_path)
|
||||
|
||||
compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
|
||||
compressed_df = pd.read_json(compressed_path, compression="zip")
|
||||
|
||||
tm.assert_frame_equal(uncompressed_df, compressed_df)
|
||||
|
||||
|
||||
@td.skip_if_not_us_locale
|
||||
@pytest.mark.single_cpu
|
||||
def test_with_s3_url(compression, s3_public_bucket, s3so):
|
||||
# Bucket created in tests/io/conftest.py
|
||||
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
df.to_json(path, compression=compression)
|
||||
with open(path, "rb") as f:
|
||||
s3_public_bucket.put_object(Key="test-1", Body=f)
|
||||
|
||||
roundtripped_df = pd.read_json(
|
||||
f"s3://{s3_public_bucket.name}/test-1",
|
||||
compression=compression,
|
||||
storage_options=s3so,
|
||||
)
|
||||
tm.assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_lines_with_compression(compression):
|
||||
with tm.ensure_clean() as path:
|
||||
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
|
||||
df.to_json(path, orient="records", lines=True, compression=compression)
|
||||
roundtripped_df = pd.read_json(path, lines=True, compression=compression)
|
||||
tm.assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_chunksize_with_compression(compression):
|
||||
with tm.ensure_clean() as path:
|
||||
df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
|
||||
df.to_json(path, orient="records", lines=True, compression=compression)
|
||||
|
||||
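# with lines=True and chunksize, read_json returns a JsonReader context manager; concatenating its chunks should reproduce the original frame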
with pd.read_json(
|
||||
path, lines=True, chunksize=1, compression=compression
|
||||
) as res:
|
||||
roundtripped_df = pd.concat(res)
|
||||
tm.assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_write_unsupported_compression_type():
|
||||
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
|
||||
with tm.ensure_clean() as path:
|
||||
msg = "Unrecognized compression type: unsupported"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_json(path, compression="unsupported")
|
||||
|
||||
|
||||
def test_read_unsupported_compression_type():
|
||||
with tm.ensure_clean() as path:
|
||||
msg = "Unrecognized compression type: unsupported"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
pd.read_json(path, compression="unsupported")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
|
||||
)
|
||||
@pytest.mark.parametrize("to_infer", [True, False])
|
||||
@pytest.mark.parametrize("read_infer", [True, False])
|
||||
def test_to_json_compression(
|
||||
compression_only, read_infer, to_infer, compression_to_extension, infer_string
|
||||
):
|
||||
with pd.option_context("future.infer_string", infer_string):
|
||||
# see gh-15008
|
||||
compression = compression_only
|
||||
|
||||
# We'll complete file extension subsequently.
|
||||
filename = "test."
|
||||
filename += compression_to_extension[compression]
|
||||
|
||||
df = pd.DataFrame({"A": [1]})
|
||||
|
||||
to_compression = "infer" if to_infer else compression
|
||||
read_compression = "infer" if read_infer else compression
|
||||
|
||||
with tm.ensure_clean(filename) as path:
|
||||
df.to_json(path, compression=to_compression)
|
||||
result = pd.read_json(path, compression=read_compression)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
def test_to_json_compression_mode(compression):
|
||||
# GH 39985 (read_json does not support user-provided binary files)
|
||||
expected = pd.DataFrame({"A": [1]})
|
||||
|
||||
with BytesIO() as buffer:
|
||||
expected.to_json(buffer, compression=compression)
|
||||
# df = pd.read_json(buffer, compression=compression)
|
||||
# tm.assert_frame_equal(expected, df)
|
||||
@ -0,0 +1,21 @@
|
||||
"""
|
||||
Tests for the deprecated keyword arguments for `read_json`.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.json import read_json
|
||||
|
||||
|
||||
def test_good_kwargs():
|
||||
df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
data1 = StringIO(df.to_json(orient="split"))
|
||||
tm.assert_frame_equal(df, read_json(data1, orient="split"))
|
||||
data2 = StringIO(df.to_json(orient="columns"))
|
||||
tm.assert_frame_equal(df, read_json(data2, orient="columns"))
|
||||
data3 = StringIO(df.to_json(orient="index"))
|
||||
tm.assert_frame_equal(df, read_json(data3, orient="index"))
|
||||
@ -0,0 +1,873 @@
|
||||
"""Tests for Table Schema integration."""
|
||||
from collections import OrderedDict
|
||||
from io import StringIO
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
CategoricalDtype,
|
||||
DatetimeTZDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.json._table_schema import (
|
||||
as_json_table_type,
|
||||
build_table_schema,
|
||||
convert_json_field_to_pandas_type,
|
||||
convert_pandas_type_to_json_field,
|
||||
set_default_names,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_schema():
|
||||
return DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3, 4],
|
||||
"B": ["a", "b", "c", "c"],
|
||||
"C": pd.date_range("2016-01-01", freq="d", periods=4),
|
||||
"D": pd.timedelta_range("1h", periods=4, freq="min"),
|
||||
},
|
||||
index=pd.Index(range(4), name="idx"),
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_table():
|
||||
return DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3, 4],
|
||||
"B": ["a", "b", "c", "c"],
|
||||
"C": pd.date_range("2016-01-01", freq="d", periods=4),
|
||||
"D": pd.timedelta_range("1h", periods=4, freq="min"),
|
||||
"E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
|
||||
"F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
|
||||
"G": [1.0, 2.0, 3, 4.0],
|
||||
"H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
|
||||
},
|
||||
index=pd.Index(range(4), name="idx"),
|
||||
)
|
||||
|
||||
|
||||
class TestBuildSchema:
|
||||
def test_build_table_schema(self, df_schema, using_infer_string):
|
||||
result = build_table_schema(df_schema, version=False)
|
||||
expected = {
|
||||
"fields": [
|
||||
{"name": "idx", "type": "integer"},
|
||||
{"name": "A", "type": "integer"},
|
||||
{"name": "B", "type": "string"},
|
||||
{"name": "C", "type": "datetime"},
|
||||
{"name": "D", "type": "duration"},
|
||||
],
|
||||
"primaryKey": ["idx"],
|
||||
}
|
||||
if using_infer_string:
|
||||
expected["fields"][2] = {"name": "B", "type": "string", "extDtype": "str"}
|
||||
assert result == expected
|
||||
result = build_table_schema(df_schema)
|
||||
assert "pandas_version" in result
|
||||
|
||||
def test_series(self):
|
||||
s = pd.Series([1, 2, 3], name="foo")
|
||||
result = build_table_schema(s, version=False)
|
||||
expected = {
|
||||
"fields": [
|
||||
{"name": "index", "type": "integer"},
|
||||
{"name": "foo", "type": "integer"},
|
||||
],
|
||||
"primaryKey": ["index"],
|
||||
}
|
||||
assert result == expected
|
||||
result = build_table_schema(s)
|
||||
assert "pandas_version" in result
|
||||
|
||||
def test_series_unnamed(self):
|
||||
result = build_table_schema(pd.Series([1, 2, 3]), version=False)
|
||||
expected = {
|
||||
"fields": [
|
||||
{"name": "index", "type": "integer"},
|
||||
{"name": "values", "type": "integer"},
|
||||
],
|
||||
"primaryKey": ["index"],
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
def test_multiindex(self, df_schema, using_infer_string):
|
||||
df = df_schema
|
||||
idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
|
||||
df.index = idx
|
||||
|
||||
result = build_table_schema(df, version=False)
|
||||
expected = {
|
||||
"fields": [
|
||||
{"name": "level_0", "type": "string"},
|
||||
{"name": "level_1", "type": "integer"},
|
||||
{"name": "A", "type": "integer"},
|
||||
{"name": "B", "type": "string"},
|
||||
{"name": "C", "type": "datetime"},
|
||||
{"name": "D", "type": "duration"},
|
||||
],
|
||||
"primaryKey": ["level_0", "level_1"],
|
||||
}
|
||||
if using_infer_string:
|
||||
expected["fields"][0] = {
|
||||
"name": "level_0",
|
||||
"type": "string",
|
||||
"extDtype": "str",
|
||||
}
|
||||
expected["fields"][3] = {"name": "B", "type": "string", "extDtype": "str"}
|
||||
assert result == expected
|
||||
|
||||
df.index.names = ["idx0", None]
|
||||
expected["fields"][0]["name"] = "idx0"
|
||||
expected["primaryKey"] = ["idx0", "level_1"]
|
||||
result = build_table_schema(df, version=False)
|
||||
assert result == expected
|
||||
|
||||
|
||||
class TestTableSchemaType:
|
||||
@pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64])
|
||||
def test_as_json_table_type_int_data(self, int_type):
|
||||
int_data = [1, 2, 3]
|
||||
assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer"
|
||||
|
||||
@pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64])
|
||||
def test_as_json_table_type_float_data(self, float_type):
|
||||
float_data = [1.0, 2.0, 3.0]
|
||||
assert (
|
||||
as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize("bool_type", [bool, np.bool_])
|
||||
def test_as_json_table_type_bool_data(self, bool_type):
|
||||
bool_data = [True, False]
|
||||
assert (
|
||||
as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"date_data",
|
||||
[
|
||||
pd.to_datetime(["2016"]),
|
||||
pd.to_datetime(["2016"], utc=True),
|
||||
pd.Series(pd.to_datetime(["2016"])),
|
||||
pd.Series(pd.to_datetime(["2016"], utc=True)),
|
||||
pd.period_range("2016", freq="Y", periods=3),
|
||||
],
|
||||
)
|
||||
def test_as_json_table_type_date_data(self, date_data):
|
||||
assert as_json_table_type(date_data.dtype) == "datetime"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"str_data",
|
||||
[pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
|
||||
)
|
||||
def test_as_json_table_type_string_data(self, str_data):
|
||||
assert as_json_table_type(str_data.dtype) == "string"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"cat_data",
|
||||
[
|
||||
pd.Categorical(["a"]),
|
||||
pd.Categorical([1]),
|
||||
pd.Series(pd.Categorical([1])),
|
||||
pd.CategoricalIndex([1]),
|
||||
pd.Categorical([1]),
|
||||
],
|
||||
)
|
||||
def test_as_json_table_type_categorical_data(self, cat_data):
|
||||
assert as_json_table_type(cat_data.dtype) == "any"
|
||||
|
||||
# ------
|
||||
# dtypes
|
||||
# ------
|
||||
@pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64])
|
||||
def test_as_json_table_type_int_dtypes(self, int_dtype):
|
||||
assert as_json_table_type(int_dtype) == "integer"
|
||||
|
||||
@pytest.mark.parametrize("float_dtype", [float, np.float16, np.float32, np.float64])
|
||||
def test_as_json_table_type_float_dtypes(self, float_dtype):
|
||||
assert as_json_table_type(float_dtype) == "number"
|
||||
|
||||
@pytest.mark.parametrize("bool_dtype", [bool, np.bool_])
|
||||
def test_as_json_table_type_bool_dtypes(self, bool_dtype):
|
||||
assert as_json_table_type(bool_dtype) == "boolean"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"date_dtype",
|
||||
[
|
||||
np.dtype("<M8[ns]"),
|
||||
PeriodDtype("D"),
|
||||
DatetimeTZDtype("ns", "US/Central"),
|
||||
],
|
||||
)
|
||||
def test_as_json_table_type_date_dtypes(self, date_dtype):
|
||||
# TODO: datetime.date? datetime.time?
|
||||
assert as_json_table_type(date_dtype) == "datetime"
|
||||
|
||||
@pytest.mark.parametrize("td_dtype", [np.dtype("<m8[ns]")])
|
||||
def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
|
||||
assert as_json_table_type(td_dtype) == "duration"
|
||||
|
||||
@pytest.mark.parametrize("str_dtype", [object]) # TODO(GH#14904) flesh out dtypes?
|
||||
def test_as_json_table_type_string_dtypes(self, str_dtype):
|
||||
assert as_json_table_type(str_dtype) == "string"
|
||||
|
||||
def test_as_json_table_type_categorical_dtypes(self):
|
||||
assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any"
|
||||
assert as_json_table_type(CategoricalDtype()) == "any"
|
||||
|
||||
|
||||
class TestTableOrient:
|
||||
def test_build_series(self):
|
||||
s = pd.Series([1, 2], name="a")
|
||||
s.index.name = "id"
|
||||
result = s.to_json(orient="table", date_format="iso")
|
||||
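# object_pairs_hook=OrderedDict preserves key order from the JSON text, so the comparison against the OrderedDict built below is order-aware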
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result["schema"]
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [{"name": "id", "type": "integer"}, {"name": "a", "type": "integer"}]
|
||||
|
||||
schema = {"fields": fields, "primaryKey": ["id"]}
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
("schema", schema),
|
||||
(
|
||||
"data",
|
||||
[
|
||||
OrderedDict([("id", 0), ("a", 1)]),
|
||||
OrderedDict([("id", 1), ("a", 2)]),
|
||||
],
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_read_json_from_to_json_results(self):
|
||||
# GH32383
|
||||
df = DataFrame(
|
||||
{
|
||||
"_id": {"row_0": 0},
|
||||
"category": {"row_0": "Goods"},
|
||||
"recommender_id": {"row_0": 3},
|
||||
"recommender_name_jp": {"row_0": "浦田"},
|
||||
"recommender_name_en": {"row_0": "Urata"},
|
||||
"name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"},
|
||||
"name_en": {"row_0": "Hakata Dolls Matsuo"},
|
||||
}
|
||||
)
|
||||
|
||||
result1 = pd.read_json(StringIO(df.to_json()))
|
||||
result2 = DataFrame.from_dict(json.loads(df.to_json()))
|
||||
tm.assert_frame_equal(result1, df)
|
||||
tm.assert_frame_equal(result2, df)
|
||||
|
||||
def test_to_json(self, df_table, using_infer_string):
|
||||
df = df_table
|
||||
df.index.name = "idx"
|
||||
result = df.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result["schema"]
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
{"name": "idx", "type": "integer"},
|
||||
{"name": "A", "type": "integer"},
|
||||
{"name": "B", "type": "string"},
|
||||
{"name": "C", "type": "datetime"},
|
||||
{"name": "D", "type": "duration"},
|
||||
{
|
||||
"constraints": {"enum": ["a", "b", "c"]},
|
||||
"name": "E",
|
||||
"ordered": False,
|
||||
"type": "any",
|
||||
},
|
||||
{
|
||||
"constraints": {"enum": ["a", "b", "c"]},
|
||||
"name": "F",
|
||||
"ordered": True,
|
||||
"type": "any",
|
||||
},
|
||||
{"name": "G", "type": "number"},
|
||||
{"name": "H", "type": "datetime", "tz": "US/Central"},
|
||||
]
|
||||
|
||||
if using_infer_string:
|
||||
fields[2] = {"name": "B", "type": "string", "extDtype": "str"}
|
||||
|
||||
schema = {"fields": fields, "primaryKey": ["idx"]}
|
||||
data = [
|
||||
OrderedDict(
|
||||
[
|
||||
("idx", 0),
|
||||
("A", 1),
|
||||
("B", "a"),
|
||||
("C", "2016-01-01T00:00:00.000"),
|
||||
("D", "P0DT1H0M0S"),
|
||||
("E", "a"),
|
||||
("F", "a"),
|
||||
("G", 1.0),
|
||||
("H", "2016-01-01T06:00:00.000Z"),
|
||||
]
|
||||
),
|
||||
OrderedDict(
|
||||
[
|
||||
("idx", 1),
|
||||
("A", 2),
|
||||
("B", "b"),
|
||||
("C", "2016-01-02T00:00:00.000"),
|
||||
("D", "P0DT1H1M0S"),
|
||||
("E", "b"),
|
||||
("F", "b"),
|
||||
("G", 2.0),
|
||||
("H", "2016-01-02T06:00:00.000Z"),
|
||||
]
|
||||
),
|
||||
OrderedDict(
|
||||
[
|
||||
("idx", 2),
|
||||
("A", 3),
|
||||
("B", "c"),
|
||||
("C", "2016-01-03T00:00:00.000"),
|
||||
("D", "P0DT1H2M0S"),
|
||||
("E", "c"),
|
||||
("F", "c"),
|
||||
("G", 3.0),
|
||||
("H", "2016-01-03T06:00:00.000Z"),
|
||||
]
|
||||
),
|
||||
OrderedDict(
|
||||
[
|
||||
("idx", 3),
|
||||
("A", 4),
|
||||
("B", "c"),
|
||||
("C", "2016-01-04T00:00:00.000"),
|
||||
("D", "P0DT1H3M0S"),
|
||||
("E", "c"),
|
||||
("F", "c"),
|
||||
("G", 4.0),
|
||||
("H", "2016-01-04T06:00:00.000Z"),
|
||||
]
|
||||
),
|
||||
]
|
||||
expected = OrderedDict([("schema", schema), ("data", data)])
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_to_json_float_index(self):
|
||||
data = pd.Series(1, index=[1.0, 2.0])
|
||||
result = data.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
(
|
||||
"schema",
|
||||
{
|
||||
"fields": [
|
||||
{"name": "index", "type": "number"},
|
||||
{"name": "values", "type": "integer"},
|
||||
],
|
||||
"primaryKey": ["index"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"data",
|
||||
[
|
||||
OrderedDict([("index", 1.0), ("values", 1)]),
|
||||
OrderedDict([("index", 2.0), ("values", 1)]),
|
||||
],
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_to_json_period_index(self):
|
||||
idx = pd.period_range("2016", freq="Q-JAN", periods=2)
|
||||
data = pd.Series(1, idx)
|
||||
result = data.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
{"freq": "QE-JAN", "name": "index", "type": "datetime"},
|
||||
{"name": "values", "type": "integer"},
|
||||
]
|
||||
|
||||
schema = {"fields": fields, "primaryKey": ["index"]}
|
||||
data = [
|
||||
OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
|
||||
OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
|
||||
]
|
||||
expected = OrderedDict([("schema", schema), ("data", data)])
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_to_json_categorical_index(self):
|
||||
data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
|
||||
result = data.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
(
|
||||
"schema",
|
||||
{
|
||||
"fields": [
|
||||
{
|
||||
"name": "index",
|
||||
"type": "any",
|
||||
"constraints": {"enum": ["a", "b"]},
|
||||
"ordered": False,
|
||||
},
|
||||
{"name": "values", "type": "integer"},
|
||||
],
|
||||
"primaryKey": ["index"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"data",
|
||||
[
|
||||
OrderedDict([("index", "a"), ("values", 1)]),
|
||||
OrderedDict([("index", "b"), ("values", 1)]),
|
||||
],
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_date_format_raises(self, df_table):
|
||||
msg = (
|
||||
"Trying to write with `orient='table'` and `date_format='epoch'`. Table "
|
||||
"Schema requires dates to be formatted with `date_format='iso'`"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df_table.to_json(orient="table", date_format="epoch")
|
||||
|
||||
# others work
|
||||
df_table.to_json(orient="table", date_format="iso")
|
||||
df_table.to_json(orient="table")
|
||||
|
||||
def test_convert_pandas_type_to_json_field_int(self, index_or_series):
|
||||
kind = index_or_series
|
||||
data = [1, 2, 3]
|
||||
result = convert_pandas_type_to_json_field(kind(data, name="name"))
|
||||
expected = {"name": "name", "type": "integer"}
|
||||
assert result == expected
|
||||
|
||||
def test_convert_pandas_type_to_json_field_float(self, index_or_series):
|
||||
kind = index_or_series
|
||||
data = [1.0, 2.0, 3.0]
|
||||
result = convert_pandas_type_to_json_field(kind(data, name="name"))
|
||||
expected = {"name": "name", "type": "number"}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dt_args,extra_exp", [({}, {}), ({"utc": True}, {"tz": "UTC"})]
|
||||
)
|
||||
@pytest.mark.parametrize("wrapper", [None, pd.Series])
|
||||
def test_convert_pandas_type_to_json_field_datetime(
|
||||
self, dt_args, extra_exp, wrapper
|
||||
):
|
||||
data = [1.0, 2.0, 3.0]
|
||||
data = pd.to_datetime(data, **dt_args)
|
||||
if wrapper is pd.Series:
|
||||
data = pd.Series(data, name="values")
|
||||
result = convert_pandas_type_to_json_field(data)
|
||||
expected = {"name": "values", "type": "datetime"}
|
||||
expected.update(extra_exp)
|
||||
assert result == expected
|
||||
|
||||
def test_convert_pandas_type_to_json_period_range(self):
|
||||
arr = pd.period_range("2016", freq="Y-DEC", periods=4)
|
||||
result = convert_pandas_type_to_json_field(arr)
|
||||
expected = {"name": "values", "type": "datetime", "freq": "YE-DEC"}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
|
||||
@pytest.mark.parametrize("ordered", [True, False])
|
||||
def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
|
||||
data = ["a", "b", "c"]
|
||||
if kind is pd.Categorical:
|
||||
arr = pd.Series(kind(data, ordered=ordered), name="cats")
|
||||
elif kind is pd.CategoricalIndex:
|
||||
arr = kind(data, ordered=ordered, name="cats")
|
||||
|
||||
result = convert_pandas_type_to_json_field(arr)
|
||||
expected = {
|
||||
"name": "cats",
|
||||
"type": "any",
|
||||
"constraints": {"enum": data},
|
||||
"ordered": ordered,
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"inp,exp",
|
||||
[
|
||||
({"type": "integer"}, "int64"),
|
||||
({"type": "number"}, "float64"),
|
||||
({"type": "boolean"}, "bool"),
|
||||
({"type": "duration"}, "timedelta64"),
|
||||
({"type": "datetime"}, "datetime64[ns]"),
|
||||
({"type": "datetime", "tz": "US/Hawaii"}, "datetime64[ns, US/Hawaii]"),
|
||||
({"type": "any"}, "object"),
|
||||
(
|
||||
{
|
||||
"type": "any",
|
||||
"constraints": {"enum": ["a", "b", "c"]},
|
||||
"ordered": False,
|
||||
},
|
||||
CategoricalDtype(categories=["a", "b", "c"], ordered=False),
|
||||
),
|
||||
(
|
||||
{
|
||||
"type": "any",
|
||||
"constraints": {"enum": ["a", "b", "c"]},
|
||||
"ordered": True,
|
||||
},
|
||||
CategoricalDtype(categories=["a", "b", "c"], ordered=True),
|
||||
),
|
||||
({"type": "string"}, None),
|
||||
],
|
||||
)
|
||||
def test_convert_json_field_to_pandas_type(self, inp, exp):
|
||||
field = {"name": "foo"}
|
||||
field.update(inp)
|
||||
assert convert_json_field_to_pandas_type(field) == exp
|
||||
|
||||
@pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
|
||||
def test_convert_json_field_to_pandas_type_raises(self, inp):
|
||||
field = {"type": inp}
|
||||
with pytest.raises(
|
||||
ValueError, match=f"Unsupported or invalid field type: {inp}"
|
||||
):
|
||||
convert_json_field_to_pandas_type(field)
|
||||
|
||||
def test_categorical(self):
|
||||
s = pd.Series(pd.Categorical(["a", "b", "a"]))
|
||||
s.index.name = "idx"
|
||||
result = s.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
{"name": "idx", "type": "integer"},
|
||||
{
|
||||
"constraints": {"enum": ["a", "b"]},
|
||||
"name": "values",
|
||||
"ordered": False,
|
||||
"type": "any",
|
||||
},
|
||||
]
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
("schema", {"fields": fields, "primaryKey": ["idx"]}),
|
||||
(
|
||||
"data",
|
||||
[
|
||||
OrderedDict([("idx", 0), ("values", "a")]),
|
||||
OrderedDict([("idx", 1), ("values", "b")]),
|
||||
OrderedDict([("idx", 2), ("values", "a")]),
|
||||
],
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"idx,nm,prop",
|
||||
[
|
||||
(pd.Index([1]), "index", "name"),
|
||||
(pd.Index([1], name="myname"), "myname", "name"),
|
||||
(
|
||||
pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
|
||||
["level_0", "level_1"],
|
||||
"names",
|
||||
),
|
||||
(
|
||||
pd.MultiIndex.from_product(
|
||||
[("a", "b"), ("c", "d")], names=["n1", "n2"]
|
||||
),
|
||||
["n1", "n2"],
|
||||
"names",
|
||||
),
|
||||
(
|
||||
pd.MultiIndex.from_product(
|
||||
[("a", "b"), ("c", "d")], names=["n1", None]
|
||||
),
|
||||
["n1", "level_1"],
|
||||
"names",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_set_names_unset(self, idx, nm, prop):
|
||||
data = pd.Series(1, idx)
|
||||
result = set_default_names(data)
|
||||
assert getattr(result.index, prop) == nm
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"idx",
|
||||
[
|
||||
pd.Index([], name="index"),
|
||||
pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")),
|
||||
pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")),
|
||||
],
|
||||
)
|
||||
def test_warns_non_roundtrippable_names(self, idx):
|
||||
# GH 19130
|
||||
df = DataFrame(index=idx)
|
||||
df.index.name = "index"
|
||||
with tm.assert_produces_warning():
|
||||
set_default_names(df)
|
||||
|
||||
def test_timestamp_in_columns(self):
|
||||
df = DataFrame(
|
||||
[[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]
|
||||
)
|
||||
result = df.to_json(orient="table")
|
||||
js = json.loads(result)
|
||||
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
|
||||
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"case",
|
||||
[
|
||||
pd.Series([1], index=pd.Index([1], name="a"), name="a"),
|
||||
DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
|
||||
DataFrame(
|
||||
{"A": [1]},
|
||||
index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_overlapping_names(self, case):
|
||||
with pytest.raises(ValueError, match="Overlapping"):
|
||||
case.to_json(orient="table")
|
||||
|
||||
def test_mi_falsey_name(self):
|
||||
# GH 16203
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((4, 4)),
|
||||
index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]),
|
||||
)
|
||||
result = [x["name"] for x in build_table_schema(df)["fields"]]
|
||||
assert result == ["level_0", "level_1", 0, 1, 2, 3]
|
||||
|
||||
|
||||
class TestTableOrientReader:
|
||||
@pytest.mark.parametrize(
|
||||
"index_nm",
|
||||
[None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"vals",
|
||||
[
|
||||
{"ints": [1, 2, 3, 4]},
|
||||
{"objects": ["a", "b", "c", "d"]},
|
||||
{"objects": ["1", "2", "3", "4"]},
|
||||
{"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
|
||||
{"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
|
||||
{
|
||||
"ordered_cats": pd.Series(
|
||||
pd.Categorical(["a", "b", "c", "c"], ordered=True)
|
||||
)
|
||||
},
|
||||
{"floats": [1.0, 2.0, 3.0, 4.0]},
|
||||
{"floats": [1.1, 2.2, 3.3, 4.4]},
|
||||
{"bools": [True, False, False, True]},
|
||||
{
|
||||
"timezones": pd.date_range(
|
||||
"2016-01-01", freq="d", periods=4, tz="US/Central"
|
||||
) # added in # GH 35973
|
||||
},
|
||||
],
|
||||
)
|
||||
def test_read_json_table_orient(self, index_nm, vals, recwarn):
|
||||
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
|
||||
out = df.to_json(orient="table")
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
@pytest.mark.parametrize("index_nm", [None, "idx", "index"])
|
||||
@pytest.mark.parametrize(
|
||||
"vals",
|
||||
[{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}],
|
||||
)
|
||||
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
|
||||
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
|
||||
out = df.to_json(orient="table")
|
||||
with pytest.raises(NotImplementedError, match="can not yet read "):
|
||||
pd.read_json(out, orient="table")
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_nm",
|
||||
[None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"vals",
|
||||
[
|
||||
{"ints": [1, 2, 3, 4]},
|
||||
{"objects": ["a", "b", "c", "d"]},
|
||||
{"objects": ["1", "2", "3", "4"]},
|
||||
{"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
|
||||
{"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
|
||||
{
|
||||
"ordered_cats": pd.Series(
|
||||
pd.Categorical(["a", "b", "c", "c"], ordered=True)
|
||||
)
|
||||
},
|
||||
{"floats": [1.0, 2.0, 3.0, 4.0]},
|
||||
{"floats": [1.1, 2.2, 3.3, 4.4]},
|
||||
{"bools": [True, False, False, True]},
|
||||
{
|
||||
"timezones": pd.date_range(
|
||||
"2016-01-01", freq="d", periods=4, tz="US/Central"
|
||||
) # added in # GH 35973
|
||||
},
|
||||
],
|
||||
)
|
||||
def test_read_json_table_period_orient(self, index_nm, vals, recwarn):
|
||||
df = DataFrame(
|
||||
vals,
|
||||
index=pd.Index(
|
||||
(pd.Period(f"2022Q{q}") for q in range(1, 5)), name=index_nm
|
||||
),
|
||||
)
|
||||
out = df.to_json(orient="table")
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"idx",
|
||||
[
|
||||
pd.Index(range(4)),
|
||||
pd.date_range(
|
||||
"2020-08-30",
|
||||
freq="d",
|
||||
periods=4,
|
||||
)._with_freq(None),
|
||||
pd.date_range(
|
||||
"2020-08-30", freq="d", periods=4, tz="US/Central"
|
||||
)._with_freq(None),
|
||||
pd.MultiIndex.from_product(
|
||||
[
|
||||
pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"),
|
||||
["x", "y"],
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"vals",
|
||||
[
|
||||
{"floats": [1.1, 2.2, 3.3, 4.4]},
|
||||
{"dates": pd.date_range("2020-08-30", freq="d", periods=4)},
|
||||
{
|
||||
"timezones": pd.date_range(
|
||||
"2020-08-30", freq="d", periods=4, tz="Europe/London"
|
||||
)
|
||||
},
|
||||
],
|
||||
)
|
||||
def test_read_json_table_timezones_orient(self, idx, vals, recwarn):
|
||||
# GH 35973
|
||||
df = DataFrame(vals, index=idx)
|
||||
out = df.to_json(orient="table")
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
def test_comprehensive(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3, 4],
|
||||
"B": ["a", "b", "c", "c"],
|
||||
"C": pd.date_range("2016-01-01", freq="d", periods=4),
|
||||
# 'D': pd.timedelta_range('1h', periods=4, freq='min'),
|
||||
"E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
|
||||
"F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
|
||||
"G": [1.1, 2.2, 3.3, 4.4],
|
||||
"H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
|
||||
"I": [True, False, False, True],
|
||||
},
|
||||
index=pd.Index(range(4), name="idx"),
|
||||
)
|
||||
|
||||
out = StringIO(df.to_json(orient="table"))
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_names",
|
||||
[[None, None], ["foo", "bar"], ["foo", None], [None, "foo"], ["index", "foo"]],
|
||||
)
|
||||
def test_multiindex(self, index_names):
|
||||
# GH 18912
|
||||
df = DataFrame(
|
||||
[["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]],
|
||||
index=[["A", "B"], ["Null", "Eins"]],
|
||||
columns=["Aussprache", "Griechisch", "Args"],
|
||||
)
|
||||
df.index.names = index_names
|
||||
out = StringIO(df.to_json(orient="table"))
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
def test_empty_frame_roundtrip(self):
|
||||
# GH 21287
|
||||
df = DataFrame(columns=["a", "b", "c"])
|
||||
expected = df.copy()
|
||||
out = StringIO(df.to_json(orient="table"))
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
def test_read_json_orient_table_old_schema_version(self):
|
||||
df_json = """
|
||||
{
|
||||
"schema":{
|
||||
"fields":[
|
||||
{"name":"index","type":"integer"},
|
||||
{"name":"a","type":"string"}
|
||||
],
|
||||
"primaryKey":["index"],
|
||||
"pandas_version":"0.20.0"
|
||||
},
|
||||
"data":[
|
||||
{"index":0,"a":1},
|
||||
{"index":1,"a":2.0},
|
||||
{"index":2,"a":"s"}
|
||||
]
|
||||
}
|
||||
"""
|
||||
expected = DataFrame({"a": [1, 2.0, "s"]})
|
||||
result = pd.read_json(StringIO(df_json), orient="table")
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
@pytest.mark.parametrize("freq", ["M", "2M", "Q", "2Q", "Y", "2Y"])
|
||||
def test_read_json_table_orient_period_depr_freq(self, freq, recwarn):
|
||||
# GH#9586
|
||||
df = DataFrame(
|
||||
{"ints": [1, 2]},
|
||||
index=pd.PeriodIndex(["2020-01", "2021-06"], freq=freq),
|
||||
)
|
||||
out = df.to_json(orient="table")
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
@@ -0,0 +1,317 @@
|
||||
"""Tests for ExtensionDtype Table Schema integration."""
|
||||
|
||||
from collections import OrderedDict
|
||||
import datetime as dt
|
||||
import decimal
|
||||
from io import StringIO
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
NA,
|
||||
DataFrame,
|
||||
Index,
|
||||
array,
|
||||
read_json,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays.integer import Int64Dtype
|
||||
from pandas.core.arrays.string_ import StringDtype
|
||||
from pandas.core.series import Series
|
||||
from pandas.tests.extension.date import (
|
||||
DateArray,
|
||||
DateDtype,
|
||||
)
|
||||
from pandas.tests.extension.decimal.array import (
|
||||
DecimalArray,
|
||||
DecimalDtype,
|
||||
)
|
||||
|
||||
from pandas.io.json._table_schema import (
|
||||
as_json_table_type,
|
||||
build_table_schema,
|
||||
)
|
||||
|
||||
|
||||
class TestBuildSchema:
|
||||
def test_build_table_schema(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": DateArray([dt.date(2021, 10, 10)]),
|
||||
"B": DecimalArray([decimal.Decimal(10)]),
|
||||
"C": array(["pandas"], dtype="string"),
|
||||
"D": array([10], dtype="Int64"),
|
||||
}
|
||||
)
|
||||
result = build_table_schema(df, version=False)
|
||||
expected = {
|
||||
"fields": [
|
||||
{"name": "index", "type": "integer"},
|
||||
{"name": "A", "type": "any", "extDtype": "DateDtype"},
|
||||
{"name": "B", "type": "number", "extDtype": "decimal"},
|
||||
{"name": "C", "type": "string", "extDtype": "string"},
|
||||
{"name": "D", "type": "integer", "extDtype": "Int64"},
|
||||
],
|
||||
"primaryKey": ["index"],
|
||||
}
|
||||
assert result == expected
|
||||
result = build_table_schema(df)
|
||||
assert "pandas_version" in result
|
||||
|
||||
|
||||
class TestTableSchemaType:
|
||||
@pytest.mark.parametrize(
|
||||
"date_data",
|
||||
[
|
||||
DateArray([dt.date(2021, 10, 10)]),
|
||||
DateArray(dt.date(2021, 10, 10)),
|
||||
Series(DateArray(dt.date(2021, 10, 10))),
|
||||
],
|
||||
)
|
||||
def test_as_json_table_type_ext_date_array_dtype(self, date_data):
|
||||
assert as_json_table_type(date_data.dtype) == "any"
|
||||
|
||||
def test_as_json_table_type_ext_date_dtype(self):
|
||||
assert as_json_table_type(DateDtype()) == "any"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"decimal_data",
|
||||
[
|
||||
DecimalArray([decimal.Decimal(10)]),
|
||||
Series(DecimalArray([decimal.Decimal(10)])),
|
||||
],
|
||||
)
|
||||
def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data):
|
||||
assert as_json_table_type(decimal_data.dtype) == "number"
|
||||
|
||||
def test_as_json_table_type_ext_decimal_dtype(self):
|
||||
assert as_json_table_type(DecimalDtype()) == "number"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"string_data",
|
||||
[
|
||||
array(["pandas"], dtype="string"),
|
||||
Series(array(["pandas"], dtype="string")),
|
||||
],
|
||||
)
|
||||
def test_as_json_table_type_ext_string_array_dtype(self, string_data):
|
||||
assert as_json_table_type(string_data.dtype) == "string"
|
||||
|
||||
def test_as_json_table_type_ext_string_dtype(self):
|
||||
assert as_json_table_type(StringDtype()) == "string"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"integer_data",
|
||||
[
|
||||
array([10], dtype="Int64"),
|
||||
Series(array([10], dtype="Int64")),
|
||||
],
|
||||
)
|
||||
def test_as_json_table_type_ext_integer_array_dtype(self, integer_data):
|
||||
assert as_json_table_type(integer_data.dtype) == "integer"
|
||||
|
||||
def test_as_json_table_type_ext_integer_dtype(self):
|
||||
assert as_json_table_type(Int64Dtype()) == "integer"
|
||||
|
||||
|
||||
class TestTableOrient:
|
||||
@pytest.fixture
|
||||
def da(self):
|
||||
return DateArray([dt.date(2021, 10, 10)])
|
||||
|
||||
@pytest.fixture
|
||||
def dc(self):
|
||||
return DecimalArray([decimal.Decimal(10)])
|
||||
|
||||
@pytest.fixture
|
||||
def sa(self):
|
||||
return array(["pandas"], dtype="string")
|
||||
|
||||
@pytest.fixture
|
||||
def ia(self):
|
||||
return array([10], dtype="Int64")
|
||||
|
||||
@pytest.fixture
|
||||
def df(self, da, dc, sa, ia):
|
||||
return DataFrame(
|
||||
{
|
||||
"A": da,
|
||||
"B": dc,
|
||||
"C": sa,
|
||||
"D": ia,
|
||||
}
|
||||
)
|
||||
|
||||
def test_build_date_series(self, da):
|
||||
s = Series(da, name="a")
|
||||
s.index.name = "id"
|
||||
result = s.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result["schema"]
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
{"name": "id", "type": "integer"},
|
||||
{"name": "a", "type": "any", "extDtype": "DateDtype"},
|
||||
]
|
||||
|
||||
schema = {"fields": fields, "primaryKey": ["id"]}
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
("schema", schema),
|
||||
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_build_decimal_series(self, dc):
|
||||
s = Series(dc, name="a")
|
||||
s.index.name = "id"
|
||||
result = s.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result["schema"]
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
{"name": "id", "type": "integer"},
|
||||
{"name": "a", "type": "number", "extDtype": "decimal"},
|
||||
]
|
||||
|
||||
schema = {"fields": fields, "primaryKey": ["id"]}
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
("schema", schema),
|
||||
("data", [OrderedDict([("id", 0), ("a", 10.0)])]),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_build_string_series(self, sa):
|
||||
s = Series(sa, name="a")
|
||||
s.index.name = "id"
|
||||
result = s.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result["schema"]
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
{"name": "id", "type": "integer"},
|
||||
{"name": "a", "type": "string", "extDtype": "string"},
|
||||
]
|
||||
|
||||
schema = {"fields": fields, "primaryKey": ["id"]}
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
("schema", schema),
|
||||
("data", [OrderedDict([("id", 0), ("a", "pandas")])]),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_build_int64_series(self, ia):
|
||||
s = Series(ia, name="a")
|
||||
s.index.name = "id"
|
||||
result = s.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result["schema"]
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
{"name": "id", "type": "integer"},
|
||||
{"name": "a", "type": "integer", "extDtype": "Int64"},
|
||||
]
|
||||
|
||||
schema = {"fields": fields, "primaryKey": ["id"]}
|
||||
|
||||
expected = OrderedDict(
|
||||
[
|
||||
("schema", schema),
|
||||
("data", [OrderedDict([("id", 0), ("a", 10)])]),
|
||||
]
|
||||
)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_to_json(self, df):
|
||||
df = df.copy()
|
||||
df.index.name = "idx"
|
||||
result = df.to_json(orient="table", date_format="iso")
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result["schema"]
|
||||
result["schema"].pop("pandas_version")
|
||||
|
||||
fields = [
|
||||
OrderedDict({"name": "idx", "type": "integer"}),
|
||||
OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}),
|
||||
OrderedDict({"name": "B", "type": "number", "extDtype": "decimal"}),
|
||||
OrderedDict({"name": "C", "type": "string", "extDtype": "string"}),
|
||||
OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}),
|
||||
]
|
||||
|
||||
schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]})
|
||||
data = [
|
||||
OrderedDict(
|
||||
[
|
||||
("idx", 0),
|
||||
("A", "2021-10-10T00:00:00.000"),
|
||||
("B", 10.0),
|
||||
("C", "pandas"),
|
||||
("D", 10),
|
||||
]
|
||||
)
|
||||
]
|
||||
expected = OrderedDict([("schema", schema), ("data", data)])
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_json_ext_dtype_reading_roundtrip(self):
|
||||
# GH#40255
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": Series([2, NA], dtype="Int64"),
|
||||
"b": Series([1.5, NA], dtype="Float64"),
|
||||
"c": Series([True, NA], dtype="boolean"),
|
||||
},
|
||||
index=Index([1, NA], dtype="Int64"),
|
||||
)
|
||||
expected = df.copy()
|
||||
data_json = df.to_json(orient="table", indent=4)
|
||||
result = read_json(StringIO(data_json), orient="table")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_json_ext_dtype_reading(self):
|
||||
# GH#40255
|
||||
data_json = """{
|
||||
"schema":{
|
||||
"fields":[
|
||||
{
|
||||
"name":"a",
|
||||
"type":"integer",
|
||||
"extDtype":"Int64"
|
||||
}
|
||||
],
|
||||
},
|
||||
"data":[
|
||||
{
|
||||
"a":2
|
||||
},
|
||||
{
|
||||
"a":null
|
||||
}
|
||||
]
|
||||
}"""
|
||||
result = read_json(StringIO(data_json), orient="table")
|
||||
expected = DataFrame({"a": Series([2, NA], dtype="Int64")})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,907 @@
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
json_normalize,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.json._normalize import nested_to_record
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def deep_nested():
|
||||
# deeply nested data
|
||||
return [
|
||||
{
|
||||
"country": "USA",
|
||||
"states": [
|
||||
{
|
||||
"name": "California",
|
||||
"cities": [
|
||||
{"name": "San Francisco", "pop": 12345},
|
||||
{"name": "Los Angeles", "pop": 12346},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Ohio",
|
||||
"cities": [
|
||||
{"name": "Columbus", "pop": 1234},
|
||||
{"name": "Cleveland", "pop": 1236},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"country": "Germany",
|
||||
"states": [
|
||||
{"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
|
||||
{
|
||||
"name": "Nordrhein-Westfalen",
|
||||
"cities": [
|
||||
{"name": "Duesseldorf", "pop": 1238},
|
||||
{"name": "Koeln", "pop": 1239},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def state_data():
|
||||
return [
|
||||
{
|
||||
"counties": [
|
||||
{"name": "Dade", "population": 12345},
|
||||
{"name": "Broward", "population": 40000},
|
||||
{"name": "Palm Beach", "population": 60000},
|
||||
],
|
||||
"info": {"governor": "Rick Scott"},
|
||||
"shortname": "FL",
|
||||
"state": "Florida",
|
||||
},
|
||||
{
|
||||
"counties": [
|
||||
{"name": "Summit", "population": 1234},
|
||||
{"name": "Cuyahoga", "population": 1337},
|
||||
],
|
||||
"info": {"governor": "John Kasich"},
|
||||
"shortname": "OH",
|
||||
"state": "Ohio",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def author_missing_data():
|
||||
return [
|
||||
{"info": None},
|
||||
{
|
||||
"info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
|
||||
"author_name": {"first": "Jane", "last_name": "Doe"},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def missing_metadata():
|
||||
return [
|
||||
{
|
||||
"name": "Alice",
|
||||
"addresses": [
|
||||
{
|
||||
"number": 9562,
|
||||
"street": "Morris St.",
|
||||
"city": "Massillon",
|
||||
"state": "OH",
|
||||
"zip": 44646,
|
||||
}
|
||||
],
|
||||
"previous_residences": {"cities": [{"city_name": "Foo York City"}]},
|
||||
},
|
||||
{
|
||||
"addresses": [
|
||||
{
|
||||
"number": 8449,
|
||||
"street": "Spring St.",
|
||||
"city": "Elizabethton",
|
||||
"state": "TN",
|
||||
"zip": 37643,
|
||||
}
|
||||
],
|
||||
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def max_level_test_input_data():
|
||||
"""
|
||||
input data to test json_normalize with max_level param
|
||||
"""
|
||||
return [
|
||||
{
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Lookup": {
|
||||
"TextField": "Some text",
|
||||
"UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
},
|
||||
"Image": {"a": "b"},
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
class TestJSONNormalize:
|
||||
def test_simple_records(self):
|
||||
recs = [
|
||||
{"a": 1, "b": 2, "c": 3},
|
||||
{"a": 4, "b": 5, "c": 6},
|
||||
{"a": 7, "b": 8, "c": 9},
|
||||
{"a": 10, "b": 11, "c": 12},
|
||||
]
|
||||
|
||||
result = json_normalize(recs)
|
||||
expected = DataFrame(recs)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize(self, state_data):
|
||||
result = json_normalize(state_data[0], "counties")
|
||||
expected = DataFrame(state_data[0]["counties"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, "counties")
|
||||
|
||||
expected = []
|
||||
for rec in state_data:
|
||||
expected.extend(rec["counties"])
|
||||
expected = DataFrame(expected)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, "counties", meta="state")
|
||||
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fields_list_type_normalize(self):
|
||||
parse_metadata_fields_list_type = [
|
||||
{"values": [1, 2, 3], "metadata": {"listdata": [1, 2]}}
|
||||
]
|
||||
result = json_normalize(
|
||||
parse_metadata_fields_list_type,
|
||||
record_path=["values"],
|
||||
meta=[["metadata", "listdata"]],
|
||||
)
|
||||
expected = DataFrame(
|
||||
{0: [1, 2, 3], "metadata.listdata": [[1, 2], [1, 2], [1, 2]]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_empty_array(self):
|
||||
result = json_normalize([])
|
||||
expected = DataFrame()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, record_path, exception_type",
|
||||
[
|
||||
([{"a": 0}, {"a": 1}], None, None),
|
||||
({"a": [{"a": 0}, {"a": 1}]}, "a", None),
|
||||
('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
|
||||
(None, None, NotImplementedError),
|
||||
],
|
||||
)
|
||||
def test_accepted_input(self, data, record_path, exception_type):
|
||||
if exception_type is not None:
|
||||
with pytest.raises(exception_type, match=""):
|
||||
json_normalize(data, record_path=record_path)
|
||||
else:
|
||||
result = json_normalize(data, record_path=record_path)
|
||||
expected = DataFrame([0, 1], columns=["a"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize_with_separator(self, deep_nested):
|
||||
# GH 14883
|
||||
result = json_normalize({"A": {"A": 1, "B": 2}})
|
||||
expected = DataFrame([[1, 2]], columns=["A.A", "A.B"])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_")
|
||||
expected = DataFrame([[1, 2]], columns=["A_A", "A_B"])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3")
|
||||
expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize(
|
||||
deep_nested,
|
||||
["states", "cities"],
|
||||
meta=["country", ["states", "name"]],
|
||||
sep="_",
|
||||
)
|
||||
expected = Index(["name", "pop", "country", "states_name"]).sort_values()
|
||||
assert result.columns.sort_values().equals(expected)
|
||||
|
||||
def test_normalize_with_multichar_separator(self):
|
||||
# GH #43831
|
||||
data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}}
|
||||
result = json_normalize(data, sep="__")
|
||||
expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_value_array_record_prefix(self):
|
||||
# GH 21536
|
||||
result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
|
||||
expected = DataFrame([[1], [2]], columns=["Prefix.0"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_nested_object_record_path(self):
|
||||
# GH 22706
|
||||
data = {
|
||||
"state": "Florida",
|
||||
"info": {
|
||||
"governor": "Rick Scott",
|
||||
"counties": [
|
||||
{"name": "Dade", "population": 12345},
|
||||
{"name": "Broward", "population": 40000},
|
||||
{"name": "Palm Beach", "population": 60000},
|
||||
],
|
||||
},
|
||||
}
|
||||
result = json_normalize(data, record_path=["info", "counties"])
|
||||
expected = DataFrame(
|
||||
[["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
|
||||
columns=["name", "population"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_more_deeply_nested(self, deep_nested):
|
||||
result = json_normalize(
|
||||
deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
|
||||
)
|
||||
ex_data = {
|
||||
"country": ["USA"] * 4 + ["Germany"] * 3,
|
||||
"states.name": [
|
||||
"California",
|
||||
"California",
|
||||
"Ohio",
|
||||
"Ohio",
|
||||
"Bayern",
|
||||
"Nordrhein-Westfalen",
|
||||
"Nordrhein-Westfalen",
|
||||
],
|
||||
"name": [
|
||||
"San Francisco",
|
||||
"Los Angeles",
|
||||
"Columbus",
|
||||
"Cleveland",
|
||||
"Munich",
|
||||
"Duesseldorf",
|
||||
"Koeln",
|
||||
],
|
||||
"pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
|
||||
}
|
||||
|
||||
expected = DataFrame(ex_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_shallow_nested(self):
|
||||
data = [
|
||||
{
|
||||
"state": "Florida",
|
||||
"shortname": "FL",
|
||||
"info": {"governor": "Rick Scott"},
|
||||
"counties": [
|
||||
{"name": "Dade", "population": 12345},
|
||||
{"name": "Broward", "population": 40000},
|
||||
{"name": "Palm Beach", "population": 60000},
|
||||
],
|
||||
},
|
||||
{
|
||||
"state": "Ohio",
|
||||
"shortname": "OH",
|
||||
"info": {"governor": "John Kasich"},
|
||||
"counties": [
|
||||
{"name": "Summit", "population": 1234},
|
||||
{"name": "Cuyahoga", "population": 1337},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
result = json_normalize(
|
||||
data, "counties", ["state", "shortname", ["info", "governor"]]
|
||||
)
|
||||
ex_data = {
|
||||
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
|
||||
"state": ["Florida"] * 3 + ["Ohio"] * 2,
|
||||
"shortname": ["FL", "FL", "FL", "OH", "OH"],
|
||||
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
|
||||
"population": [12345, 40000, 60000, 1234, 1337],
|
||||
}
|
||||
expected = DataFrame(ex_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_nested_meta_path_with_nested_record_path(self, state_data):
|
||||
# GH 27220
|
||||
result = json_normalize(
|
||||
data=state_data,
|
||||
record_path=["counties"],
|
||||
meta=["state", "shortname", ["info", "governor"]],
|
||||
errors="ignore",
|
||||
)
|
||||
|
||||
ex_data = {
|
||||
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
|
||||
"population": [12345, 40000, 60000, 1234, 1337],
|
||||
"state": ["Florida"] * 3 + ["Ohio"] * 2,
|
||||
"shortname": ["FL"] * 3 + ["OH"] * 2,
|
||||
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
|
||||
}
|
||||
|
||||
expected = DataFrame(ex_data)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_meta_name_conflict(self):
|
||||
data = [
|
||||
{
|
||||
"foo": "hello",
|
||||
"bar": "there",
|
||||
"data": [
|
||||
{"foo": "something", "bar": "else"},
|
||||
{"foo": "something2", "bar": "else2"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
json_normalize(data, "data", meta=["foo", "bar"])
|
||||
|
||||
result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")
|
||||
|
||||
for val in ["metafoo", "metabar", "foo", "bar"]:
|
||||
assert val in result
|
||||
|
||||
def test_meta_parameter_not_modified(self):
|
||||
# GH 18610
|
||||
data = [
|
||||
{
|
||||
"foo": "hello",
|
||||
"bar": "there",
|
||||
"data": [
|
||||
{"foo": "something", "bar": "else"},
|
||||
{"foo": "something2", "bar": "else2"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
COLUMNS = ["foo", "bar"]
|
||||
result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")
|
||||
|
||||
assert COLUMNS == ["foo", "bar"]
|
||||
for val in ["metafoo", "metabar", "foo", "bar"]:
|
||||
assert val in result
|
||||
|
||||
def test_record_prefix(self, state_data):
|
||||
result = json_normalize(state_data[0], "counties")
|
||||
expected = DataFrame(state_data[0]["counties"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(
|
||||
state_data, "counties", meta="state", record_prefix="county_"
|
||||
)
|
||||
|
||||
expected = []
|
||||
for rec in state_data:
|
||||
expected.extend(rec["counties"])
|
||||
expected = DataFrame(expected)
|
||||
expected = expected.rename(columns=lambda x: "county_" + x)
|
||||
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_non_ascii_key(self):
|
||||
testjson = (
|
||||
b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
|
||||
b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
|
||||
).decode("utf8")
|
||||
|
||||
testdata = {
|
||||
b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
|
||||
"sub.A": [1, 3],
|
||||
"sub.B": [2, 4],
|
||||
}
|
||||
expected = DataFrame(testdata)
|
||||
|
||||
result = json_normalize(json.loads(testjson))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_missing_field(self, author_missing_data):
|
||||
# GH20030:
|
||||
result = json_normalize(author_missing_data)
|
||||
ex_data = [
|
||||
{
|
||||
"info": np.nan,
|
||||
"info.created_at": np.nan,
|
||||
"info.last_updated": np.nan,
|
||||
"author_name.first": np.nan,
|
||||
"author_name.last_name": np.nan,
|
||||
},
|
||||
{
|
||||
"info": None,
|
||||
"info.created_at": "11/08/1993",
|
||||
"info.last_updated": "26/05/2012",
|
||||
"author_name.first": "Jane",
|
||||
"author_name.last_name": "Doe",
|
||||
},
|
||||
]
|
||||
expected = DataFrame(ex_data)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_level,expected",
|
||||
[
|
||||
(
|
||||
0,
|
||||
[
|
||||
{
|
||||
"TextField": "Some text",
|
||||
"UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Image": {"a": "b"},
|
||||
},
|
||||
{
|
||||
"TextField": "Some text",
|
||||
"UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Image": {"a": "b"},
|
||||
},
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
{
|
||||
"TextField": "Some text",
|
||||
"UserField.Id": "ID001",
|
||||
"UserField.Name": "Name001",
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Image": {"a": "b"},
|
||||
},
|
||||
{
|
||||
"TextField": "Some text",
|
||||
"UserField.Id": "ID001",
|
||||
"UserField.Name": "Name001",
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Image": {"a": "b"},
|
||||
},
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_max_level_with_records_path(self, max_level, expected):
|
||||
# GH23843: Enhanced JSON normalize
|
||||
test_input = [
|
||||
{
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Lookup": [
|
||||
{
|
||||
"TextField": "Some text",
|
||||
"UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
},
|
||||
{
|
||||
"TextField": "Some text",
|
||||
"UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
},
|
||||
],
|
||||
"Image": {"a": "b"},
|
||||
"tags": [
|
||||
{"foo": "something", "bar": "else"},
|
||||
{"foo": "something2", "bar": "else2"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
result = json_normalize(
|
||||
test_input,
|
||||
record_path=["Lookup"],
|
||||
meta=[["CreatedBy"], ["Image"]],
|
||||
max_level=max_level,
|
||||
)
|
||||
expected_df = DataFrame(data=expected, columns=result.columns.values)
|
||||
tm.assert_equal(expected_df, result)
|
||||
|
||||
def test_nested_flattening_consistent(self):
|
||||
# see gh-21537
|
||||
df1 = json_normalize([{"A": {"B": 1}}])
|
||||
df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy")
|
||||
|
||||
# They should be the same.
|
||||
tm.assert_frame_equal(df1, df2)
|
||||
|
||||
def test_nonetype_record_path(self, nulls_fixture):
|
||||
# see gh-30148
|
||||
# should not raise TypeError
|
||||
result = json_normalize(
|
||||
[
|
||||
{"state": "Texas", "info": nulls_fixture},
|
||||
{"state": "Florida", "info": [{"i": 2}]},
|
||||
],
|
||||
record_path=["info"],
|
||||
)
|
||||
expected = DataFrame({"i": 2}, index=[0])
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"'])
|
||||
def test_non_list_record_path_errors(self, value):
|
||||
# see gh-30148, GH 26284
|
||||
parsed_value = json.loads(value)
|
||||
test_input = {"state": "Texas", "info": parsed_value}
|
||||
test_path = "info"
|
||||
msg = (
|
||||
f"{test_input} has non list value {parsed_value} for path {test_path}. "
|
||||
"Must be list or null."
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
json_normalize([test_input], record_path=[test_path])
|
||||
|
||||
def test_meta_non_iterable(self):
|
||||
# GH 31507
|
||||
data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]"""
|
||||
|
||||
result = json_normalize(json.loads(data), record_path=["data"], meta=["id"])
|
||||
expected = DataFrame(
|
||||
{"one": [1], "two": [2], "id": np.array([99], dtype=object)}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_generator(self, state_data):
|
||||
# GH35923 Fix pd.json_normalize to not skip the first element of a
|
||||
# generator input
|
||||
def generator_data():
|
||||
yield from state_data[0]["counties"]
|
||||
|
||||
result = json_normalize(generator_data())
|
||||
expected = DataFrame(state_data[0]["counties"])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_top_column_with_leading_underscore(self):
|
||||
# 49861
|
||||
data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
|
||||
result = json_normalize(data, sep="_")
|
||||
expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestNestedToRecord:
|
||||
def test_flat_stays_flat(self):
|
||||
recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
|
||||
result = nested_to_record(recs)
|
||||
expected = recs
|
||||
assert result == expected
|
||||
|
||||
def test_one_level_deep_flattens(self):
|
||||
data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}
|
||||
|
||||
result = nested_to_record(data)
|
||||
expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_nested_flattens(self):
|
||||
data = {
|
||||
"flat1": 1,
|
||||
"dict1": {"c": 1, "d": 2},
|
||||
"nested": {"e": {"c": 1, "d": 2}, "d": 2},
|
||||
}
|
||||
|
||||
result = nested_to_record(data)
|
||||
expected = {
|
||||
"dict1.c": 1,
|
||||
"dict1.d": 2,
|
||||
"flat1": 1,
|
||||
"nested.d": 2,
|
||||
"nested.e.c": 1,
|
||||
"nested.e.d": 2,
|
||||
}
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_json_normalize_errors(self, missing_metadata):
|
||||
# GH14583:
|
||||
# If meta keys are not always present a new option to set
|
||||
# errors='ignore' has been implemented
|
||||
|
||||
msg = (
|
||||
"Key 'name' not found. To replace missing values of "
|
||||
"'name' with np.nan, pass in errors='ignore'"
|
||||
)
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
json_normalize(
|
||||
data=missing_metadata,
|
||||
record_path="addresses",
|
||||
meta="name",
|
||||
errors="raise",
|
||||
)
|
||||
|
||||
def test_missing_meta(self, missing_metadata):
|
||||
# GH25468
|
||||
# If metadata is nullable with errors set to ignore, the null values
|
||||
# should be numpy.nan values
|
||||
result = json_normalize(
|
||||
data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
|
||||
)
|
||||
ex_data = [
|
||||
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
|
||||
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
|
||||
]
|
||||
columns = ["number", "street", "city", "state", "zip", "name"]
|
||||
expected = DataFrame(ex_data, columns=columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_missing_nested_meta(self):
|
||||
# GH44312
|
||||
# If errors="ignore" and nested metadata is null, we should return nan
|
||||
data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
|
||||
result = json_normalize(
|
||||
data,
|
||||
record_path="value",
|
||||
meta=["meta", ["nested_meta", "leaf"]],
|
||||
errors="ignore",
|
||||
)
|
||||
ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]]
|
||||
columns = ["rec", "meta", "nested_meta.leaf"]
|
||||
expected = DataFrame(ex_data, columns=columns).astype(
|
||||
{"nested_meta.leaf": object}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# If errors="raise" and nested metadata is null, we should raise with the
|
||||
# key of the first missing level
|
||||
with pytest.raises(KeyError, match="'leaf' not found"):
|
||||
json_normalize(
|
||||
data,
|
||||
record_path="value",
|
||||
meta=["meta", ["nested_meta", "leaf"]],
|
||||
errors="raise",
|
||||
)
|
||||
|
||||
def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
|
||||
# GH41876
|
||||
# Ensure errors='raise' works as intended even when a record_path of length
|
||||
# greater than one is passed in
|
||||
msg = (
|
||||
"Key 'name' not found. To replace missing values of "
|
||||
"'name' with np.nan, pass in errors='ignore'"
|
||||
)
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
json_normalize(
|
||||
data=missing_metadata,
|
||||
record_path=["previous_residences", "cities"],
|
||||
meta="name",
|
||||
errors="raise",
|
||||
)
|
||||
|
||||
def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
|
||||
# GH41876
|
||||
# Ensure errors='ignore' works as intended even when a record_path of length
|
||||
# greater than one is passed in
|
||||
result = json_normalize(
|
||||
data=missing_metadata,
|
||||
record_path=["previous_residences", "cities"],
|
||||
meta="name",
|
||||
errors="ignore",
|
||||
)
|
||||
ex_data = [
|
||||
["Foo York City", "Alice"],
|
||||
["Barmingham", np.nan],
|
||||
]
|
||||
columns = ["city_name", "name"]
|
||||
expected = DataFrame(ex_data, columns=columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_donot_drop_nonevalues(self):
|
||||
# GH21356
|
||||
data = [
|
||||
{"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
|
||||
{
|
||||
"info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
|
||||
"author_name": {"first": "Jane", "last_name": "Doe"},
|
||||
},
|
||||
]
|
||||
result = nested_to_record(data)
|
||||
expected = [
|
||||
{
|
||||
"info": None,
|
||||
"author_name.first": "Smith",
|
||||
"author_name.last_name": "Appleseed",
|
||||
},
|
||||
{
|
||||
"author_name.first": "Jane",
|
||||
"author_name.last_name": "Doe",
|
||||
"info.created_at": "11/08/1993",
|
||||
"info.last_updated": "26/05/2012",
|
||||
},
|
||||
]
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_nonetype_top_level_bottom_level(self):
|
||||
# GH21158: If inner level json has a key with a null value
|
||||
# make sure it does not do a new_d.pop twice and except
|
||||
data = {
|
||||
"id": None,
|
||||
"location": {
|
||||
"country": {
|
||||
"state": {
|
||||
"id": None,
|
||||
"town.info": {
|
||||
"id": None,
|
||||
"region": None,
|
||||
"x": 49.151580810546875,
|
||||
"y": -33.148521423339844,
|
||||
"z": 27.572303771972656,
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
result = nested_to_record(data)
|
||||
expected = {
|
||||
"id": None,
|
||||
"location.country.state.id": None,
|
||||
"location.country.state.town.info.id": None,
|
||||
"location.country.state.town.info.region": None,
|
||||
"location.country.state.town.info.x": 49.151580810546875,
|
||||
"location.country.state.town.info.y": -33.148521423339844,
|
||||
"location.country.state.town.info.z": 27.572303771972656,
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
def test_nonetype_multiple_levels(self):
|
||||
# GH21158: If inner level json has a key with a null value
|
||||
# make sure it does not do a new_d.pop twice and except
|
||||
data = {
|
||||
"id": None,
|
||||
"location": {
|
||||
"id": None,
|
||||
"country": {
|
||||
"id": None,
|
||||
"state": {
|
||||
"id": None,
|
||||
"town.info": {
|
||||
"region": None,
|
||||
"x": 49.151580810546875,
|
||||
"y": -33.148521423339844,
|
||||
"z": 27.572303771972656,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
result = nested_to_record(data)
|
||||
expected = {
|
||||
"id": None,
|
||||
"location.id": None,
|
||||
"location.country.id": None,
|
||||
"location.country.state.id": None,
|
||||
"location.country.state.town.info.region": None,
|
||||
"location.country.state.town.info.x": 49.151580810546875,
|
||||
"location.country.state.town.info.y": -33.148521423339844,
|
||||
"location.country.state.town.info.z": 27.572303771972656,
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_level, expected",
|
||||
[
|
||||
(
|
||||
None,
|
||||
[
|
||||
{
|
||||
"CreatedBy.Name": "User001",
|
||||
"Lookup.TextField": "Some text",
|
||||
"Lookup.UserField.Id": "ID001",
|
||||
"Lookup.UserField.Name": "Name001",
|
||||
"Image.a": "b",
|
||||
}
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
{
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Lookup": {
|
||||
"TextField": "Some text",
|
||||
"UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
},
|
||||
"Image": {"a": "b"},
|
||||
}
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
{
|
||||
"CreatedBy.Name": "User001",
|
||||
"Lookup.TextField": "Some text",
|
||||
"Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
"Image.a": "b",
|
||||
}
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_with_max_level(self, max_level, expected, max_level_test_input_data):
|
||||
# GH23843: Enhanced JSON normalize
|
||||
output = nested_to_record(max_level_test_input_data, max_level=max_level)
|
||||
assert output == expected
|
||||
|
||||
def test_with_large_max_level(self):
|
||||
# GH23843: Enhanced JSON normalize
|
||||
max_level = 100
|
||||
input_data = [
|
||||
{
|
||||
"CreatedBy": {
|
||||
"user": {
|
||||
"name": {"firstname": "Leo", "LastName": "Thomson"},
|
||||
"family_tree": {
|
||||
"father": {
|
||||
"name": "Father001",
|
||||
"father": {
|
||||
"Name": "Father002",
|
||||
"father": {
|
||||
"name": "Father003",
|
||||
"father": {"Name": "Father004"},
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
expected = [
|
||||
{
|
||||
"CreatedBy.user.name.firstname": "Leo",
|
||||
"CreatedBy.user.name.LastName": "Thomson",
|
||||
"CreatedBy.user.family_tree.father.name": "Father001",
|
||||
"CreatedBy.user.family_tree.father.father.Name": "Father002",
|
||||
"CreatedBy.user.family_tree.father.father.father.name": "Father003",
|
||||
"CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", # noqa: E501
|
||||
}
|
||||
]
|
||||
output = nested_to_record(input_data, max_level=max_level)
|
||||
assert output == expected
|
||||
|
||||
def test_series_non_zero_index(self):
|
||||
# GH 19020
|
||||
data = {
|
||||
0: {"id": 1, "name": "Foo", "elements": {"a": 1}},
|
||||
1: {"id": 2, "name": "Bar", "elements": {"b": 2}},
|
||||
2: {"id": 3, "name": "Baz", "elements": {"c": 3}},
|
||||
}
|
||||
s = Series(data)
|
||||
s.index = [1, 2, 3]
|
||||
result = json_normalize(s)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"id": [1, 2, 3],
|
||||
"name": ["Foo", "Bar", "Baz"],
|
||||
"elements.a": [1.0, np.nan, np.nan],
|
||||
"elements.b": [np.nan, 2.0, np.nan],
|
||||
"elements.c": [np.nan, np.nan, 3.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
2188
lib/python3.11/site-packages/pandas/tests/io/json/test_pandas.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,543 @@
|
||||
from collections.abc import Iterator
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
read_json,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.json._json import JsonReader
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lines_json_df():
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
return df.to_json(lines=True, orient="records")
|
||||
|
||||
|
||||
@pytest.fixture(params=["ujson", "pyarrow"])
|
||||
def engine(request):
|
||||
if request.param == "pyarrow":
|
||||
pytest.importorskip("pyarrow.json")
|
||||
return request.param
|
||||
|
||||
|
||||
def test_read_jsonl():
|
||||
# GH9180
|
||||
result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True)
|
||||
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_jsonl_engine_pyarrow(datapath, engine):
|
||||
result = read_json(
|
||||
datapath("io", "json", "data", "line_delimited.json"),
|
||||
lines=True,
|
||||
engine=engine,
|
||||
)
|
||||
expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_datetime(request, engine):
|
||||
# GH33787
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = "Pyarrow only supports a file path as an input and line delimited json"
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
|
||||
|
||||
df = DataFrame(
|
||||
[([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
|
||||
columns=["accounts", "date", "name"],
|
||||
)
|
||||
json_line = df.to_json(lines=True, orient="records")
|
||||
|
||||
if engine == "pyarrow":
|
||||
result = read_json(StringIO(json_line), engine=engine)
|
||||
else:
|
||||
result = read_json(StringIO(json_line), engine=engine)
|
||||
expected = DataFrame(
|
||||
[[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
|
||||
columns=["accounts", "date", "name"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_jsonl_unicode_chars():
|
||||
# GH15132: non-ascii unicode characters
|
||||
# \u201d == RIGHT DOUBLE QUOTATION MARK
|
||||
|
||||
# simulate file handle
|
||||
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
|
||||
json = StringIO(json)
|
||||
result = read_json(json, lines=True)
|
||||
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# simulate string
|
||||
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
|
||||
result = read_json(StringIO(json), lines=True)
|
||||
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_to_jsonl():
|
||||
# GH9180
|
||||
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
|
||||
assert result == expected
|
||||
|
||||
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
|
||||
assert result == expected
|
||||
tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
|
||||
|
||||
# GH15096: escaped characters in columns and data
|
||||
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
|
||||
assert result == expected
|
||||
tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
|
||||
|
||||
|
||||
def test_to_jsonl_count_new_lines():
|
||||
# GH36888
|
||||
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
|
||||
actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
|
||||
expected_new_lines_count = 2
|
||||
assert actual_new_lines_count == expected_new_lines_count
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [1, 1.0])
|
||||
def test_readjson_chunks(request, lines_json_df, chunksize, engine):
|
||||
# Basic test that read_json(chunks=True) gives the same result as
|
||||
# read_json(chunks=False)
|
||||
# GH17048: memory usage when lines=True
|
||||
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = (
|
||||
"Pyarrow only supports a file path as an input and line delimited json"
|
||||
"and doesn't support chunksize parameter."
|
||||
)
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
|
||||
|
||||
unchunked = read_json(StringIO(lines_json_df), lines=True)
|
||||
with read_json(
|
||||
StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
|
||||
) as reader:
|
||||
chunked = pd.concat(reader)
|
||||
|
||||
tm.assert_frame_equal(chunked, unchunked)
|
||||
|
||||
|
||||
def test_readjson_chunksize_requires_lines(lines_json_df, engine):
|
||||
msg = "chunksize can only be passed if lines=True"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with read_json(
|
||||
StringIO(lines_json_df), lines=False, chunksize=2, engine=engine
|
||||
) as _:
|
||||
pass
|
||||
|
||||
|
||||
def test_readjson_chunks_series(request, engine):
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = (
|
||||
"Pyarrow only supports a file path as an input and line delimited json"
|
||||
"and doesn't support chunksize parameter."
|
||||
)
|
||||
request.applymarker(pytest.mark.xfail(reason=reason))
|
||||
|
||||
# Test reading line-format JSON to Series with chunksize param
|
||||
s = pd.Series({"A": 1, "B": 2})
|
||||
|
||||
strio = StringIO(s.to_json(lines=True, orient="records"))
|
||||
unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
|
||||
|
||||
strio = StringIO(s.to_json(lines=True, orient="records"))
|
||||
with read_json(
|
||||
strio, lines=True, typ="Series", chunksize=1, engine=engine
|
||||
) as reader:
|
||||
chunked = pd.concat(reader)
|
||||
|
||||
tm.assert_series_equal(chunked, unchunked)
|
||||
|
||||
|
||||
def test_readjson_each_chunk(request, lines_json_df, engine):
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = (
|
||||
"Pyarrow only supports a file path as an input and line delimited json"
|
||||
"and doesn't support chunksize parameter."
|
||||
)
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
|
||||
|
||||
# Other tests check that the final result of read_json(chunksize=True)
|
||||
# is correct. This checks the intermediate chunks.
|
||||
with read_json(
|
||||
StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
|
||||
) as reader:
|
||||
chunks = list(reader)
|
||||
assert chunks[0].shape == (2, 2)
|
||||
assert chunks[1].shape == (1, 2)
|
||||
|
||||
|
||||
def test_readjson_chunks_from_file(request, engine):
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = (
|
||||
"Pyarrow only supports a file path as an input and line delimited json"
|
||||
"and doesn't support chunksize parameter."
|
||||
)
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
|
||||
|
||||
with tm.ensure_clean("test.json") as path:
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
df.to_json(path, lines=True, orient="records")
|
||||
with read_json(path, lines=True, chunksize=1, engine=engine) as reader:
|
||||
chunked = pd.concat(reader)
|
||||
unchunked = read_json(path, lines=True, engine=engine)
|
||||
tm.assert_frame_equal(unchunked, chunked)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [None, 1])
|
||||
def test_readjson_chunks_closes(chunksize):
|
||||
with tm.ensure_clean("test.json") as path:
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
df.to_json(path, lines=True, orient="records")
|
||||
reader = JsonReader(
|
||||
path,
|
||||
orient=None,
|
||||
typ="frame",
|
||||
dtype=True,
|
||||
convert_axes=True,
|
||||
convert_dates=True,
|
||||
keep_default_dates=True,
|
||||
precise_float=False,
|
||||
date_unit=None,
|
||||
encoding=None,
|
||||
lines=True,
|
||||
chunksize=chunksize,
|
||||
compression=None,
|
||||
nrows=None,
|
||||
)
|
||||
with reader:
|
||||
reader.read()
|
||||
assert (
|
||||
reader.handles.handle.closed
|
||||
), f"didn't close stream with chunksize = {chunksize}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
|
||||
def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
|
||||
msg = r"'chunksize' must be an integer >=1"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with read_json(
|
||||
StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
|
||||
) as _:
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [None, 1, 2])
|
||||
def test_readjson_chunks_multiple_empty_lines(chunksize):
|
||||
j = """
|
||||
|
||||
{"A":1,"B":4}
|
||||
|
||||
|
||||
|
||||
{"A":2,"B":5}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
{"A":3,"B":6}
|
||||
"""
|
||||
orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
test = read_json(StringIO(j), lines=True, chunksize=chunksize)
|
||||
if chunksize is not None:
|
||||
with test:
|
||||
test = pd.concat(test)
|
||||
tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")
|
||||
|
||||
|
||||
def test_readjson_unicode(request, monkeypatch, engine):
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = (
|
||||
"Pyarrow only supports a file path as an input and line delimited json"
|
||||
"and doesn't support chunksize parameter."
|
||||
)
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
|
||||
|
||||
with tm.ensure_clean("test.json") as path:
|
||||
monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')
|
||||
|
||||
result = read_json(path, engine=engine)
|
||||
expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", [1, 2])
|
||||
def test_readjson_nrows(nrows, engine):
|
||||
# GH 33916
|
||||
# Test reading line-format JSON to Series with nrows param
|
||||
jsonl = """{"a": 1, "b": 2}
|
||||
{"a": 3, "b": 4}
|
||||
{"a": 5, "b": 6}
|
||||
{"a": 7, "b": 8}"""
|
||||
result = read_json(StringIO(jsonl), lines=True, nrows=nrows)
|
||||
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
|
||||
def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
|
||||
# GH 33916
|
||||
# Test reading line-format JSON to Series with nrows and chunksize param
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = (
|
||||
"Pyarrow only supports a file path as an input and line delimited json"
|
||||
"and doesn't support chunksize parameter."
|
||||
)
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
|
||||
|
||||
jsonl = """{"a": 1, "b": 2}
|
||||
{"a": 3, "b": 4}
|
||||
{"a": 5, "b": 6}
|
||||
{"a": 7, "b": 8}"""
|
||||
|
||||
if engine != "pyarrow":
|
||||
with read_json(
|
||||
StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine
|
||||
) as reader:
|
||||
chunked = pd.concat(reader)
|
||||
else:
|
||||
with read_json(
|
||||
jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
|
||||
) as reader:
|
||||
chunked = pd.concat(reader)
|
||||
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
|
||||
tm.assert_frame_equal(chunked, expected)
|
||||
|
||||
|
||||
def test_readjson_nrows_requires_lines(engine):
|
||||
# GH 33916
|
||||
# Test ValueError raised if nrows is set without setting lines in read_json
|
||||
jsonl = """{"a": 1, "b": 2}
|
||||
{"a": 3, "b": 4}
|
||||
{"a": 5, "b": 6}
|
||||
{"a": 7, "b": 8}"""
|
||||
msg = "nrows can only be passed if lines=True"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
read_json(jsonl, lines=False, nrows=2, engine=engine)
|
||||
|
||||
|
||||
def test_readjson_lines_chunks_fileurl(request, datapath, engine):
|
||||
# GH 27135
|
||||
# Test reading line-format JSON from file url
|
||||
if engine == "pyarrow":
|
||||
# GH 48893
|
||||
reason = (
|
||||
"Pyarrow only supports a file path as an input and line delimited json"
|
||||
"and doesn't support chunksize parameter."
|
||||
)
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
|
||||
|
||||
df_list_expected = [
|
||||
DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
|
||||
DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
|
||||
DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
|
||||
]
|
||||
os_path = datapath("io", "json", "data", "line_delimited.json")
|
||||
file_url = Path(os_path).as_uri()
|
||||
with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader:
|
||||
for index, chuck in enumerate(url_reader):
|
||||
tm.assert_frame_equal(chuck, df_list_expected[index])
|
||||


def test_chunksize_is_incremental():
    # See https://github.com/pandas-dev/pandas/issues/34548
    jsonl = (
        """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}\n"""
        * 1000
    )

    class MyReader:
        def __init__(self, contents) -> None:
            self.read_count = 0
            self.stringio = StringIO(contents)

        def read(self, *args):
            self.read_count += 1
            return self.stringio.read(*args)

        def __iter__(self) -> Iterator:
            self.read_count += 1
            return iter(self.stringio)

    reader = MyReader(jsonl)
    assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
    assert reader.read_count > 10


@pytest.mark.parametrize("orient_", ["split", "index", "table"])
def test_to_json_append_orient(orient_):
    # GH 35849
    # Test ValueError when orient is not 'records'
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        r"mode='a' \(append\) is only supported when "
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode="a", orient=orient_)


def test_to_json_append_lines():
    # GH 35849
    # Test ValueError when lines is not True
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        r"mode='a' \(append\) is only supported when "
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode="a", lines=False, orient="records")


@pytest.mark.parametrize("mode_", ["r", "x"])
def test_to_json_append_mode(mode_):
    # GH 35849
    # Test ValueError when mode is not supported option
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        f"mode={mode_} is not a valid option."
        "Only 'w' and 'a' are currently supported."
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode=mode_, lines=False, orient="records")


def test_to_json_append_output_consistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same columns, new rows
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})

    expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_inconsistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing one new column, one old column, new rows
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})

    expected = DataFrame(
        {
            "col1": [1, 2, None, None],
            "col2": ["a", "b", "e", "f"],
            "col3": [np.nan, np.nan, "!", "#"],
        }
    )
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_different_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same, differing and new columns
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    expected = DataFrame(
        {
            "col1": [1, 2, 3, 4, None, None, None, None],
            "col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan],
            "col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan],
            "col4": [None, None, None, None, None, None, True, False],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df4.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_different_columns_reordered():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing specific result column order.
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    # df4, df3, df2, df1 (in that order)
    expected = DataFrame(
        {
            "col4": [True, False, None, None, None, None, None, None],
            "col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"],
            "col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan],
            "col1": [None, None, None, None, 3, 4, 1, 2],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df4.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df1.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)
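The append tests above exercise the mode="a" path for line-delimited JSON added for GH 35849. As a rough usage sketch of the behavior they verify (the file path is illustrative and a pandas version with to_json append support is assumed, neither comes from this commit):

import pandas as pd

df1 = pd.DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = pd.DataFrame({"col1": [3, 4], "col2": ["c", "d"]})

path = "example.json"  # hypothetical path, not part of this commit
df1.to_json(path, lines=True, orient="records")             # initial write
df2.to_json(path, mode="a", lines=True, orient="records")   # append new rows
result = pd.read_json(path, lines=True)                     # reads all rows back
assert result.shape == (4, 2)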
1087
lib/python3.11/site-packages/pandas/tests/io/json/test_ujson.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,382 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import parsers as libparsers
|
||||
from pandas.errors import DtypeWarning
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col", [0, "index"])
|
||||
def test_read_chunksize_with_index(all_parsers, index_col):
|
||||
parser = all_parsers
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
["foo", 2, 3, 4, 5],
|
||||
["bar", 7, 8, 9, 10],
|
||||
["baz", 12, 13, 14, 15],
|
||||
["qux", 12, 13, 14, 15],
|
||||
["foo2", 12, 13, 14, 15],
|
||||
["bar2", 12, 13, 14, 15],
|
||||
],
|
||||
columns=["index", "A", "B", "C", "D"],
|
||||
)
|
||||
expected = expected.set_index("index")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
|
||||
list(reader)
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
|
||||
chunks = list(reader)
|
||||
tm.assert_frame_equal(chunks[0], expected[:2])
|
||||
tm.assert_frame_equal(chunks[1], expected[2:4])
|
||||
tm.assert_frame_equal(chunks[2], expected[4:])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
|
||||
def test_read_chunksize_bad(all_parsers, chunksize):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
msg = r"'chunksize' must be an integer >=1"
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [2, 8])
|
||||
def test_read_chunksize_and_nrows(all_parsers, chunksize):
|
||||
# see gh-15755
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
kwargs = {"index_col": 0, "nrows": 5}
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
return
|
||||
|
||||
expected = parser.read_csv(StringIO(data), **kwargs)
|
||||
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
|
||||
tm.assert_frame_equal(concat(reader), expected)
|
||||
|
||||
|
||||
def test_read_chunksize_and_nrows_changing_size(all_parsers):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
kwargs = {"index_col": 0, "nrows": 5}
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
return
|
||||
|
||||
expected = parser.read_csv(StringIO(data), **kwargs)
|
||||
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
|
||||
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
|
||||
tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
|
||||
|
||||
with pytest.raises(StopIteration, match=""):
|
||||
reader.get_chunk(size=3)
|
||||
|
||||
|
||||
def test_get_chunk_passed_chunksize(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A,B,C
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
1,2,3"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with parser.read_csv(StringIO(data), chunksize=2) as reader:
|
||||
reader.get_chunk()
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), chunksize=2) as reader:
|
||||
result = reader.get_chunk()
|
||||
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
|
||||
def test_read_chunksize_compat(all_parsers, kwargs):
|
||||
# see gh-12185
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
|
||||
concat(reader)
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
|
||||
via_reader = concat(reader)
|
||||
tm.assert_frame_equal(via_reader, result)
|
||||
|
||||
|
||||
def test_read_chunksize_jagged_names(all_parsers):
|
||||
# see gh-23509
|
||||
parser = all_parsers
|
||||
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
|
||||
|
||||
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with parser.read_csv(
|
||||
StringIO(data), names=range(10), chunksize=4
|
||||
) as reader:
|
||||
concat(reader)
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
|
||||
result = concat(reader)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_chunk_begins_with_newline_whitespace(all_parsers):
|
||||
# see gh-10022
|
||||
parser = all_parsers
|
||||
data = "\n hello\nworld\n"
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None)
|
||||
expected = DataFrame([" hello", "world"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
|
||||
# mainly an issue with the C parser
|
||||
heuristic = 2**3
|
||||
parser = all_parsers
|
||||
integers = [str(i) for i in range(heuristic - 1)]
|
||||
data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
|
||||
|
||||
# Coercions should work without warnings.
|
||||
with monkeypatch.context() as m:
|
||||
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
assert type(result.a[0]) is np.float64
|
||||
assert result.a.dtype == float
|
||||
|
||||
|
||||
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
|
||||
warning_type = None
|
||||
parser = all_parsers
|
||||
size = 10000
|
||||
|
||||
# see gh-3866: if chunks are different types and can't
|
||||
# be coerced using numerical types, then issue warning.
|
||||
if parser.engine == "c" and parser.low_memory:
|
||||
warning_type = DtypeWarning
|
||||
# Use larger size to hit warning path
|
||||
size = 499999
|
||||
|
||||
integers = [str(i) for i in range(size)]
|
||||
data = "a\n" + "\n".join(integers + ["a", "b"] + integers)
|
||||
|
||||
buf = StringIO(data)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
df = parser.read_csv(
|
||||
buf,
|
||||
)
|
||||
else:
|
||||
df = parser.read_csv_check_warnings(
|
||||
warning_type,
|
||||
r"Columns \(0\) have mixed types. "
|
||||
"Specify dtype option on import or set low_memory=False.",
|
||||
buf,
|
||||
)
|
||||
if parser.engine == "c" and parser.low_memory:
|
||||
assert df.a.dtype == object
|
||||
elif using_infer_string:
|
||||
assert df.a.dtype == "str"
|
||||
else:
|
||||
assert df.a.dtype == object
|
||||
|
||||
|
||||
@pytest.mark.parametrize("iterator", [True, False])
|
||||
def test_empty_with_nrows_chunksize(all_parsers, iterator):
|
||||
# see gh-9535
|
||||
parser = all_parsers
|
||||
expected = DataFrame(columns=["foo", "bar"])
|
||||
|
||||
nrows = 10
|
||||
data = StringIO("foo,bar\n")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = (
|
||||
"The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
if iterator:
|
||||
with parser.read_csv(data, chunksize=nrows) as reader:
|
||||
next(iter(reader))
|
||||
else:
|
||||
parser.read_csv(data, nrows=nrows)
|
||||
return
|
||||
|
||||
if iterator:
|
||||
with parser.read_csv(data, chunksize=nrows) as reader:
|
||||
result = next(iter(reader))
|
||||
else:
|
||||
result = parser.read_csv(data, nrows=nrows)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_memory_growth_chunksize(all_parsers):
|
||||
# see gh-24805
|
||||
#
|
||||
# Let's just make sure that we don't crash
|
||||
# as we iteratively process all chunks.
|
||||
parser = all_parsers
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
for i in range(1000):
|
||||
f.write(str(i) + "\n")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with parser.read_csv(path, chunksize=20) as result:
|
||||
for _ in result:
|
||||
pass
|
||||
return
|
||||
|
||||
with parser.read_csv(path, chunksize=20) as result:
|
||||
for _ in result:
|
||||
pass
|
||||
|
||||
|
||||
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
|
||||
# GH#21211
|
||||
parser = all_parsers
|
||||
data = """1,2,3,4
|
||||
5,6,7,8
|
||||
9,10,11
|
||||
"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
names=["a", "b"],
|
||||
chunksize=2,
|
||||
usecols=[0, 1],
|
||||
header=None,
|
||||
)
|
||||
return
|
||||
|
||||
result_chunks = parser.read_csv(
|
||||
StringIO(data),
|
||||
names=["a", "b"],
|
||||
chunksize=2,
|
||||
usecols=[0, 1],
|
||||
header=None,
|
||||
)
|
||||
|
||||
expected_frames = [
|
||||
DataFrame({"a": [1, 5], "b": [2, 6]}),
|
||||
DataFrame({"a": [9], "b": [10]}, index=[2]),
|
||||
]
|
||||
|
||||
for i, result in enumerate(result_chunks):
|
||||
tm.assert_frame_equal(result, expected_frames[i])
|
||||
|
||||
|
||||
def test_chunksize_second_block_shorter(all_parsers):
|
||||
# GH#21211
|
||||
parser = all_parsers
|
||||
data = """a,b,c,d
|
||||
1,2,3,4
|
||||
5,6,7,8
|
||||
9,10,11
|
||||
"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), chunksize=2)
|
||||
return
|
||||
|
||||
result_chunks = parser.read_csv(StringIO(data), chunksize=2)
|
||||
|
||||
expected_frames = [
|
||||
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
|
||||
DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
|
||||
]
|
||||
|
||||
for i, result in enumerate(result_chunks):
|
||||
tm.assert_frame_equal(result, expected_frames[i])
|
||||
@ -0,0 +1,983 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from datetime import datetime
|
||||
from inspect import signature
|
||||
from io import StringIO
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_string_dtype
|
||||
|
||||
from pandas.compat import HAS_PYARROW
|
||||
from pandas.errors import (
|
||||
EmptyDataError,
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Timestamp,
|
||||
compat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.parsers import TextFileReader
|
||||
from pandas.io.parsers.c_parser_wrapper import CParserWrapper
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
def test_override_set_noconvert_columns():
|
||||
# see gh-17351
|
||||
#
|
||||
# Usecols needs to be sorted in _set_noconvert_columns based
|
||||
# on the test_usecols_with_parse_dates test from test_usecols.py
|
||||
class MyTextFileReader(TextFileReader):
|
||||
def __init__(self) -> None:
|
||||
self._currow = 0
|
||||
self.squeeze = False
|
||||
|
||||
class MyCParserWrapper(CParserWrapper):
|
||||
def _set_noconvert_columns(self):
|
||||
if self.usecols_dtype == "integer":
|
||||
# self.usecols is a set, which is documented as unordered
|
||||
# but in practice, a CPython set of integers is sorted.
|
||||
# In other implementations this assumption does not hold.
|
||||
# The following code simulates a different order, which
|
||||
# before GH 17351 would cause the wrong columns to be
|
||||
# converted via the parse_dates parameter
|
||||
self.usecols = list(self.usecols)
|
||||
self.usecols.reverse()
|
||||
return CParserWrapper._set_noconvert_columns(self)
|
||||
|
||||
data = """a,b,c,d,e
|
||||
0,1,2014-01-01,09:00,4
|
||||
0,1,2014-01-02,10:00,4"""
|
||||
|
||||
parse_dates = [[1, 2]]
|
||||
cols = {
|
||||
"a": [0, 0],
|
||||
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
|
||||
}
|
||||
expected = DataFrame(cols, columns=["c_d", "a"])
|
||||
|
||||
parser = MyTextFileReader()
|
||||
parser.options = {
|
||||
"usecols": [0, 2, 3],
|
||||
"parse_dates": parse_dates,
|
||||
"delimiter": ",",
|
||||
}
|
||||
parser.engine = "c"
|
||||
parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
|
||||
|
||||
result = parser.read()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_local(all_parsers, csv1):
|
||||
prefix = "file:///" if compat.is_platform_windows() else "file://"
|
||||
parser = all_parsers
|
||||
|
||||
fname = prefix + str(os.path.abspath(csv1))
|
||||
result = parser.read_csv(fname, index_col=0, parse_dates=True)
|
||||
# TODO: make unit check more specific
|
||||
if parser.engine == "pyarrow":
|
||||
result.index = result.index.as_unit("ns")
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.980269, 3.685731, -0.364216805298, -1.159738],
|
||||
[1.047916, -0.041232, -0.16181208307, 0.212549],
|
||||
[0.498581, 0.731168, -0.537677223318, 1.346270],
|
||||
[1.120202, 1.567621, 0.00364077397681, 0.675253],
|
||||
[-0.487094, 0.571455, -1.6116394093, 0.103469],
|
||||
[0.836649, 0.246462, 0.588542635376, 1.062782],
|
||||
[-0.157161, 1.340307, 1.1957779562, -1.097007],
|
||||
],
|
||||
columns=["A", "B", "C", "D"],
|
||||
index=Index(
|
||||
[
|
||||
datetime(2000, 1, 3),
|
||||
datetime(2000, 1, 4),
|
||||
datetime(2000, 1, 5),
|
||||
datetime(2000, 1, 6),
|
||||
datetime(2000, 1, 7),
|
||||
datetime(2000, 1, 10),
|
||||
datetime(2000, 1, 11),
|
||||
],
|
||||
name="index",
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_1000_sep(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A|B|C
|
||||
1|2,334|5
|
||||
10|13|10.
|
||||
"""
|
||||
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), sep="|", thousands=",")
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep="|", thousands=",")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: Found non-unique column index
|
||||
def test_unnamed_columns(all_parsers):
|
||||
data = """A,B,C,,
|
||||
1,2,3,4,5
|
||||
6,7,8,9,10
|
||||
11,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
|
||||
dtype=np.int64,
|
||||
columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
|
||||
)
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_csv_mixed_type(all_parsers):
|
||||
data = """A,B,C
|
||||
a,1,2
|
||||
b,3,4
|
||||
c,4,5
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
|
||||
# see gh-21141
|
||||
parser = all_parsers
|
||||
|
||||
if not parser.low_memory:
|
||||
pytest.skip("This is a low-memory specific test")
|
||||
|
||||
data = """A,B,C
|
||||
1,1,1,2
|
||||
2,2,3,4
|
||||
3,3,4,5
|
||||
"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
|
||||
expected = DataFrame(columns=["A", "B", "C"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_dataframe(all_parsers, csv1):
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(csv1, index_col=0, parse_dates=True)
|
||||
# TODO: make unit check more specific
|
||||
if parser.engine == "pyarrow":
|
||||
result.index = result.index.as_unit("ns")
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.980269, 3.685731, -0.364216805298, -1.159738],
|
||||
[1.047916, -0.041232, -0.16181208307, 0.212549],
|
||||
[0.498581, 0.731168, -0.537677223318, 1.346270],
|
||||
[1.120202, 1.567621, 0.00364077397681, 0.675253],
|
||||
[-0.487094, 0.571455, -1.6116394093, 0.103469],
|
||||
[0.836649, 0.246462, 0.588542635376, 1.062782],
|
||||
[-0.157161, 1.340307, 1.1957779562, -1.097007],
|
||||
],
|
||||
columns=["A", "B", "C", "D"],
|
||||
index=Index(
|
||||
[
|
||||
datetime(2000, 1, 3),
|
||||
datetime(2000, 1, 4),
|
||||
datetime(2000, 1, 5),
|
||||
datetime(2000, 1, 6),
|
||||
datetime(2000, 1, 7),
|
||||
datetime(2000, 1, 10),
|
||||
datetime(2000, 1, 11),
|
||||
],
|
||||
name="index",
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", [3, 3.0])
|
||||
def test_read_nrows(all_parsers, nrows):
|
||||
# see gh-10476
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
expected = DataFrame(
|
||||
[["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
|
||||
columns=["index", "A", "B", "C", "D"],
|
||||
)
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), nrows=nrows)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), nrows=nrows)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
|
||||
def test_read_nrows_bad(all_parsers, nrows):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
msg = r"'nrows' must be an integer >=0"
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), nrows=nrows)
|
||||
|
||||
|
||||
def test_nrows_skipfooter_errors(all_parsers):
|
||||
msg = "'skipfooter' not supported with 'nrows'"
|
||||
data = "a\n1\n2\n3\n4\n5\n6"
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=1, nrows=5)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_missing_trailing_delimiters(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A,B,C,D
|
||||
1,2,3,4
|
||||
1,3,3,
|
||||
1,4,5"""
|
||||
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_skip_initial_space(all_parsers):
|
||||
data = (
|
||||
'"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
|
||||
"1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
|
||||
"314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
|
||||
"70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
|
||||
"0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
|
||||
"-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
|
||||
)
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
names=list(range(33)),
|
||||
header=None,
|
||||
na_values=["-9999.0"],
|
||||
skipinitialspace=True,
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
names=list(range(33)),
|
||||
header=None,
|
||||
na_values=["-9999.0"],
|
||||
skipinitialspace=True,
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[
|
||||
"09-Apr-2012",
|
||||
"01:10:18.300",
|
||||
2456026.548822908,
|
||||
12849,
|
||||
1.00361,
|
||||
1.12551,
|
||||
330.65659,
|
||||
355626618.16711,
|
||||
73.48821,
|
||||
314.11625,
|
||||
1917.09447,
|
||||
179.71425,
|
||||
80.0,
|
||||
240.0,
|
||||
-350,
|
||||
70.06056,
|
||||
344.9837,
|
||||
1,
|
||||
1,
|
||||
-0.689265,
|
||||
-0.692787,
|
||||
0.212036,
|
||||
14.7674,
|
||||
41.605,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
0,
|
||||
12,
|
||||
128,
|
||||
]
|
||||
]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_trailing_delimiters(all_parsers):
|
||||
# see gh-2442
|
||||
data = """A,B,C
|
||||
1,2,3,
|
||||
4,5,6,
|
||||
7,8,9,"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=False)
|
||||
|
||||
expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_escapechar(all_parsers):
|
||||
# https://stackoverflow.com/questions/13824840/feature-request-for-
|
||||
# pandas-read-csv
|
||||
data = '''SEARCH_TERM,ACTUAL_URL
|
||||
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
|
||||
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
|
||||
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''
|
||||
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(
|
||||
StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
|
||||
)
|
||||
|
||||
assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'
|
||||
|
||||
tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))
|
||||
|
||||
|
||||
def test_ignore_leading_whitespace(all_parsers):
|
||||
# see gh-3374, gh-6607
|
||||
parser = all_parsers
|
||||
data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "the 'pyarrow' engine does not support regex separators"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
return
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
|
||||
expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
|
||||
def test_uneven_lines_with_usecols(all_parsers, usecols):
|
||||
# see gh-12203
|
||||
parser = all_parsers
|
||||
data = r"""a,b,c
|
||||
0,1,2
|
||||
3,4,5,6,7
|
||||
8,9,10"""
|
||||
|
||||
if usecols is None:
|
||||
# Make sure that an error is still raised
|
||||
# when the "usecols" parameter is not provided.
|
||||
msg = r"Expected \d+ fields in line \d+, saw \d+"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data))
|
||||
else:
|
||||
expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
# First, check to see that the response of parser when faced with no
|
||||
# provided columns raises the correct error, with or without usecols.
|
||||
("", {}, None),
|
||||
("", {"usecols": ["X"]}, None),
|
||||
(
|
||||
",,",
|
||||
{"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
|
||||
DataFrame(columns=["X"], index=[0], dtype=np.float64),
|
||||
),
|
||||
(
|
||||
"",
|
||||
{"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
|
||||
DataFrame(columns=["X"]),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
|
||||
# see gh-12493
|
||||
parser = all_parsers
|
||||
|
||||
if expected is None:
|
||||
msg = "No columns to parse from file"
|
||||
with pytest.raises(EmptyDataError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
else:
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,expected",
|
||||
[
|
||||
# gh-8661, gh-8679: this should ignore six lines, including
|
||||
# lines with trailing whitespace and blank lines.
|
||||
(
|
||||
{
|
||||
"header": None,
|
||||
"delim_whitespace": True,
|
||||
"skiprows": [0, 1, 2, 3, 5, 6],
|
||||
"skip_blank_lines": True,
|
||||
},
|
||||
DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
|
||||
),
|
||||
# gh-8983: test skipping set of rows after a row with trailing spaces.
|
||||
(
|
||||
{
|
||||
"delim_whitespace": True,
|
||||
"skiprows": [1, 2, 3, 5, 6],
|
||||
"skip_blank_lines": True,
|
||||
},
|
||||
DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_trailing_spaces(all_parsers, kwargs, expected):
|
||||
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
|
||||
parser = all_parsers
|
||||
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
|
||||
return
|
||||
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_raise_on_sep_with_delim_whitespace(all_parsers):
|
||||
# see gh-6607
|
||||
data = "a b c\n1 2 3"
|
||||
parser = all_parsers
|
||||
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
with pytest.raises(ValueError, match="you can only specify one"):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
|
||||
|
||||
|
||||
def test_read_filepath_or_buffer(all_parsers):
|
||||
# see gh-43366
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(TypeError, match="Expected file path name or file-like"):
|
||||
parser.read_csv(filepath_or_buffer=b"input")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("delim_whitespace", [True, False])
|
||||
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
|
||||
# see gh-9710
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
MyColumn
|
||||
a
|
||||
b
|
||||
a
|
||||
b\n"""
|
||||
|
||||
expected = DataFrame({"MyColumn": list("abab")})
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
skipinitialspace=True,
|
||||
delim_whitespace=delim_whitespace,
|
||||
)
|
||||
return
|
||||
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(
|
||||
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sep,skip_blank_lines,exp_data",
|
||||
[
|
||||
(",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
|
||||
(r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
|
||||
(
|
||||
",",
|
||||
False,
|
||||
[
|
||||
[1.0, 2.0, 4.0],
|
||||
[np.nan, np.nan, np.nan],
|
||||
[np.nan, np.nan, np.nan],
|
||||
[5.0, np.nan, 10.0],
|
||||
[np.nan, np.nan, np.nan],
|
||||
[-70.0, 0.4, 1.0],
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
A,B,C
|
||||
1,2.,4.
|
||||
|
||||
|
||||
5.,NaN,10.0
|
||||
|
||||
-70,.4,1
|
||||
"""
|
||||
|
||||
if sep == r"\s+":
|
||||
data = data.replace(",", " ")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "the 'pyarrow' engine does not support regex separators"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
|
||||
expected = DataFrame(exp_data, columns=["A", "B", "C"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_whitespace_lines(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """
|
||||
|
||||
\t \t\t
|
||||
\t
|
||||
A,B,C
|
||||
\t 1,2.,4.
|
||||
5.,NaN,10.0
|
||||
"""
|
||||
expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected",
|
||||
[
|
||||
(
|
||||
""" A B C D
|
||||
a 1 2 3 4
|
||||
b 1 2 3 4
|
||||
c 1 2 3 4
|
||||
""",
|
||||
DataFrame(
|
||||
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
|
||||
columns=["A", "B", "C", "D"],
|
||||
index=["a", "b", "c"],
|
||||
),
|
||||
),
|
||||
(
|
||||
" a b c\n1 2 3 \n4 5 6\n 7 8 9",
|
||||
DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_whitespace_regex_separator(all_parsers, data, expected):
|
||||
# see gh-6607
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "the 'pyarrow' engine does not support regex separators"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_sub_character(all_parsers, csv_dir_path):
|
||||
# see gh-16893
|
||||
filename = os.path.join(csv_dir_path, "sub_char.csv")
|
||||
expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
|
||||
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(filename)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
|
||||
def test_filename_with_special_chars(all_parsers, filename):
|
||||
# see gh-15086.
|
||||
parser = all_parsers
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
|
||||
with tm.ensure_clean(filename) as path:
|
||||
df.to_csv(path, index=False)
|
||||
|
||||
result = parser.read_csv(path)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
def test_read_table_same_signature_as_read_csv(all_parsers):
|
||||
# GH-34976
|
||||
parser = all_parsers
|
||||
|
||||
table_sign = signature(parser.read_table)
|
||||
csv_sign = signature(parser.read_csv)
|
||||
|
||||
assert table_sign.parameters.keys() == csv_sign.parameters.keys()
|
||||
assert table_sign.return_annotation == csv_sign.return_annotation
|
||||
|
||||
for key, csv_param in csv_sign.parameters.items():
|
||||
table_param = table_sign.parameters[key]
|
||||
if key == "sep":
|
||||
assert csv_param.default == ","
|
||||
assert table_param.default == "\t"
|
||||
assert table_param.annotation == csv_param.annotation
|
||||
assert table_param.kind == csv_param.kind
|
||||
continue
|
||||
|
||||
assert table_param == csv_param
|
||||
|
||||
|
||||
def test_read_table_equivalency_to_read_csv(all_parsers):
|
||||
# see gh-21948
|
||||
# As of 0.25.0, read_table is undeprecated
|
||||
parser = all_parsers
|
||||
data = "a\tb\n1\t2\n3\t4"
|
||||
expected = parser.read_csv(StringIO(data), sep="\t")
|
||||
result = parser.read_table(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
|
||||
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
|
||||
# GH#41069
|
||||
parser = all_parsers
|
||||
data = "a b\n0 1"
|
||||
|
||||
sys.setprofile(lambda *a, **k: None)
|
||||
result = getattr(parser, read_func)(StringIO(data))
|
||||
sys.setprofile(None)
|
||||
|
||||
expected = DataFrame({"a b": ["0 1"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_first_row_bom(all_parsers):
|
||||
# see gh-26545
|
||||
parser = all_parsers
|
||||
data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''
|
||||
|
||||
result = parser.read_csv(StringIO(data), delimiter="\t")
|
||||
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_first_row_bom_unquoted(all_parsers):
|
||||
# see gh-36343
|
||||
parser = all_parsers
|
||||
data = """\ufeffHead1\tHead2\tHead3"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), delimiter="\t")
|
||||
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", range(1, 6))
|
||||
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
|
||||
# GH 28071
|
||||
ref = DataFrame(
|
||||
[[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
|
||||
columns=list("ab"),
|
||||
)
|
||||
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
|
||||
)
|
||||
return
|
||||
|
||||
df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
|
||||
tm.assert_frame_equal(df, ref[:nrows])
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_no_header_two_extra_columns(all_parsers):
|
||||
# GH 26218
|
||||
column_names = ["one", "two", "three"]
|
||||
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
|
||||
stream = StringIO("foo,bar,baz,bam,blah")
|
||||
parser = all_parsers
|
||||
df = parser.read_csv_check_warnings(
|
||||
ParserWarning,
|
||||
"Length of header or names does not match length of data. "
|
||||
"This leads to a loss of data with index_col=False.",
|
||||
stream,
|
||||
header=None,
|
||||
names=column_names,
|
||||
index_col=False,
|
||||
)
|
||||
tm.assert_frame_equal(df, ref)
|
||||
|
||||
|
||||
def test_read_csv_names_not_accepting_sets(all_parsers):
|
||||
# GH 34946
|
||||
data = """\
|
||||
1,2,3
|
||||
4,5,6\n"""
|
||||
parser = all_parsers
|
||||
with pytest.raises(ValueError, match="Names should be an ordered collection."):
|
||||
parser.read_csv(StringIO(data), names=set("QAZ"))
|
||||
|
||||
|
||||
def test_read_table_delim_whitespace_default_sep(all_parsers):
|
||||
# GH: 35958
|
||||
f = StringIO("a b c\n1 -2 -3\n4 5 6")
|
||||
parser = all_parsers
|
||||
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_table(f, delim_whitespace=True)
|
||||
return
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_table(f, delim_whitespace=True)
|
||||
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("delimiter", [",", "\t"])
|
||||
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
|
||||
# GH: 35958
|
||||
f = StringIO("a b c\n1 -2 -3\n4 5 6")
|
||||
parser = all_parsers
|
||||
msg = (
|
||||
"Specified a delimiter with both sep and "
|
||||
"delim_whitespace=True; you can only specify one."
|
||||
)
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
|
||||
|
||||
|
||||
def test_read_csv_delimiter_and_sep_no_default(all_parsers):
|
||||
# GH#39823
|
||||
f = StringIO("a,b\n1,2")
|
||||
parser = all_parsers
|
||||
msg = "Specified a sep and a delimiter; you can only specify one."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(f, sep=" ", delimiter=".")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
|
||||
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
|
||||
# GH#43528
|
||||
parser = all_parsers
|
||||
data = """a,b,c
|
||||
1,2,3
|
||||
"""
|
||||
msg = (
|
||||
r"Specified \\n as separator or delimiter. This forces the python engine "
|
||||
r"which does not accept a line terminator. Hence it is not allowed to use "
|
||||
r"the line terminator as separator."
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("delimiter", [",", "\t"])
|
||||
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
|
||||
# GH: 35958
|
||||
f = StringIO("a b c\n1 -2 -3\n4 5 6")
|
||||
parser = all_parsers
|
||||
msg = (
|
||||
"Specified a delimiter with both sep and "
|
||||
"delim_whitespace=True; you can only specify one."
|
||||
)
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_table(f, delim_whitespace=True, sep=delimiter)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_dict_keys_as_names(all_parsers):
|
||||
# GH: 36928
|
||||
data = "1,2"
|
||||
|
||||
keys = {"a": int, "b": int}.keys()
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=keys)
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
|
||||
@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
|
||||
def test_encoding_surrogatepass(all_parsers):
|
||||
# GH39017
|
||||
parser = all_parsers
|
||||
content = b"\xed\xbd\xbf"
|
||||
decoded = content.decode("utf-8", errors="surrogatepass")
|
||||
expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
|
||||
expected.index.name = decoded * 2
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
Path(path).write_bytes(
|
||||
content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
|
||||
)
|
||||
df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
|
||||
parser.read_csv(path)
|
||||
|
||||
|
||||
def test_malformed_second_line(all_parsers):
|
||||
# see GH14782
|
||||
parser = all_parsers
|
||||
data = "\na\nb\n"
|
||||
result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1)
|
||||
expected = DataFrame({"a": ["b"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_short_single_line(all_parsers):
|
||||
# GH 47566
|
||||
parser = all_parsers
|
||||
columns = ["a", "b", "c"]
|
||||
data = "1,2"
|
||||
result = parser.read_csv(StringIO(data), header=None, names=columns)
|
||||
expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: Length mismatch: Expected axis has 2 elements
|
||||
def test_short_multi_line(all_parsers):
|
||||
# GH 47566
|
||||
parser = all_parsers
|
||||
columns = ["a", "b", "c"]
|
||||
data = "1,2\n1,2"
|
||||
result = parser.read_csv(StringIO(data), header=None, names=columns)
|
||||
expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_seek(all_parsers):
|
||||
# GH48646
|
||||
parser = all_parsers
|
||||
prefix = "### DATA\n"
|
||||
content = "nkey,value\ntables,rectangular\n"
|
||||
with tm.ensure_clean() as path:
|
||||
Path(path).write_text(prefix + content, encoding="utf-8")
|
||||
with open(path, encoding="utf-8") as file:
|
||||
file.readline()
|
||||
actual = parser.read_csv(file)
|
||||
expected = parser.read_csv(StringIO(content))
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
@ -0,0 +1,91 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.parsers import TextParser
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
|
||||
|
||||
@xfail_pyarrow
|
||||
def test_read_data_list(all_parsers):
|
||||
parser = all_parsers
|
||||
kwargs = {"index_col": 0}
|
||||
data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"
|
||||
|
||||
data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
|
||||
expected = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
with TextParser(data_list, chunksize=2, **kwargs) as parser:
|
||||
result = parser.read()
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_reader_list(all_parsers):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
kwargs = {"index_col": 0}
|
||||
|
||||
lines = list(csv.reader(StringIO(data)))
|
||||
with TextParser(lines, chunksize=2, **kwargs) as reader:
|
||||
chunks = list(reader)
|
||||
|
||||
expected = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
tm.assert_frame_equal(chunks[0], expected[:2])
|
||||
tm.assert_frame_equal(chunks[1], expected[2:4])
|
||||
tm.assert_frame_equal(chunks[2], expected[4:])
|
||||
|
||||
|
||||
def test_reader_list_skiprows(all_parsers):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
kwargs = {"index_col": 0}
|
||||
|
||||
lines = list(csv.reader(StringIO(data)))
|
||||
with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
|
||||
chunks = list(reader)
|
||||
|
||||
expected = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
tm.assert_frame_equal(chunks[0], expected[1:3])
|
||||
|
||||
|
||||
def test_read_csv_parse_simple_list(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """foo
|
||||
bar baz
|
||||
qux foo
|
||||
foo
|
||||
bar"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None)
|
||||
expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,72 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,thousands,decimal",
|
||||
[
|
||||
(
|
||||
"""A|B|C
|
||||
1|2,334.01|5
|
||||
10|13|10.
|
||||
""",
|
||||
",",
|
||||
".",
|
||||
),
|
||||
(
|
||||
"""A|B|C
|
||||
1|2.334,01|5
|
||||
10|13|10,
|
||||
""",
|
||||
".",
|
||||
",",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data), sep="|", thousands=thousands, decimal=decimal
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), sep="|", thousands=thousands, decimal=decimal
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_euro_decimal_format(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """Id;Number1;Number2;Text1;Text2;Number3
|
||||
1;1521,1541;187101,9543;ABC;poi;4,738797819
|
||||
2;121,12;14897,76;DEF;uyt;0,377320872
|
||||
3;878,158;108013,434;GHI;rez;2,735694704"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=";", decimal=",")
|
||||
expected = DataFrame(
|
||||
[
|
||||
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
|
||||
[2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
|
||||
[3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
|
||||
],
|
||||
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,478 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
)
|
||||
import os
|
||||
import platform
|
||||
from urllib.error import URLError
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import (
|
||||
EmptyDataError,
|
||||
ParserError,
|
||||
)
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@pytest.mark.network
|
||||
@pytest.mark.single_cpu
|
||||
def test_url(all_parsers, csv_dir_path, httpserver):
|
||||
parser = all_parsers
|
||||
kwargs = {"sep": "\t"}
|
||||
|
||||
local_path = os.path.join(csv_dir_path, "salaries.csv")
|
||||
with open(local_path, encoding="utf-8") as f:
|
||||
httpserver.serve_content(content=f.read())
|
||||
|
||||
url_result = parser.read_csv(httpserver.url, **kwargs)
|
||||
|
||||
local_result = parser.read_csv(local_path, **kwargs)
|
||||
tm.assert_frame_equal(url_result, local_result)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_local_file(all_parsers, csv_dir_path):
|
||||
parser = all_parsers
|
||||
kwargs = {"sep": "\t"}
|
||||
|
||||
local_path = os.path.join(csv_dir_path, "salaries.csv")
|
||||
local_result = parser.read_csv(local_path, **kwargs)
|
||||
url = "file://localhost/" + local_path
|
||||
|
||||
try:
|
||||
url_result = parser.read_csv(url, **kwargs)
|
||||
tm.assert_frame_equal(url_result, local_result)
|
||||
except URLError:
|
||||
# Fails on some systems.
|
||||
pytest.skip("Failing on: " + " ".join(platform.uname()))
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame.index are different
|
||||
def test_path_path_lib(all_parsers):
|
||||
parser = all_parsers
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD")),
|
||||
index=Index([f"i-{i}" for i in range(30)]),
|
||||
)
|
||||
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame.index are different
|
||||
def test_path_local_path(all_parsers):
|
||||
parser = all_parsers
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD")),
|
||||
index=Index([f"i-{i}" for i in range(30)]),
|
||||
)
|
||||
result = tm.round_trip_localpath(
|
||||
df.to_csv, lambda p: parser.read_csv(p, index_col=0)
|
||||
)
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
|
||||
def test_nonexistent_path(all_parsers):
|
||||
# gh-2428: pls no segfault
|
||||
# gh-14086: raise more helpful FileNotFoundError
|
||||
# GH#29233 "File foo" instead of "File b'foo'"
|
||||
parser = all_parsers
|
||||
path = f"{uuid.uuid4()}.csv"
|
||||
|
||||
msg = r"\[Errno 2\]"
|
||||
with pytest.raises(FileNotFoundError, match=msg) as e:
|
||||
parser.read_csv(path)
|
||||
assert path == e.value.filename
|
||||
|
||||
|
||||
@td.skip_if_windows # os.chmod does not work in windows
|
||||
def test_no_permission(all_parsers):
|
||||
# GH 23784
|
||||
parser = all_parsers
|
||||
|
||||
msg = r"\[Errno 13\]"
|
||||
with tm.ensure_clean() as path:
|
||||
os.chmod(path, 0) # make file unreadable
|
||||
|
||||
# verify that this process cannot open the file (not running as sudo)
|
||||
try:
|
||||
with open(path, encoding="utf-8"):
|
||||
pass
|
||||
pytest.skip("Running as sudo.")
|
||||
except PermissionError:
|
||||
pass
|
||||
|
||||
with pytest.raises(PermissionError, match=msg) as e:
|
||||
parser.read_csv(path)
|
||||
assert path == e.value.filename
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected,msg",
|
||||
[
|
||||
# gh-10728: WHITESPACE_LINE
|
||||
(
|
||||
"a,b,c\n4,5,6\n ",
|
||||
{},
|
||||
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# gh-10548: EAT_LINE_COMMENT
|
||||
(
|
||||
"a,b,c\n4,5,6\n#comment",
|
||||
{"comment": "#"},
|
||||
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# EAT_CRNL_NOP
|
||||
(
|
||||
"a,b,c\n4,5,6\n\r",
|
||||
{},
|
||||
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# EAT_COMMENT
|
||||
(
|
||||
"a,b,c\n4,5,6#comment",
|
||||
{"comment": "#"},
|
||||
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# SKIP_LINE
|
||||
(
|
||||
"a,b,c\n4,5,6\nskipme",
|
||||
{"skiprows": [2]},
|
||||
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# EAT_LINE_COMMENT
|
||||
(
|
||||
"a,b,c\n4,5,6\n#comment",
|
||||
{"comment": "#", "skip_blank_lines": False},
|
||||
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# IN_FIELD
|
||||
(
|
||||
"a,b,c\n4,5,6\n ",
|
||||
{"skip_blank_lines": False},
|
||||
DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# EAT_CRNL
|
||||
(
|
||||
"a,b,c\n4,5,6\n\r",
|
||||
{"skip_blank_lines": False},
|
||||
DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
|
||||
None,
|
||||
),
|
||||
# ESCAPED_CHAR
|
||||
(
|
||||
"a,b,c\n4,5,6\n\\",
|
||||
{"escapechar": "\\"},
|
||||
None,
|
||||
"(EOF following escape character)|(unexpected end of data)",
|
||||
),
|
||||
# ESCAPE_IN_QUOTED_FIELD
|
||||
(
|
||||
'a,b,c\n4,5,6\n"\\',
|
||||
{"escapechar": "\\"},
|
||||
None,
|
||||
"(EOF inside string starting at row 2)|(unexpected end of data)",
|
||||
),
|
||||
# IN_QUOTED_FIELD
|
||||
(
|
||||
'a,b,c\n4,5,6\n"',
|
||||
{"escapechar": "\\"},
|
||||
None,
|
||||
"(EOF inside string starting at row 2)|(unexpected end of data)",
|
||||
),
|
||||
],
|
||||
ids=[
|
||||
"whitespace-line",
|
||||
"eat-line-comment",
|
||||
"eat-crnl-nop",
|
||||
"eat-comment",
|
||||
"skip-line",
|
||||
"eat-line-comment",
|
||||
"in-field",
|
||||
"eat-crnl",
|
||||
"escaped-char",
|
||||
"escape-in-quoted-field",
|
||||
"in-quoted-field",
|
||||
],
|
||||
)
|
||||
def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
|
||||
# see gh-10728, gh-10548
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and "comment" in kwargs:
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
return
|
||||
|
||||
if parser.engine == "pyarrow" and "\r" not in data:
|
||||
# pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1:
|
||||
# ValueError: skiprows argument must be an integer when using engine='pyarrow'
|
||||
# AssertionError: Regex pattern did not match.
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
if expected is None:
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
else:
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_temporary_file(all_parsers):
|
||||
# see gh-13398
|
||||
parser = all_parsers
|
||||
data = "0 0"
|
||||
|
||||
with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
|
||||
new_file.write(data)
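# Flush and rewind the handle so read_csv below sees the row just written.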
|
||||
new_file.flush()
|
||||
new_file.seek(0)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "the 'pyarrow' engine does not support regex separators"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(new_file, sep=r"\s+", header=None)
|
||||
return
|
||||
|
||||
result = parser.read_csv(new_file, sep=r"\s+", header=None)
|
||||
|
||||
expected = DataFrame([[0, 0]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_internal_eof_byte(all_parsers):
|
||||
# see gh-5500
|
||||
parser = all_parsers
|
||||
data = "a,b\n1\x1a,2"
|
||||
|
||||
expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_internal_eof_byte_to_file(all_parsers):
|
||||
# see gh-16559
|
||||
parser = all_parsers
|
||||
data = b'c1,c2\r\n"test \x1a test", test\r\n'
|
||||
expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
|
||||
path = f"__{uuid.uuid4()}__.csv"
|
||||
|
||||
with tm.ensure_clean(path) as path:
|
||||
with open(path, "wb") as f:
|
||||
f.write(data)
|
||||
|
||||
result = parser.read_csv(path)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_file_handle_string_io(all_parsers):
|
||||
# gh-14418
|
||||
#
|
||||
# Don't close user provided file handles.
|
||||
parser = all_parsers
|
||||
data = "a,b\n1,2"
|
||||
|
||||
fh = StringIO(data)
|
||||
parser.read_csv(fh)
|
||||
assert not fh.closed
|
||||
|
||||
|
||||
def test_file_handles_with_open(all_parsers, csv1):
|
||||
# gh-14418
|
||||
#
|
||||
# Don't close user provided file handles.
|
||||
parser = all_parsers
|
||||
|
||||
for mode in ["r", "rb"]:
|
||||
with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f:
|
||||
parser.read_csv(f)
|
||||
assert not f.closed
|
||||
|
||||
|
||||
def test_invalid_file_buffer_class(all_parsers):
|
||||
# see gh-15337
|
||||
class InvalidBuffer:
|
||||
pass
|
||||
|
||||
parser = all_parsers
|
||||
msg = "Invalid file path or buffer object type"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(InvalidBuffer())
|
||||
|
||||
|
||||
def test_invalid_file_buffer_mock(all_parsers):
|
||||
# see gh-15337
|
||||
parser = all_parsers
|
||||
msg = "Invalid file path or buffer object type"
|
||||
|
||||
class Foo:
|
||||
pass
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(Foo())
|
||||
|
||||
|
||||
def test_valid_file_buffer_seems_invalid(all_parsers):
|
||||
# gh-16135: we want to ensure that "tell" and "seek"
|
||||
# aren't actually being used when we call `read_csv`
|
||||
#
|
||||
# Thus, while the object may look "invalid" (these
|
||||
# methods are attributes of the `StringIO` class),
|
||||
# it is still a valid file-object for our purposes.
|
||||
class NoSeekTellBuffer(StringIO):
|
||||
def tell(self):
|
||||
raise AttributeError("No tell method")
|
||||
|
||||
def seek(self, pos, whence=0):
|
||||
raise AttributeError("No seek method")
|
||||
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(NoSeekTellBuffer(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
|
||||
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||||
def test_read_csv_file_handle(all_parsers, io_class, encoding):
|
||||
"""
|
||||
Test that read_csv does not close user-provided file handles.
|
||||
|
||||
GH 36980
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
|
||||
content = "a,b\n1,2"
|
||||
handle = io_class(content.encode("utf-8") if io_class == BytesIO else content)
|
||||
|
||||
tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected)
|
||||
assert not handle.closed
|
||||
|
||||
|
||||
def test_memory_map_compression(all_parsers, compression):
|
||||
"""
|
||||
Support memory map for compressed files.
|
||||
|
||||
GH 37621
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
expected.to_csv(path, index=False, compression=compression)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(path, memory_map=True, compression=compression)
|
||||
return
|
||||
|
||||
result = parser.read_csv(path, memory_map=True, compression=compression)
|
||||
|
||||
tm.assert_frame_equal(
|
||||
result,
|
||||
expected,
|
||||
)
|
||||
|
||||
|
||||
def test_context_manager(all_parsers, datapath):
|
||||
# make sure that opened files are closed
|
||||
parser = all_parsers
|
||||
|
||||
path = datapath("io", "data", "csv", "iris.csv")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(path, chunksize=1)
|
||||
return
|
||||
|
||||
reader = parser.read_csv(path, chunksize=1)
|
||||
assert not reader.handles.handle.closed
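# Raise deliberately inside the with-block to verify that the reader's
# context manager closes the underlying file handle on exit.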
|
||||
try:
|
||||
with reader:
|
||||
next(reader)
|
||||
assert False
|
||||
except AssertionError:
|
||||
assert reader.handles.handle.closed
|
||||
|
||||
|
||||
def test_context_manager_user_provided(all_parsers, datapath):
|
||||
# make sure that user-provided handles are not closed
|
||||
parser = all_parsers
|
||||
|
||||
with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(path, chunksize=1)
|
||||
return
|
||||
|
||||
reader = parser.read_csv(path, chunksize=1)
|
||||
assert not reader.handles.handle.closed
|
||||
try:
|
||||
with reader:
|
||||
next(reader)
|
||||
assert False
|
||||
except AssertionError:
|
||||
assert not reader.handles.handle.closed
|
||||
|
||||
|
||||
@skip_pyarrow # ParserError: Empty CSV file
|
||||
def test_file_descriptor_leak(all_parsers, using_copy_on_write):
|
||||
# GH 31488
|
||||
parser = all_parsers
|
||||
with tm.ensure_clean() as path:
|
||||
with pytest.raises(EmptyDataError, match="No columns to parse from file"):
|
||||
parser.read_csv(path)
|
||||
|
||||
|
||||
def test_memory_map(all_parsers, csv_dir_path):
|
||||
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
|
||||
)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(mmap_file, memory_map=True)
|
||||
return
|
||||
|
||||
result = parser.read_csv(mmap_file, memory_map=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,79 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import is_platform_linux
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block
|
||||
def test_float_parser(all_parsers):
|
||||
# see gh-9565
|
||||
parser = all_parsers
|
||||
data = "45e-1,4.5,45.,inf,-inf"
|
||||
result = parser.read_csv(StringIO(data), header=None)
|
||||
|
||||
expected = DataFrame([[float(s) for s in data.split(",")]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_scientific_no_exponent(all_parsers_all_precisions):
|
||||
# see gh-12215
|
||||
df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
|
||||
data = df.to_csv(index=False)
|
||||
parser, precision = all_parsers_all_precisions
|
||||
|
||||
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
|
||||
tm.assert_frame_equal(df_roundtrip, df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"neg_exp",
|
||||
[
|
||||
-617,
|
||||
-100000,
|
||||
pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan),
|
||||
],
|
||||
)
|
||||
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
|
||||
# GH#38753
|
||||
parser, precision = all_parsers_all_precisions
|
||||
|
||||
data = f"data\n10E{neg_exp}"
|
||||
result = parser.read_csv(StringIO(data), float_precision=precision)
|
||||
expected = DataFrame({"data": [0.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.skip_ubsan
|
||||
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
|
||||
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
|
||||
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
|
||||
# GH#38753
|
||||
parser, precision = all_parsers_all_precisions
|
||||
data = f"data\n10E{exp}"
|
||||
result = parser.read_csv(StringIO(data), float_precision=precision)
|
||||
if precision == "round_trip":
|
||||
if exp == 999999999999999999 and is_platform_linux():
|
||||
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
|
||||
request.applymarker(mark)
|
||||
|
||||
value = np.inf if exp > 0 else 0.0
|
||||
expected = DataFrame({"data": [value]})
|
||||
else:
|
||||
expected = DataFrame({"data": [f"10E{exp}"]})
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,304 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
(
|
||||
"""foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
""",
|
||||
{"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
|
||||
DataFrame(
|
||||
[
|
||||
[2, 3, 4, 5],
|
||||
[7, 8, 9, 10],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
],
|
||||
index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
|
||||
columns=["A", "B", "C", "D"],
|
||||
),
|
||||
),
|
||||
(
|
||||
"""foo,one,2,3,4,5
|
||||
foo,two,7,8,9,10
|
||||
foo,three,12,13,14,15
|
||||
bar,one,12,13,14,15
|
||||
bar,two,12,13,14,15
|
||||
""",
|
||||
{"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
|
||||
DataFrame(
|
||||
[
|
||||
[2, 3, 4, 5],
|
||||
[7, 8, 9, 10],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
],
|
||||
index=MultiIndex.from_tuples(
|
||||
[
|
||||
("foo", "one"),
|
||||
("foo", "two"),
|
||||
("foo", "three"),
|
||||
("bar", "one"),
|
||||
("bar", "two"),
|
||||
],
|
||||
names=["index1", "index2"],
|
||||
),
|
||||
columns=["A", "B", "C", "D"],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
|
||||
def test_multi_index_no_level_names(
|
||||
request, all_parsers, index_col, using_infer_string
|
||||
):
|
||||
data = """index1,index2,A,B,C,D
|
||||
foo,one,2,3,4,5
|
||||
foo,two,7,8,9,10
|
||||
foo,three,12,13,14,15
|
||||
bar,one,12,13,14,15
|
||||
bar,two,12,13,14,15
|
||||
"""
|
||||
headless_data = "\n".join(data.split("\n")[1:])
|
||||
|
||||
names = ["A", "B", "C", "D"]
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(headless_data), index_col=index_col, header=None, names=names
|
||||
)
|
||||
expected = parser.read_csv(StringIO(data), index_col=index_col)
|
||||
|
||||
# No index names in headless data.
|
||||
expected.index.names = [None] * 2
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_multi_index_no_level_names_implicit(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A,B,C,D
|
||||
foo,one,2,3,4,5
|
||||
foo,two,7,8,9,10
|
||||
foo,three,12,13,14,15
|
||||
bar,one,12,13,14,15
|
||||
bar,two,12,13,14,15
|
||||
"""
|
||||
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[
|
||||
[2, 3, 4, 5],
|
||||
[7, 8, 9, 10],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
],
|
||||
columns=["A", "B", "C", "D"],
|
||||
index=MultiIndex.from_tuples(
|
||||
[
|
||||
("foo", "one"),
|
||||
("foo", "two"),
|
||||
("foo", "three"),
|
||||
("bar", "one"),
|
||||
("bar", "two"),
|
||||
]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected,header",
|
||||
[
|
||||
("a,b", DataFrame(columns=["a", "b"]), [0]),
|
||||
(
|
||||
"a,b\nc,d",
|
||||
DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
|
||||
[0, 1],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("round_trip", [True, False])
|
||||
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
|
||||
# see gh-14545
|
||||
parser = all_parsers
|
||||
data = expected.to_csv(index=False) if round_trip else data
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=header)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame.columns are different
|
||||
def test_no_unnamed_index(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """ id c0 c1 c2
|
||||
0 1 0 a b
|
||||
1 2 0 c d
|
||||
2 2 2 e f
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), sep=" ")
|
||||
expected = DataFrame(
|
||||
[[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
|
||||
columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_duplicate_index_explicit(all_parsers):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo,12,13,14,15
|
||||
bar,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=0)
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
[2, 3, 4, 5],
|
||||
[7, 8, 9, 10],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
],
|
||||
columns=["A", "B", "C", "D"],
|
||||
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_read_duplicate_index_implicit(all_parsers):
|
||||
data = """A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo,12,13,14,15
|
||||
bar,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
[2, 3, 4, 5],
|
||||
[7, 8, 9, 10],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
[12, 13, 14, 15],
|
||||
],
|
||||
columns=["A", "B", "C", "D"],
|
||||
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
|
||||
parser = all_parsers
|
||||
csv2 = os.path.join(csv_dir_path, "test2.csv")
|
||||
result = parser.read_csv(csv2, index_col=0, parse_dates=True)
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
|
||||
[1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
|
||||
[0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
|
||||
[1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
|
||||
[-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
|
||||
],
|
||||
columns=["A", "B", "C", "D", "E"],
|
||||
index=Index(
|
||||
[
|
||||
datetime(2000, 1, 3),
|
||||
datetime(2000, 1, 4),
|
||||
datetime(2000, 1, 5),
|
||||
datetime(2000, 1, 6),
|
||||
datetime(2000, 1, 7),
|
||||
]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
def test_empty_with_index(all_parsers):
|
||||
# see gh-10184
|
||||
data = "x,y"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=0)
|
||||
|
||||
expected = DataFrame(columns=["y"], index=Index([], name="x"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# CSV parse error: Empty CSV file or block: cannot infer number of columns
|
||||
@skip_pyarrow
|
||||
def test_empty_with_multi_index(all_parsers):
|
||||
# see gh-10467
|
||||
data = "x,y,z"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=["x", "y"])
|
||||
|
||||
expected = DataFrame(
|
||||
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# CSV parse error: Empty CSV file or block: cannot infer number of columns
|
||||
@skip_pyarrow
|
||||
def test_empty_with_reversed_multi_index(all_parsers):
|
||||
data = "x,y,z"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=[1, 0])
|
||||
|
||||
expected = DataFrame(
|
||||
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,78 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame.index are different
|
||||
@pytest.mark.parametrize("na_filter", [True, False])
|
||||
def test_inf_parsing(all_parsers, na_filter):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
,A
|
||||
a,inf
|
||||
b,-inf
|
||||
c,+Inf
|
||||
d,-Inf
|
||||
e,INF
|
||||
f,-INF
|
||||
g,+INf
|
||||
h,-INf
|
||||
i,inF
|
||||
j,-inF"""
|
||||
expected = DataFrame(
|
||||
{"A": [float("inf"), float("-inf")] * 5},
|
||||
index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
|
||||
)
|
||||
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame.index are different
|
||||
@pytest.mark.parametrize("na_filter", [True, False])
|
||||
def test_infinity_parsing(all_parsers, na_filter):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
,A
|
||||
a,Infinity
|
||||
b,-Infinity
|
||||
c,+Infinity
|
||||
"""
|
||||
expected = DataFrame(
|
||||
{"A": [float("infinity"), float("-infinity"), float("+infinity")]},
|
||||
index=["a", "b", "c"],
|
||||
)
|
||||
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_with_use_inf_as_na(all_parsers):
|
||||
# https://github.com/pandas-dev/pandas/issues/35493
|
||||
parser = all_parsers
|
||||
data = "1.0\nNaN\n3.0"
|
||||
msg = "use_inf_as_na option is deprecated"
|
||||
warn = FutureWarning
|
||||
if parser.engine == "pyarrow":
|
||||
warn = (FutureWarning, DeprecationWarning)
|
||||
|
||||
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
|
||||
with option_context("use_inf_as_na", True):
|
||||
result = parser.read_csv(StringIO(data), header=None)
|
||||
expected = DataFrame([1.0, np.nan, 3.0])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,231 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
def test_int_conversion(all_parsers):
|
||||
data = """A,B
|
||||
1.0,1
|
||||
2.0,2
|
||||
3.0,3
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
(
|
||||
"A,B\nTrue,1\nFalse,2\nTrue,3",
|
||||
{},
|
||||
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
|
||||
),
|
||||
(
|
||||
"A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
|
||||
{"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
|
||||
DataFrame(
|
||||
[[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
|
||||
columns=["A", "B"],
|
||||
),
|
||||
),
|
||||
(
|
||||
"A,B\nTRUE,1\nFALSE,2\nTRUE,3",
|
||||
{},
|
||||
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
|
||||
),
|
||||
(
|
||||
"A,B\nfoo,bar\nbar,foo",
|
||||
{"true_values": ["foo"], "false_values": ["bar"]},
|
||||
DataFrame([[True, False], [False, True]], columns=["A", "B"]),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_parse_bool(all_parsers, data, kwargs, expected):
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_integers_above_fp_precision(all_parsers):
|
||||
data = """Numbers
|
||||
17007000002000191
|
||||
17007000002000191
|
||||
17007000002000191
|
||||
17007000002000191
|
||||
17007000002000192
|
||||
17007000002000192
|
||||
17007000002000192
|
||||
17007000002000192
|
||||
17007000002000192
|
||||
17007000002000194"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Numbers": [
|
||||
17007000002000191,
|
||||
17007000002000191,
|
||||
17007000002000191,
|
||||
17007000002000191,
|
||||
17007000002000192,
|
||||
17007000002000192,
|
||||
17007000002000192,
|
||||
17007000002000192,
|
||||
17007000002000192,
|
||||
17007000002000194,
|
||||
]
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sep", [" ", r"\s+"])
|
||||
def test_integer_overflow_bug(all_parsers, sep):
|
||||
# see gh-2601
|
||||
data = "65248E10 11\n55555E55 22\n"
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow" and sep != " ":
|
||||
msg = "the 'pyarrow' engine does not support regex separators"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=None, sep=sep)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None, sep=sep)
|
||||
expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_int64_min_issues(all_parsers):
|
||||
# see gh-2599
|
||||
parser = all_parsers
|
||||
data = "A,B\n0,0\n0,"
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
|
||||
def test_int64_overflow(all_parsers, conv, request):
|
||||
data = """ID
|
||||
00013007854817840016671868
|
||||
00013007854817840016749251
|
||||
00013007854817840016754630
|
||||
00013007854817840016781876
|
||||
00013007854817840017028824
|
||||
00013007854817840017963235
|
||||
00013007854817840018860166"""
|
||||
parser = all_parsers
|
||||
|
||||
if conv is None:
|
||||
# 13007854817840016671868 > UINT64_MAX, so this
|
||||
# will overflow and return object as the dtype.
|
||||
if parser.engine == "pyarrow":
|
||||
mark = pytest.mark.xfail(reason="parses to float64")
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[
|
||||
"00013007854817840016671868",
|
||||
"00013007854817840016749251",
|
||||
"00013007854817840016754630",
|
||||
"00013007854817840016781876",
|
||||
"00013007854817840017028824",
|
||||
"00013007854817840017963235",
|
||||
"00013007854817840018860166",
|
||||
],
|
||||
columns=["ID"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
# 13007854817840016671868 > UINT64_MAX, so attempts
|
||||
# to cast to either int64 or uint64 will result in
|
||||
# an OverflowError being raised.
|
||||
msg = "|".join(
|
||||
[
|
||||
"Python int too large to convert to C long",
|
||||
"long too big to convert",
|
||||
"int too big to convert",
|
||||
]
|
||||
)
|
||||
err = OverflowError
|
||||
if parser.engine == "pyarrow":
|
||||
err = ValueError
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
|
||||
with pytest.raises(err, match=msg):
|
||||
parser.read_csv(StringIO(data), converters={"ID": conv})
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
@pytest.mark.parametrize(
|
||||
"val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
|
||||
)
|
||||
def test_int64_uint64_range(all_parsers, val):
|
||||
# These numbers fall right inside the int64-uint64
|
||||
# range, so they should be parsed as integers.
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(str(val)), header=None)
|
||||
|
||||
expected = DataFrame([val])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
@pytest.mark.parametrize(
|
||||
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
|
||||
)
|
||||
def test_outside_int64_uint64_range(all_parsers, val):
|
||||
# These numbers fall just outside the int64-uint64
|
||||
# range, so they should be parsed as string.
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(str(val)), header=None)
|
||||
|
||||
expected = DataFrame([str(val)])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # gets float64 dtype instead of object
|
||||
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
|
||||
def test_numeric_range_too_wide(all_parsers, exp_data):
|
||||
# No numerical dtype can hold both negative and uint64
|
||||
# values, so they should be cast as string.
|
||||
parser = all_parsers
|
||||
data = "\n".join(exp_data)
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_integer_precision(all_parsers):
|
||||
# GH 7072
|
||||
s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
|
||||
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(s), header=None)[4]
|
||||
expected = Series([4321583677327450765, 4321113141090630389], name=4)
|
||||
tm.assert_series_equal(result, expected)
|
||||
@ -0,0 +1,134 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
def test_iterator(all_parsers):
|
||||
# see gh-6607
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
parser = all_parsers
|
||||
kwargs = {"index_col": 0}
|
||||
|
||||
expected = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), iterator=True, **kwargs)
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
|
||||
first_chunk = reader.read(3)
|
||||
tm.assert_frame_equal(first_chunk, expected[:3])
|
||||
|
||||
last_chunk = reader.read(5)
|
||||
tm.assert_frame_equal(last_chunk, expected[3:])
|
||||
|
||||
|
||||
def test_iterator2(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A,B,C
|
||||
foo,1,2,3
|
||||
bar,4,5,6
|
||||
baz,7,8,9
|
||||
"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), iterator=True)
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), iterator=True) as reader:
|
||||
result = list(reader)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["foo", "bar", "baz"],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(result[0], expected)
|
||||
|
||||
|
||||
def test_iterator_stop_on_chunksize(all_parsers):
|
||||
# gh-3967: stopping iteration when chunksize is specified
|
||||
parser = all_parsers
|
||||
data = """A,B,C
|
||||
foo,1,2,3
|
||||
bar,4,5,6
|
||||
baz,7,8,9
|
||||
"""
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), chunksize=1)
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), chunksize=1) as reader:
|
||||
result = list(reader)
|
||||
|
||||
assert len(result) == 3
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["foo", "bar", "baz"],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(concat(result), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}]
|
||||
)
|
||||
def test_iterator_skipfooter_errors(all_parsers, kwargs):
|
||||
msg = "'skipfooter' not supported for iteration"
|
||||
parser = all_parsers
|
||||
data = "a\n1\n2"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = (
|
||||
"The '(chunksize|iterator)' option is not supported with the "
|
||||
"'pyarrow' engine"
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
|
||||
pass
|
||||
|
||||
|
||||
def test_iteration_open_handle(all_parsers):
|
||||
parser = all_parsers
|
||||
kwargs = {"header": None}
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")
|
||||
|
||||
with open(path, encoding="utf-8") as f:
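# Advance the handle past the "CCC" line so read_csv only parses the remaining rows.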
|
||||
for line in f:
|
||||
if "CCC" in line:
|
||||
break
|
||||
|
||||
result = parser.read_csv(f, **kwargs)
|
||||
expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,320 @@
|
||||
"""
|
||||
Tests that work on the Python, C and PyArrow engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
import codecs
|
||||
import csv
|
||||
from io import StringIO
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PY311
|
||||
from pandas.errors import (
|
||||
EmptyDataError,
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
def test_empty_decimal_marker(all_parsers):
|
||||
data = """A|B|C
|
||||
1|2,334|5
|
||||
10|13|10.
|
||||
"""
|
||||
# Parsers support only length-1 decimals
|
||||
msg = "Only length-1 decimal markers supported"
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = (
|
||||
"only single character unicode strings can be "
|
||||
"converted to Py_UCS4, got length 0"
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), decimal="")
|
||||
|
||||
|
||||
def test_bad_stream_exception(all_parsers, csv_dir_path):
|
||||
# see gh-13652
|
||||
#
|
||||
# This test validates that both the Python engine and C engine will
|
||||
# raise UnicodeDecodeError instead of C engine raising ParserError
|
||||
# and swallowing the exception that caused read to fail.
|
||||
path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
|
||||
codec = codecs.lookup("utf-8")
|
||||
utf8 = codecs.lookup("utf-8")
|
||||
parser = all_parsers
|
||||
msg = "'utf-8' codec can't decode byte"
|
||||
|
||||
# Stream must be binary UTF8.
|
||||
with open(path, "rb") as handle, codecs.StreamRecoder(
|
||||
handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter
|
||||
) as stream:
|
||||
with pytest.raises(UnicodeDecodeError, match=msg):
|
||||
parser.read_csv(stream)
|
||||
|
||||
|
||||
def test_malformed(all_parsers):
|
||||
# see gh-6607
|
||||
parser = all_parsers
|
||||
data = """ignore
|
||||
A,B,C
|
||||
1,2,3 # comment
|
||||
1,2,3,4,5
|
||||
2,3,4
|
||||
"""
|
||||
msg = "Expected 3 fields in line 4, saw 5"
|
||||
err = ParserError
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
err = ValueError
|
||||
with pytest.raises(err, match=msg):
|
||||
parser.read_csv(StringIO(data), header=1, comment="#")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", [5, 3, None])
|
||||
def test_malformed_chunks(all_parsers, nrows):
|
||||
data = """ignore
|
||||
A,B,C
|
||||
skip
|
||||
1,2,3
|
||||
3,5,10 # comment
|
||||
1,2,3,4,5
|
||||
2,3,4
|
||||
"""
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
header=1,
|
||||
comment="#",
|
||||
iterator=True,
|
||||
chunksize=1,
|
||||
skiprows=[2],
|
||||
)
|
||||
return
|
||||
|
||||
msg = "Expected 3 fields in line 6, saw 5"
|
||||
with parser.read_csv(
|
||||
StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2]
|
||||
) as reader:
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
reader.read(nrows)
|
||||
|
||||
|
||||
@xfail_pyarrow # does not raise
|
||||
def test_catch_too_many_names(all_parsers):
|
||||
# see gh-5156
|
||||
data = """\
|
||||
1,2,3
|
||||
4,,6
|
||||
7,8,9
|
||||
10,11,12\n"""
|
||||
parser = all_parsers
|
||||
msg = (
|
||||
"Too many columns specified: expected 4 and found 3"
|
||||
if parser.engine == "c"
|
||||
else "Number of passed names did not match "
|
||||
"number of header fields in the file"
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
|
||||
def test_raise_on_no_columns(all_parsers, nrows):
|
||||
parser = all_parsers
|
||||
data = "\n" * nrows
|
||||
|
||||
msg = "No columns to parse from file"
|
||||
with pytest.raises(EmptyDataError, match=msg):
|
||||
parser.read_csv(StringIO(data))
|
||||
|
||||
|
||||
def test_unexpected_keyword_parameter_exception(all_parsers):
|
||||
# GH-34976
|
||||
parser = all_parsers
|
||||
|
||||
msg = "{}\\(\\) got an unexpected keyword argument 'foo'"
|
||||
with pytest.raises(TypeError, match=msg.format("read_csv")):
|
||||
parser.read_csv("foo.csv", foo=1)
|
||||
with pytest.raises(TypeError, match=msg.format("read_table")):
|
||||
parser.read_table("foo.tsv", foo=1)
|
||||
|
||||
|
||||
def test_suppress_error_output(all_parsers):
|
||||
# see gh-15925
|
||||
parser = all_parsers
|
||||
data = "a\n1\n1,2,3\n4\n5,6,7"
|
||||
expected = DataFrame({"a": [1, 4]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), on_bad_lines="skip")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_error_bad_lines(all_parsers):
|
||||
# see gh-15925
|
||||
parser = all_parsers
|
||||
data = "a\n1\n1,2,3\n4\n5,6,7"
|
||||
|
||||
msg = "Expected 1 fields in line 3, saw 3"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
# "CSV parse error: Expected 1 columns, got 3: 1,2,3"
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), on_bad_lines="error")
|
||||
|
||||
|
||||
def test_warn_bad_lines(all_parsers):
|
||||
# see gh-15925
|
||||
parser = all_parsers
|
||||
data = "a\n1\n1,2,3\n4\n5,6,7"
|
||||
expected = DataFrame({"a": [1, 4]})
|
||||
match_msg = "Skipping line"
|
||||
|
||||
expected_warning = ParserWarning
|
||||
if parser.engine == "pyarrow":
|
||||
match_msg = "Expected 1 columns, but found 3: 1,2,3"
|
||||
expected_warning = (ParserWarning, DeprecationWarning)
|
||||
|
||||
with tm.assert_produces_warning(
|
||||
expected_warning, match=match_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_wrong_num_columns(all_parsers):
|
||||
# Line 3 has more fields than the 6 columns declared in the header.
|
||||
data = """A,B,C,D,E,F
|
||||
1,2,3,4,5,6
|
||||
6,7,8,9,10,11,12
|
||||
11,12,13,14,15,16
|
||||
"""
|
||||
parser = all_parsers
|
||||
msg = "Expected 6 fields in line 3, saw 7"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
# Expected 6 columns, got 7: 6,7,8,9,10,11,12
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data))
|
||||
|
||||
|
||||
def test_null_byte_char(request, all_parsers):
|
||||
# see gh-2741
|
||||
data = "\x00,foo"
|
||||
names = ["a", "b"]
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "c" or (parser.engine == "python" and PY311):
|
||||
if parser.engine == "python" and PY311:
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
reason="In Python 3.11, this is read as an empty character not null"
|
||||
)
|
||||
)
|
||||
expected = DataFrame([[np.nan, "foo"]], columns=names)
|
||||
out = parser.read_csv(StringIO(data), names=names)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
else:
|
||||
if parser.engine == "pyarrow":
|
||||
# CSV parse error: Empty CSV file or block: "
|
||||
# cannot infer number of columns"
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
else:
|
||||
msg = "NULL byte detected"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("always::ResourceWarning")
|
||||
def test_open_file(request, all_parsers):
|
||||
# GH 39024
|
||||
parser = all_parsers
|
||||
|
||||
msg = "Could not determine delimiter"
|
||||
err = csv.Error
|
||||
if parser.engine == "c":
|
||||
msg = "the 'c' engine does not support sep=None with delim_whitespace=False"
|
||||
err = ValueError
|
||||
elif parser.engine == "pyarrow":
|
||||
msg = (
|
||||
"the 'pyarrow' engine does not support sep=None with delim_whitespace=False"
|
||||
)
|
||||
err = ValueError
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
file = Path(path)
|
||||
file.write_bytes(b"\xe4\na\n1")
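# Invalid UTF-8 content; the read below is expected to fail, and the
# test checks that the file handle is not leaked in the process.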
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
# should not trigger a ResourceWarning
|
||||
with pytest.raises(err, match=msg):
|
||||
parser.read_csv(file, sep=None, encoding_errors="replace")
|
||||
|
||||
|
||||
def test_invalid_on_bad_line(all_parsers):
|
||||
parser = all_parsers
|
||||
data = "a\n1\n1,2,3\n4\n5,6,7"
|
||||
with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"):
|
||||
parser.read_csv(StringIO(data), on_bad_lines="abc")
|
||||
|
||||
|
||||
def test_bad_header_uniform_error(all_parsers):
|
||||
parser = all_parsers
|
||||
data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
|
||||
msg = "Expected 2 fields in line 2, saw 4"
|
||||
if parser.engine == "c":
|
||||
msg = (
|
||||
"Could not construct index. Requested to use 1 "
|
||||
"number of columns, but 3 left to parse."
|
||||
)
|
||||
elif parser.engine == "pyarrow":
|
||||
# "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
|
||||
|
||||
|
||||
def test_on_bad_lines_warn_correct_formatting(all_parsers):
|
||||
# see gh-15925
|
||||
parser = all_parsers
|
||||
data = """1,2
|
||||
a,b
|
||||
a,b,c
|
||||
a,b,d
|
||||
a,b
|
||||
"""
|
||||
expected = DataFrame({"1": "a", "2": ["b"] * 2})
|
||||
match_msg = "Skipping line"
|
||||
|
||||
expected_warning = ParserWarning
|
||||
if parser.engine == "pyarrow":
|
||||
match_msg = "Expected 2 columns, but found 3: a,b,c"
|
||||
expected_warning = (ParserWarning, DeprecationWarning)
|
||||
|
||||
with tm.assert_produces_warning(
|
||||
expected_warning, match=match_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,81 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
|
||||
|
||||
|
||||
def test_verbose_read(all_parsers, capsys):
|
||||
parser = all_parsers
|
||||
data = """a,b,c,d
|
||||
one,1,2,3
|
||||
one,1,2,3
|
||||
,1,2,3
|
||||
one,1,2,3
|
||||
,1,2,3
|
||||
,1,2,3
|
||||
one,1,2,3
|
||||
two,1,2,3"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data), verbose=True)
|
||||
return
|
||||
|
||||
# Engines are verbose in different ways.
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data), verbose=True)
|
||||
captured = capsys.readouterr()
|
||||
|
||||
if parser.engine == "c":
|
||||
assert "Tokenization took:" in captured.out
|
||||
assert "Parser memory cleanup took:" in captured.out
|
||||
else: # Python engine
|
||||
assert captured.out == "Filled 3 NA values in column a\n"
|
||||
|
||||
|
||||
def test_verbose_read2(all_parsers, capsys):
|
||||
parser = all_parsers
|
||||
data = """a,b,c,d
|
||||
one,1,2,3
|
||||
two,1,2,3
|
||||
three,1,2,3
|
||||
four,1,2,3
|
||||
five,1,2,3
|
||||
,1,2,3
|
||||
seven,1,2,3
|
||||
eight,1,2,3"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data), verbose=True, index_col=0)
|
||||
return
|
||||
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data), verbose=True, index_col=0)
|
||||
captured = capsys.readouterr()
|
||||
|
||||
# Engines are verbose in different ways.
|
||||
if parser.engine == "c":
|
||||
assert "Tokenization took:" in captured.out
|
||||
assert "Parser memory cleanup took:" in captured.out
|
||||
else: # Python engine
|
||||
assert captured.out == "Filled 1 NA values in column a\n"
|
||||
337
lib/python3.11/site-packages/pandas/tests/io/parser/conftest.py
Normal file
@ -0,0 +1,337 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import HAS_PYARROW
|
||||
from pandas.compat._optional import VERSIONS
|
||||
|
||||
from pandas import (
|
||||
read_csv,
|
||||
read_table,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class BaseParser:
|
||||
engine: str | None = None
|
||||
low_memory = True
|
||||
float_precision_choices: list[str | None] = []
|
||||
|
||||
def update_kwargs(self, kwargs):
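# Copy the caller's kwargs so they are not mutated, then force this
# parser's engine and low_memory settings.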
|
||||
kwargs = kwargs.copy()
|
||||
kwargs.update({"engine": self.engine, "low_memory": self.low_memory})
|
||||
|
||||
return kwargs
|
||||
|
||||
def read_csv(self, *args, **kwargs):
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
return read_csv(*args, **kwargs)
|
||||
|
||||
def read_csv_check_warnings(
|
||||
self,
|
||||
warn_type: type[Warning],
|
||||
warn_msg: str,
|
||||
*args,
|
||||
raise_on_extra_warnings=True,
|
||||
check_stacklevel: bool = True,
|
||||
**kwargs,
|
||||
):
|
||||
# We need to check the stacklevel here instead of in the tests
|
||||
# since this is where read_csv is called and where the warning
|
||||
# should point to.
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
with tm.assert_produces_warning(
|
||||
warn_type,
|
||||
match=warn_msg,
|
||||
raise_on_extra_warnings=raise_on_extra_warnings,
|
||||
check_stacklevel=check_stacklevel,
|
||||
):
|
||||
return read_csv(*args, **kwargs)
|
||||
|
||||
def read_table(self, *args, **kwargs):
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
return read_table(*args, **kwargs)
|
||||
|
||||
def read_table_check_warnings(
|
||||
self,
|
||||
warn_type: type[Warning],
|
||||
warn_msg: str,
|
||||
*args,
|
||||
raise_on_extra_warnings=True,
|
||||
**kwargs,
|
||||
):
|
||||
# We need to check the stacklevel here instead of in the tests
|
||||
# since this is where read_table is called and where the warning
|
||||
# should point to.
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
with tm.assert_produces_warning(
|
||||
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
|
||||
):
|
||||
return read_table(*args, **kwargs)
|
||||
|
||||
|
||||
class CParser(BaseParser):
|
||||
engine = "c"
|
||||
float_precision_choices = [None, "high", "round_trip"]
|
||||
|
||||
|
||||
class CParserHighMemory(CParser):
|
||||
low_memory = False
|
||||
|
||||
|
||||
class CParserLowMemory(CParser):
|
||||
low_memory = True
|
||||
|
||||
|
||||
class PythonParser(BaseParser):
|
||||
engine = "python"
|
||||
float_precision_choices = [None]
|
||||
|
||||
|
||||
class PyArrowParser(BaseParser):
|
||||
engine = "pyarrow"
|
||||
float_precision_choices = [None]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def csv_dir_path(datapath):
|
||||
"""
|
||||
The directory path to the data files needed for parser tests.
|
||||
"""
|
||||
return datapath("io", "parser", "data")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def csv1(datapath):
|
||||
"""
|
||||
The path to the data file "test1.csv" needed for parser tests.
|
||||
"""
|
||||
return os.path.join(datapath("io", "data", "csv"), "test1.csv")
|
||||
|
||||
|
||||
_cParserHighMemory = CParserHighMemory
|
||||
_cParserLowMemory = CParserLowMemory
|
||||
_pythonParser = PythonParser
|
||||
_pyarrowParser = PyArrowParser
|
||||
|
||||
_py_parsers_only = [_pythonParser]
|
||||
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
|
||||
_pyarrow_parsers_only = [
|
||||
pytest.param(
|
||||
_pyarrowParser,
|
||||
marks=[
|
||||
pytest.mark.single_cpu,
|
||||
pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"),
|
||||
],
|
||||
)
|
||||
]
|
||||
|
||||
_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]
|
||||
|
||||
_py_parser_ids = ["python"]
|
||||
_c_parser_ids = ["c_high", "c_low"]
|
||||
_pyarrow_parsers_ids = ["pyarrow"]
|
||||
|
||||
_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids]
|
||||
|
||||
|
||||
@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
|
||||
def all_parsers(request):
|
||||
"""
|
||||
Fixture for all of the CSV parsers.
|
||||
"""
|
||||
parser = request.param()
|
||||
if parser.engine == "pyarrow":
|
||||
pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
|
||||
# Try to find a way to disable threads altogether
|
||||
# for more stable CI runs
|
||||
import pyarrow
|
||||
|
||||
pyarrow.set_cpu_count(1)
|
||||
return parser
|
||||
|
||||
|
||||
@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
|
||||
def c_parser_only(request):
|
||||
"""
|
||||
Fixture for all of the CSV parsers using the C engine.
|
||||
"""
|
||||
return request.param()
|
||||
|
||||
|
||||
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
|
||||
def python_parser_only(request):
|
||||
"""
|
||||
Fixture for all of the CSV parsers using the Python engine.
|
||||
"""
|
||||
return request.param()
|
||||
|
||||
|
||||
@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
|
||||
def pyarrow_parser_only(request):
|
||||
"""
|
||||
Fixture for all of the CSV parsers using the PyArrow engine.
|
||||
"""
|
||||
return request.param()
|
||||
|
||||
|
||||
def _get_all_parser_float_precision_combinations():
|
||||
"""
|
||||
Return all allowable parser and float precision
|
||||
combinations and corresponding ids.
|
||||
"""
|
||||
params = []
|
||||
ids = []
|
||||
for parser, parser_id in zip(_all_parsers, _all_parser_ids):
|
||||
if hasattr(parser, "values"):
|
||||
# Wrapped in pytest.param, get the actual parser back
|
||||
parser = parser.values[0]
|
||||
for precision in parser.float_precision_choices:
|
||||
# Re-wrap in pytest.param for pyarrow
|
||||
mark = (
|
||||
[
|
||||
pytest.mark.single_cpu,
|
||||
pytest.mark.skipif(
|
||||
not HAS_PYARROW, reason="pyarrow is not installed"
|
||||
),
|
||||
]
|
||||
if parser.engine == "pyarrow"
|
||||
else ()
|
||||
)
|
||||
param = pytest.param((parser(), precision), marks=mark)
|
||||
params.append(param)
|
||||
ids.append(f"{parser_id}-{precision}")
|
||||
|
||||
return {"params": params, "ids": ids}
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=_get_all_parser_float_precision_combinations()["params"],
|
||||
ids=_get_all_parser_float_precision_combinations()["ids"],
|
||||
)
|
||||
def all_parsers_all_precisions(request):
|
||||
"""
|
||||
Fixture for all allowable combinations of parser
|
||||
and float precision
|
||||
"""
|
||||
return request.param
|
||||
|
||||
|
||||
_utf_values = [8, 16, 32]
|
||||
|
||||
_encoding_seps = ["", "-", "_"]
|
||||
_encoding_prefixes = ["utf", "UTF"]
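# Templates such as "utf{0}", "UTF-{0}" and "utf_{0}"; tests fill in the placeholder with a value from _utf_values.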
|
||||
|
||||
_encoding_fmts = [
|
||||
f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes
|
||||
]
|
||||
|
||||
|
||||
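# _encoding_fmts expands to the six spellings "utf{0}", "UTF{0}", "utf-{0}",
# "UTF-{0}", "utf_{0}" and "UTF_{0}"; formatted with a utf_value this yields
# encodings such as "utf-8" or "UTF_32".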
@pytest.fixture(params=_utf_values)
|
||||
def utf_value(request):
|
||||
"""
|
||||
Fixture for all possible integer values for a UTF encoding.
|
||||
"""
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=_encoding_fmts)
|
||||
def encoding_fmt(request):
|
||||
"""
|
||||
Fixture for all possible string formats of a UTF encoding.
|
||||
"""
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
("-1,0", -1.0),
|
||||
("-1,2e0", -1.2),
|
||||
("-1e0", -1.0),
|
||||
("+1e0", 1.0),
|
||||
("+1e+0", 1.0),
|
||||
("+1e-1", 0.1),
|
||||
("+,1e1", 1.0),
|
||||
("+1,e0", 1.0),
|
||||
("-,1e1", -1.0),
|
||||
("-1,e0", -1.0),
|
||||
("0,1", 0.1),
|
||||
("1,", 1.0),
|
||||
(",1", 0.1),
|
||||
("-,1", -0.1),
|
||||
("1_,", 1.0),
|
||||
("1_234,56", 1234.56),
|
||||
("1_234,56e0", 1234.56),
|
||||
# negative cases; must not parse as float
|
||||
("_", "_"),
|
||||
("-_", "-_"),
|
||||
("-_1", "-_1"),
|
||||
("-_1e0", "-_1e0"),
|
||||
("_1", "_1"),
|
||||
("_1,", "_1,"),
|
||||
("_1,_", "_1,_"),
|
||||
("_1e0", "_1e0"),
|
||||
("1,2e_1", "1,2e_1"),
|
||||
("1,2e1_0", "1,2e1_0"),
|
||||
("1,_2", "1,_2"),
|
||||
(",1__2", ",1__2"),
|
||||
(",1e", ",1e"),
|
||||
("-,1e", "-,1e"),
|
||||
("1_000,000_000", "1_000,000_000"),
|
||||
("1,e1_2", "1,e1_2"),
|
||||
("e11,2", "e11,2"),
|
||||
("1e11,2", "1e11,2"),
|
||||
("1,2,2", "1,2,2"),
|
||||
("1,2_1", "1,2_1"),
|
||||
("1,2e-10e1", "1,2e-10e1"),
|
||||
("--1,2", "--1,2"),
|
||||
("1a_2,1", "1a_2,1"),
|
||||
("1,2E-1", 0.12),
|
||||
("1,2E1", 12.0),
|
||||
]
|
||||
)
|
||||
def numeric_decimal(request):
|
||||
"""
|
||||
Fixture for decimal-comma numeric formats. The first entry
|
||||
represents the value to read while the second represents the expected result;
for the negative cases the expected result is the original, unparsed string.
|
||||
"""
|
||||
return request.param
|
||||
|
||||
|
||||
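# A minimal illustration of how such decimal-comma values are read outside the
# test wrappers (plain pandas.read_csv; the column name "x" exists only for the
# example):
#
#   import pandas as pd
#   from io import StringIO
#
#   df = pd.read_csv(StringIO("x\n1_234,56"), decimal=",", thousands="_")
#   assert df["x"].iloc[0] == 1234.56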
@pytest.fixture
|
||||
def pyarrow_xfail(request):
|
||||
"""
|
||||
Fixture that xfails a test if the engine is pyarrow.
|
||||
|
||||
Use if the failure is due to unsupported keywords or inconsistent results.
|
||||
"""
|
||||
if "all_parsers" in request.fixturenames:
|
||||
parser = request.getfixturevalue("all_parsers")
|
||||
elif "all_parsers_all_precisions" in request.fixturenames:
|
||||
# Return value is tuple of (engine, precision)
|
||||
parser = request.getfixturevalue("all_parsers_all_precisions")[0]
|
||||
else:
|
||||
return
|
||||
if parser.engine == "pyarrow":
|
||||
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
|
||||
request.applymarker(mark)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def pyarrow_skip(request):
|
||||
"""
|
||||
Fixture that skips a test if the engine is pyarrow.
|
||||
|
||||
Use if the failure is due to a parsing failure from pyarrow.csv.read_csv.
|
||||
"""
|
||||
if "all_parsers" in request.fixturenames:
|
||||
parser = request.getfixturevalue("all_parsers")
|
||||
elif "all_parsers_all_precisions" in request.fixturenames:
|
||||
# Return value is tuple of (engine, precision)
|
||||
parser = request.getfixturevalue("all_parsers_all_precisions")[0]
|
||||
else:
|
||||
return
|
||||
if parser.engine == "pyarrow":
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
@ -0,0 +1,334 @@
|
||||
"""
|
||||
Tests dtype specification during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import parsers as libparsers
|
||||
|
||||
from pandas.core.dtypes.dtypes import CategoricalDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[
|
||||
"category",
|
||||
CategoricalDtype(),
|
||||
{"a": "category", "b": "category", "c": CategoricalDtype()},
|
||||
],
|
||||
)
|
||||
def test_categorical_dtype(all_parsers, dtype):
|
||||
# see gh-10153
|
||||
parser = all_parsers
|
||||
data = """a,b,c
|
||||
1,a,3.4
|
||||
1,a,3.4
|
||||
2,b,4.5"""
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": Categorical(["1", "1", "2"]),
|
||||
"b": Categorical(["a", "a", "b"]),
|
||||
"c": Categorical(["3.4", "3.4", "4.5"]),
|
||||
}
|
||||
)
|
||||
actual = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
|
||||
def test_categorical_dtype_single(all_parsers, dtype, request):
|
||||
# see gh-10153
|
||||
parser = all_parsers
|
||||
data = """a,b,c
|
||||
1,a,3.4
|
||||
1,a,3.4
|
||||
2,b,4.5"""
|
||||
expected = DataFrame(
|
||||
{"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
mark = pytest.mark.xfail(
|
||||
strict=False,
|
||||
reason="Flaky test sometimes gives object dtype instead of Categorical",
|
||||
)
|
||||
request.applymarker(mark)
|
||||
|
||||
actual = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
|
||||
def test_categorical_dtype_unsorted(all_parsers):
|
||||
# see gh-10153
|
||||
parser = all_parsers
|
||||
data = """a,b,c
|
||||
1,b,3.4
|
||||
1,b,3.4
|
||||
2,a,4.5"""
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": Categorical(["1", "1", "2"]),
|
||||
"b": Categorical(["b", "b", "a"]),
|
||||
"c": Categorical(["3.4", "3.4", "4.5"]),
|
||||
}
|
||||
)
|
||||
actual = parser.read_csv(StringIO(data), dtype="category")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
|
||||
def test_categorical_dtype_missing(all_parsers):
|
||||
# see gh-10153
|
||||
parser = all_parsers
|
||||
data = """a,b,c
|
||||
1,b,3.4
|
||||
1,nan,3.4
|
||||
2,a,4.5"""
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": Categorical(["1", "1", "2"]),
|
||||
"b": Categorical(["b", np.nan, "a"]),
|
||||
"c": Categorical(["3.4", "3.4", "4.5"]),
|
||||
}
|
||||
)
|
||||
actual = parser.read_csv(StringIO(data), dtype="category")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
|
||||
@pytest.mark.slow
|
||||
def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
|
||||
# see gh-18186
|
||||
# was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC
|
||||
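# (DEFAULT_BUFFER_HEURISTIC caps how many rows the C reader tokenizes per
# internal chunk; shrinking it to 2**5 below makes even this small input span
# several chunks, so categories must be combined across chunk boundaries)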
parser = all_parsers
|
||||
heuristic = 2**5
|
||||
data = np.sort([str(i) for i in range(heuristic + 1)])
|
||||
expected = DataFrame({"a": Categorical(data, ordered=True)})
|
||||
with monkeypatch.context() as m:
|
||||
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
|
||||
actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
|
||||
actual["a"] = actual["a"].cat.reorder_categories(
|
||||
np.sort(actual.a.cat.categories), ordered=True
|
||||
)
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
|
||||
# see gh-10153
|
||||
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
|
||||
parser = all_parsers
|
||||
encoding = "utf-16"
|
||||
sep = "\t"
|
||||
|
||||
expected = parser.read_csv(pth, sep=sep, encoding=encoding)
|
||||
expected = expected.apply(Categorical)
|
||||
|
||||
actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
|
||||
# see gh-10153
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1,a
|
||||
1,b
|
||||
1,b
|
||||
2,c"""
|
||||
expecteds = [
|
||||
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
|
||||
DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
|
||||
]
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)
|
||||
return
|
||||
|
||||
with parser.read_csv(
|
||||
StringIO(data), dtype={"b": "category"}, chunksize=2
|
||||
) as actuals:
|
||||
for actual, expected in zip(actuals, expecteds):
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
|
||||
# see gh-10153
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1,a
|
||||
1,b
|
||||
1,b
|
||||
2,c"""
|
||||
cats = ["a", "b", "c"]
|
||||
expecteds = [
|
||||
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
|
||||
DataFrame(
|
||||
{"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
|
||||
index=[2, 3],
|
||||
),
|
||||
]
|
||||
dtype = CategoricalDtype(cats)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
|
||||
return
|
||||
|
||||
with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
|
||||
for actual, expected in zip(actuals, expecteds):
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
|
||||
# see gh-10153
|
||||
pth = os.path.join(csv_dir_path, "unicode_series.csv")
|
||||
parser = all_parsers
|
||||
encoding = "latin-1"
|
||||
|
||||
expected = parser.read_csv(pth, header=None, encoding=encoding)
|
||||
expected[1] = Categorical(expected[1])
|
||||
|
||||
actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ordered", [False, True])
|
||||
@pytest.mark.parametrize(
|
||||
"categories",
|
||||
[["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
|
||||
)
|
||||
def test_categorical_category_dtype(all_parsers, categories, ordered):
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1,a
|
||||
1,b
|
||||
1,b
|
||||
2,c"""
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 1, 1, 2],
|
||||
"b": Categorical(
|
||||
["a", "b", "b", "c"], categories=categories, ordered=ordered
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_category_dtype_unsorted(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1,a
|
||||
1,b
|
||||
1,b
|
||||
2,c"""
|
||||
dtype = CategoricalDtype(["c", "b", "a"])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 1, 1, 2],
|
||||
"b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
|
||||
}
|
||||
)
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype={"b": dtype})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_numeric(all_parsers):
|
||||
parser = all_parsers
|
||||
dtype = {"b": CategoricalDtype([1, 2, 3])}
|
||||
|
||||
data = "b\n1\n1\n2\n3"
|
||||
expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_datetime(all_parsers):
|
||||
parser = all_parsers
|
||||
dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
|
||||
dtype = {"b": CategoricalDtype(dti)}
|
||||
|
||||
data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
|
||||
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_timestamp(all_parsers):
|
||||
parser = all_parsers
|
||||
dtype = {"b": CategoricalDtype([Timestamp("2014")])}
|
||||
|
||||
data = "b\n2014-01-01\n2014-01-01"
|
||||
expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_timedelta(all_parsers):
|
||||
parser = all_parsers
|
||||
dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))}
|
||||
|
||||
data = "b\n1h\n2h\n3h"
|
||||
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
"b\nTrue\nFalse\nNA\nFalse",
|
||||
"b\ntrue\nfalse\nNA\nfalse",
|
||||
"b\nTRUE\nFALSE\nNA\nFALSE",
|
||||
"b\nTrue\nFalse\nNA\nFALSE",
|
||||
],
|
||||
)
|
||||
def test_categorical_dtype_coerces_boolean(all_parsers, data):
|
||||
# see gh-20498
|
||||
parser = all_parsers
|
||||
dtype = {"b": CategoricalDtype([False, True])}
|
||||
expected = DataFrame({"b": Categorical([True, False, None, False])})
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_unexpected_categories(all_parsers):
|
||||
parser = all_parsers
|
||||
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}
|
||||
|
||||
data = "b\nd\na\nc\nd" # Unexpected c
|
||||
expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,644 @@
|
||||
"""
|
||||
Tests dtype specification during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from collections import defaultdict
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import ParserWarning
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import IntegerArray
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [str, object])
|
||||
@pytest.mark.parametrize("check_orig", [True, False])
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
|
||||
# see gh-3795, gh-6607
|
||||
parser = all_parsers
|
||||
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((5, 2)).round(4),
|
||||
columns=list("AB"),
|
||||
index=["1A", "1B", "1C", "1D", "1E"],
|
||||
)
|
||||
|
||||
with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
|
||||
df.to_csv(path)
|
||||
|
||||
result = parser.read_csv(path, dtype=dtype, index_col=0)
|
||||
|
||||
if check_orig:
|
||||
expected = df.copy()
|
||||
result = result.astype(float)
|
||||
elif using_infer_string and dtype is str:
|
||||
expected = df.astype(str)
|
||||
else:
|
||||
expected = df.astype(str).astype(object)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_dtype_per_column(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
one,two
|
||||
1,2.5
|
||||
2,3.5
|
||||
3,4.5
|
||||
4,5.5"""
|
||||
expected = DataFrame(
|
||||
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
|
||||
)
|
||||
expected["one"] = expected["one"].astype(np.float64)
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_invalid_dtype_per_column(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
one,two
|
||||
1,2.5
|
||||
2,3.5
|
||||
3,4.5
|
||||
4,5.5"""
|
||||
|
||||
with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"):
|
||||
parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
|
||||
|
||||
|
||||
def test_raise_on_passed_int_dtype_with_nas(all_parsers):
|
||||
# see gh-2631
|
||||
parser = all_parsers
|
||||
data = """YEAR, DOY, a
|
||||
2001,106380451,10
|
||||
2001,,11
|
||||
2001,106380451,67"""
|
||||
|
||||
if parser.engine == "c":
|
||||
msg = "Integer column has NA values"
|
||||
elif parser.engine == "pyarrow":
|
||||
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
|
||||
else:
|
||||
msg = "Unable to convert column DOY"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)
|
||||
|
||||
|
||||
def test_dtype_with_converters(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1.1,2.2
|
||||
1.2,2.3"""
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)}
|
||||
)
|
||||
return
|
||||
|
||||
# Dtype spec is ignored if converters are specified.
|
||||
result = parser.read_csv_check_warnings(
|
||||
ParserWarning,
|
||||
"Both a converter and dtype were specified for column a "
|
||||
"- only the converter will be used.",
|
||||
StringIO(data),
|
||||
dtype={"a": "i8"},
|
||||
converters={"a": lambda x: str(x)},
|
||||
)
|
||||
expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
|
||||
)
|
||||
def test_numeric_dtype(all_parsers, dtype):
|
||||
data = "0\n1"
|
||||
parser = all_parsers
|
||||
expected = DataFrame([0, 1], dtype=dtype)
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_boolean_dtype(all_parsers):
|
||||
parser = all_parsers
|
||||
data = "\n".join(
|
||||
[
|
||||
"a",
|
||||
"True",
|
||||
"TRUE",
|
||||
"true",
|
||||
"1",
|
||||
"1.0",
|
||||
"False",
|
||||
"FALSE",
|
||||
"false",
|
||||
"0",
|
||||
"0.0",
|
||||
"NaN",
|
||||
"nan",
|
||||
"NA",
|
||||
"null",
|
||||
"NULL",
|
||||
]
|
||||
)
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype="boolean")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.array(
|
||||
[
|
||||
True,
|
||||
True,
|
||||
True,
|
||||
True,
|
||||
True,
|
||||
False,
|
||||
False,
|
||||
False,
|
||||
False,
|
||||
False,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
],
|
||||
dtype="boolean",
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_delimiter_with_usecols_and_parse_dates(all_parsers):
|
||||
# GH#35873
|
||||
result = all_parsers.read_csv(
|
||||
StringIO('"dump","-9,1","-9,1",20101010'),
|
||||
engine="python",
|
||||
names=["col", "col1", "col2", "col3"],
|
||||
usecols=["col1", "col2", "col3"],
|
||||
parse_dates=["col3"],
|
||||
decimal=",",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("thousands", ["_", None])
|
||||
def test_decimal_and_exponential(
|
||||
request, python_parser_only, numeric_decimal, thousands
|
||||
):
|
||||
# GH#31920
|
||||
decimal_number_check(request, python_parser_only, numeric_decimal, thousands, None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("thousands", ["_", None])
|
||||
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
|
||||
def test_1000_sep_decimal_float_precision(
|
||||
request, c_parser_only, numeric_decimal, float_precision, thousands
|
||||
):
|
||||
# test decimal and thousands sep handling across the 'float_precision'
|
||||
# settings
|
||||
decimal_number_check(
|
||||
request, c_parser_only, numeric_decimal, thousands, float_precision
|
||||
)
|
||||
text, value = numeric_decimal
|
||||
text = " " + text + " "
|
||||
if isinstance(value, str): # the negative cases (parse as text)
|
||||
value = " " + value + " "
|
||||
decimal_number_check(
|
||||
request, c_parser_only, (text, value), thousands, float_precision
|
||||
)
|
||||
|
||||
|
||||
def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision):
|
||||
# GH#31920
|
||||
value = numeric_decimal[0]
|
||||
if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"):
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}")
|
||||
)
|
||||
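# sep="|" never occurs in the input, so each line is a single field and the
# comma is interpreted purely as the decimal separator.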
df = parser.read_csv(
|
||||
StringIO(value),
|
||||
float_precision=float_precision,
|
||||
sep="|",
|
||||
thousands=thousands,
|
||||
decimal=",",
|
||||
header=None,
|
||||
)
|
||||
val = df.iloc[0, 0]
|
||||
assert val == numeric_decimal[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
|
||||
def test_skip_whitespace(c_parser_only, float_precision):
|
||||
DATA = """id\tnum\t
|
||||
1\t1.2 \t
|
||||
1\t 2.1\t
|
||||
2\t 1\t
|
||||
2\t 1.2 \t
|
||||
"""
|
||||
df = c_parser_only.read_csv(
|
||||
StringIO(DATA),
|
||||
float_precision=float_precision,
|
||||
sep="\t",
|
||||
header=0,
|
||||
dtype={1: np.float64},
|
||||
)
|
||||
tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_true_values_cast_to_bool(all_parsers):
|
||||
# GH#34655
|
||||
text = """a,b
|
||||
yes,xxx
|
||||
no,yyy
|
||||
1,zzz
|
||||
0,aaa
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(
|
||||
StringIO(text),
|
||||
true_values=["yes"],
|
||||
false_values=["no"],
|
||||
dtype={"a": "boolean"},
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
|
||||
)
|
||||
expected["a"] = expected["a"].astype("boolean")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
|
||||
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
|
||||
# GH#35211
|
||||
parser = all_parsers
|
||||
data = """a,a\n1,1"""
|
||||
dtype_dict = {"a": str, **dtypes}
|
||||
# GH#42462
|
||||
dtype_dict_copy = dtype_dict.copy()
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype_dict)
|
||||
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
|
||||
assert dtype_dict == dtype_dict_copy, "dtype dict changed"
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
|
||||
# GH#42022
|
||||
parser = all_parsers
|
||||
data = """a,a\n1,1"""
|
||||
result = parser.read_csv(StringIO(data), dtype=str)
|
||||
expected = DataFrame({"a": ["1"], "a.1": ["1"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_dtype_multi_index(all_parsers):
|
||||
# GH 42446
|
||||
parser = all_parsers
|
||||
data = "A,B,B\nX,Y,Z\n1,2,3"
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
header=list(range(2)),
|
||||
dtype={
|
||||
("A", "X"): np.int32,
|
||||
("B", "Y"): np.int32,
|
||||
("B", "Z"): np.float32,
|
||||
},
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
("A", "X"): np.int32([1]),
|
||||
("B", "Y"): np.int32([2]),
|
||||
("B", "Z"): np.float32([3]),
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
|
||||
# GH 25472
|
||||
parser = all_parsers
|
||||
dtype = any_int_ea_dtype
|
||||
|
||||
data = """a,b,c
|
||||
,3,5
|
||||
1,,6
|
||||
2,4,"""
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.array([pd.NA, 1, 2], dtype=dtype),
|
||||
"b": pd.array([3, pd.NA, 4], dtype=dtype),
|
||||
"c": pd.array([5, 6, pd.NA], dtype=dtype),
|
||||
}
|
||||
)
|
||||
actual = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
@pytest.mark.parametrize("default", ["float", "float64"])
|
||||
def test_dtypes_defaultdict(all_parsers, default):
|
||||
# GH#41574
|
||||
data = """a,b
|
||||
1,2
|
||||
"""
|
||||
dtype = defaultdict(lambda: default, a="int64")
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
expected = DataFrame({"a": [1], "b": 2.0})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
|
||||
# GH#41574
|
||||
data = """a,b,a,b,b.1
|
||||
1,2,3,4,5
|
||||
"""
|
||||
dtype = defaultdict(lambda: "float64", a="int64")
|
||||
dtype["b.1"] = "int64"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_dtypes_defaultdict_invalid(all_parsers):
|
||||
# GH#41574
|
||||
data = """a,b
|
||||
1,2
|
||||
"""
|
||||
dtype = defaultdict(lambda: "invalid_dtype", a="int64")
|
||||
parser = all_parsers
|
||||
with pytest.raises(TypeError, match="not understood"):
|
||||
parser.read_csv(StringIO(data), dtype=dtype)
|
||||
|
||||
|
||||
def test_dtype_backend(all_parsers):
|
||||
# GH#36712
|
||||
|
||||
parser = all_parsers
|
||||
|
||||
data = """a,b,c,d,e,f,g,h,i,j
|
||||
1,2.5,True,a,,,,,12-31-2019,
|
||||
3,4.5,False,b,6,7.5,True,a,12-31-2019,
|
||||
"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"]
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.Series([1, 3], dtype="Int64"),
|
||||
"b": pd.Series([2.5, 4.5], dtype="Float64"),
|
||||
"c": pd.Series([True, False], dtype="boolean"),
|
||||
"d": pd.Series(["a", "b"], dtype="string"),
|
||||
"e": pd.Series([pd.NA, 6], dtype="Int64"),
|
||||
"f": pd.Series([pd.NA, 7.5], dtype="Float64"),
|
||||
"g": pd.Series([pd.NA, True], dtype="boolean"),
|
||||
"h": pd.Series([pd.NA, "a"], dtype="string"),
|
||||
"i": pd.Series([Timestamp("2019-12-31")] * 2),
|
||||
"j": pd.Series([pd.NA, pd.NA], dtype="Int64"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dtype_backend_and_dtype(all_parsers):
|
||||
# GH#36712
|
||||
|
||||
parser = all_parsers
|
||||
|
||||
data = """a,b
|
||||
1,2.5
|
||||
,
|
||||
"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data), dtype_backend="numpy_nullable", dtype="float64"
|
||||
)
|
||||
expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dtype_backend_string(all_parsers, string_storage):
|
||||
# GH#36712
|
||||
with pd.option_context("mode.string_storage", string_storage):
|
||||
parser = all_parsers
|
||||
|
||||
data = """a,b
|
||||
a,x
|
||||
b,
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
|
||||
"b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
|
||||
},
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dtype_backend_ea_dtype_specified(all_parsers):
|
||||
# GH#491496
|
||||
data = """a,b
|
||||
1,2
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(
|
||||
StringIO(data), dtype="Int64", dtype_backend="numpy_nullable"
|
||||
)
|
||||
expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dtype_backend_pyarrow(all_parsers, request):
|
||||
# GH#36712
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
parser = all_parsers
|
||||
|
||||
data = """a,b,c,d,e,f,g,h,i,j
|
||||
1,2.5,True,a,,,,,12-31-2019,
|
||||
3,4.5,False,b,6,7.5,True,a,12-31-2019,
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.Series([1, 3], dtype="int64[pyarrow]"),
|
||||
"b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
|
||||
"c": pd.Series([True, False], dtype="bool[pyarrow]"),
|
||||
"d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
|
||||
"e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
|
||||
"f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
|
||||
"g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
|
||||
"h": pd.Series(
|
||||
[pd.NA, "a"],
|
||||
dtype=pd.ArrowDtype(pa.string()),
|
||||
),
|
||||
"i": pd.Series([Timestamp("2019-12-31")] * 2),
|
||||
"j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# pyarrow engine failing:
|
||||
# https://github.com/pandas-dev/pandas/issues/56136
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
def test_ea_int_avoid_overflow(all_parsers):
|
||||
# GH#32134
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1,1
|
||||
,1
|
||||
1582218195625938945,1
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), dtype={"a": "Int64"})
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": IntegerArray(
|
||||
np.array([1, 1, 1582218195625938945]), np.array([False, True, False])
|
||||
),
|
||||
"b": 1,
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_string_inference(all_parsers):
|
||||
# GH#54430
|
||||
dtype = pd.StringDtype(na_value=np.nan)
|
||||
|
||||
data = """a,b
|
||||
x,1
|
||||
y,2
|
||||
,3"""
|
||||
parser = all_parsers
|
||||
with pd.option_context("future.infer_string", True):
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
|
||||
columns=pd.Index(["a", "b"], dtype=dtype),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
|
||||
def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
|
||||
# GH#56047
|
||||
data = """a,b
|
||||
x,a
|
||||
y,a
|
||||
z,a"""
|
||||
parser = all_parsers
|
||||
with pd.option_context("future.infer_string", True):
|
||||
result = parser.read_csv(StringIO(data), dtype=dtype)
|
||||
|
||||
expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
|
||||
"b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
|
||||
},
|
||||
columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with pd.option_context("future.infer_string", True):
|
||||
result = parser.read_csv(StringIO(data), dtype={"a": dtype})
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
|
||||
"b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)),
|
||||
},
|
||||
columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow
|
||||
def test_accurate_parsing_of_large_integers(all_parsers):
|
||||
# GH#52505
|
||||
data = """SYMBOL,MOMENT,ID,ID_DEAL
|
||||
AAPL,20230301181139587,1925036343869802844,
|
||||
AAPL,20230301181139587,2023552585717889863,2023552585717263358
|
||||
NVDA,20230301181139587,2023552585717889863,2023552585717263359
|
||||
AMC,20230301181139587,2023552585717889863,2023552585717263360
|
||||
AMZN,20230301181139587,2023552585717889759,2023552585717263360
|
||||
MSFT,20230301181139587,2023552585717889863,2023552585717263361
|
||||
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
|
||||
orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
|
||||
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
|
||||
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
|
||||
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
|
||||
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2
|
||||
|
||||
|
||||
def test_dtypes_with_usecols(all_parsers):
|
||||
# GH#54868
|
||||
|
||||
parser = all_parsers
|
||||
data = """a,b,c
|
||||
1,2,3
|
||||
4,5,6"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object})
|
||||
if parser.engine == "pyarrow":
|
||||
values = [1, 4]
|
||||
else:
|
||||
values = ["1", "4"]
|
||||
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_col_with_dtype_no_rangeindex(all_parsers):
|
||||
data = StringIO("345.5,519.5,0\n519.5,726.5,1")
|
||||
result = all_parsers.read_csv(
|
||||
data,
|
||||
header=None,
|
||||
names=["start", "stop", "bin_id"],
|
||||
dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
|
||||
index_col="bin_id",
|
||||
).index
|
||||
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
|
||||
tm.assert_index_equal(result, expected)
|
||||
@ -0,0 +1,181 @@
|
||||
"""
|
||||
Tests dtype specification during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_dtype_all_columns_empty(all_parsers):
|
||||
# see gh-12048
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO("A,B"), dtype=str)
|
||||
|
||||
expected = DataFrame({"A": [], "B": []}, dtype=str)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_pass_dtype(all_parsers):
|
||||
parser = all_parsers
|
||||
|
||||
data = "one,two"
|
||||
result = parser.read_csv(StringIO(data), dtype={"one": "u1"})
|
||||
|
||||
expected = DataFrame(
|
||||
{"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)},
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_with_index_pass_dtype(all_parsers):
|
||||
parser = all_parsers
|
||||
|
||||
data = "one,two"
|
||||
result = parser.read_csv(
|
||||
StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"}
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_with_multi_index_pass_dtype(all_parsers):
|
||||
parser = all_parsers
|
||||
|
||||
data = "one,two,three"
|
||||
result = parser.read_csv(
|
||||
StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"}
|
||||
)
|
||||
|
||||
exp_idx = MultiIndex.from_arrays(
|
||||
[np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)],
|
||||
names=["one", "two"],
|
||||
)
|
||||
expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
|
||||
parser = all_parsers
|
||||
|
||||
data = "one,one"
|
||||
result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})
|
||||
|
||||
expected = DataFrame(
|
||||
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
|
||||
parser = all_parsers
|
||||
|
||||
data = "one,one"
|
||||
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
|
||||
|
||||
expected = DataFrame(
|
||||
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
|
||||
# see gh-9424
|
||||
parser = all_parsers
|
||||
expected = concat(
|
||||
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
|
||||
axis=1,
|
||||
)
|
||||
|
||||
data = "one,one"
|
||||
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
|
||||
# see gh-9424
|
||||
parser = all_parsers
|
||||
expected = concat(
|
||||
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
|
||||
axis=1,
|
||||
)
|
||||
expected.index = expected.index.astype(object)
|
||||
|
||||
with pytest.raises(ValueError, match="Duplicate names"):
|
||||
data = ""
|
||||
parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype,expected",
|
||||
[
|
||||
(np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
|
||||
(
|
||||
"category",
|
||||
DataFrame({"a": Categorical([]), "b": Categorical([])}),
|
||||
),
|
||||
(
|
||||
{"a": "category", "b": "category"},
|
||||
DataFrame({"a": Categorical([]), "b": Categorical([])}),
|
||||
),
|
||||
("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
|
||||
(
|
||||
"timedelta64[ns]",
|
||||
DataFrame(
|
||||
{
|
||||
"a": Series([], dtype="timedelta64[ns]"),
|
||||
"b": Series([], dtype="timedelta64[ns]"),
|
||||
},
|
||||
),
|
||||
),
|
||||
(
|
||||
{"a": np.int64, "b": np.int32},
|
||||
DataFrame(
|
||||
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
|
||||
),
|
||||
),
|
||||
(
|
||||
{0: np.int64, 1: np.int32},
|
||||
DataFrame(
|
||||
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
|
||||
),
|
||||
),
|
||||
(
|
||||
{"a": np.int64, 1: np.int32},
|
||||
DataFrame(
|
||||
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_dtype(all_parsers, dtype, expected):
|
||||
# see gh-14712
|
||||
parser = all_parsers
|
||||
data = "a,b"
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,647 @@
|
||||
"""
|
||||
Tests that apply specifically to the CParser. Unless specifically stated
|
||||
as a CParser-specific issue, the goal is to eventually move as many of
|
||||
these tests as possible out of this module as soon as the Python parser can accept
|
||||
further arguments when parsing.
|
||||
"""
|
||||
from decimal import Decimal
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
TextIOWrapper,
|
||||
)
|
||||
import mmap
|
||||
import os
|
||||
import tarfile
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat.numpy import np_version_gte1p24
|
||||
from pandas.errors import (
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"malformed",
|
||||
["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
|
||||
ids=["words pointer", "stream pointer", "lines pointer"],
|
||||
)
|
||||
def test_buffer_overflow(c_parser_only, malformed):
|
||||
# see gh-9205: test certain malformed input files that cause
|
||||
# buffer overflows in tokenizer.c
|
||||
msg = "Buffer overflow caught - possible malformed input file."
|
||||
parser = c_parser_only
|
||||
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(malformed))
|
||||
|
||||
|
||||
def test_delim_whitespace_custom_terminator(c_parser_only):
|
||||
# See gh-12912
|
||||
data = "a b c~1 2 3~4 5 6~7 8 9"
|
||||
parser = c_parser_only
|
||||
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
def test_dtype_and_names_error(c_parser_only):
|
||||
# see gh-8833: passing both dtype and names
|
||||
# resulting in an error reporting issue
|
||||
parser = c_parser_only
|
||||
data = """
|
||||
1.0 1
|
||||
2.0 2
|
||||
3.0 3
|
||||
"""
|
||||
# base cases
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
|
||||
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
|
||||
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# fallback casting
|
||||
result = parser.read_csv(
|
||||
StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
|
||||
)
|
||||
expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
|
||||
expected["a"] = expected["a"].astype(np.int32)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
data = """
|
||||
1.0 1
|
||||
nan 2
|
||||
3.0 3
|
||||
"""
|
||||
# fallback casting, but not castable
|
||||
warning = RuntimeWarning if np_version_gte1p24 else None
|
||||
with pytest.raises(ValueError, match="cannot safely convert"):
|
||||
with tm.assert_produces_warning(warning, check_stacklevel=False):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
sep=r"\s+",
|
||||
header=None,
|
||||
names=["a", "b"],
|
||||
dtype={"a": np.int32},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"match,kwargs",
|
||||
[
|
||||
# For each of these cases, all of the dtypes are valid, just unsupported.
|
||||
(
|
||||
(
|
||||
"the dtype datetime64 is not supported for parsing, "
|
||||
"pass this column using parse_dates instead"
|
||||
),
|
||||
{"dtype": {"A": "datetime64", "B": "float64"}},
|
||||
),
|
||||
(
|
||||
(
|
||||
"the dtype datetime64 is not supported for parsing, "
|
||||
"pass this column using parse_dates instead"
|
||||
),
|
||||
{"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]},
|
||||
),
|
||||
(
|
||||
"the dtype timedelta64 is not supported for parsing",
|
||||
{"dtype": {"A": "timedelta64", "B": "float64"}},
|
||||
),
|
||||
(
|
||||
f"the dtype {tm.ENDIAN}U8 is not supported for parsing",
|
||||
{"dtype": {"A": "U8"}},
|
||||
),
|
||||
],
|
||||
ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"],
|
||||
)
|
||||
def test_unsupported_dtype(c_parser_only, match, kwargs):
|
||||
parser = c_parser_only
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((5, 2)),
|
||||
columns=list("AB"),
|
||||
index=["1A", "1B", "1C", "1D", "1E"],
|
||||
)
|
||||
|
||||
with tm.ensure_clean("__unsupported_dtype__.csv") as path:
|
||||
df.to_csv(path)
|
||||
|
||||
with pytest.raises(TypeError, match=match):
|
||||
parser.read_csv(path, index_col=0, **kwargs)
|
||||
|
||||
|
||||
@td.skip_if_32bit
|
||||
@pytest.mark.slow
|
||||
# test numbers between 1 and 2
|
||||
@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
|
||||
def test_precise_conversion(c_parser_only, num):
|
||||
parser = c_parser_only
|
||||
|
||||
normal_errors = []
|
||||
precise_errors = []
|
||||
|
||||
def error(val: float, actual_val: Decimal) -> Decimal:
|
||||
return abs(Decimal(f"{val:.100}") - actual_val)
|
||||
|
||||
# 25 decimal digits of precision
|
||||
text = f"a\n{num:.25}"
|
||||
|
||||
normal_val = float(
|
||||
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
|
||||
)
|
||||
precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
|
||||
roundtrip_val = float(
|
||||
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
|
||||
)
|
||||
actual_val = Decimal(text[2:])
|
||||
|
||||
normal_errors.append(error(normal_val, actual_val))
|
||||
precise_errors.append(error(precise_val, actual_val))
|
||||
|
||||
# round-trip should match float()
|
||||
assert roundtrip_val == float(text[2:])
|
||||
|
||||
assert sum(precise_errors) <= sum(normal_errors)
|
||||
assert max(precise_errors) <= max(normal_errors)
|
||||
|
||||
|
||||
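# A minimal sketch of the same float_precision options on plain
# pandas.read_csv ("round_trip" is documented to match Python's float()):
#
#   import pandas as pd
#   from io import StringIO
#
#   text = "a\n1.2345678901234567890123456"
#   val = pd.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
#   assert val == float("1.2345678901234567890123456")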
def test_usecols_dtypes(c_parser_only, using_infer_string):
|
||||
parser = c_parser_only
|
||||
data = """\
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
10,11,12"""
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
usecols=(0, 1, 2),
|
||||
names=("a", "b", "c"),
|
||||
header=None,
|
||||
converters={"a": str},
|
||||
dtype={"b": int, "c": float},
|
||||
)
|
||||
result2 = parser.read_csv(
|
||||
StringIO(data),
|
||||
usecols=(0, 2),
|
||||
names=("a", "b", "c"),
|
||||
header=None,
|
||||
converters={"a": str},
|
||||
dtype={"b": int, "c": float},
|
||||
)
|
||||
|
||||
if using_infer_string:
|
||||
assert (result.dtypes == ["string", int, float]).all()
|
||||
assert (result2.dtypes == ["string", float]).all()
|
||||
else:
|
||||
assert (result.dtypes == [object, int, float]).all()
|
||||
assert (result2.dtypes == [object, float]).all()
|
||||
|
||||
|
||||
def test_disable_bool_parsing(c_parser_only):
|
||||
# see gh-2090
|
||||
|
||||
parser = c_parser_only
|
||||
data = """A,B,C
|
||||
Yes,No,Yes
|
||||
No,Yes,Yes
|
||||
Yes,,Yes
|
||||
No,No,No"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=object)
|
||||
assert (result.dtypes == object).all()
|
||||
|
||||
result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
|
||||
assert result["B"][2] == ""
|
||||
|
||||
|
||||
def test_custom_lineterminator(c_parser_only):
|
||||
parser = c_parser_only
|
||||
data = "a,b,c~1,2,3~4,5,6"
|
||||
|
||||
result = parser.read_csv(StringIO(data), lineterminator="~")
|
||||
expected = parser.read_csv(StringIO(data.replace("~", "\n")))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_ragged_csv(c_parser_only):
|
||||
parser = c_parser_only
|
||||
data = """1,2,3
|
||||
1,2,3,4
|
||||
1,2,3,4,5
|
||||
1,2
|
||||
1,2,3,4"""
|
||||
|
||||
nice_data = """1,2,3,,
|
||||
1,2,3,4,
|
||||
1,2,3,4,5
|
||||
1,2,,,
|
||||
1,2,3,4,"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
|
||||
)
|
||||
|
||||
expected = parser.read_csv(
|
||||
StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# too many columns, cause segfault if not careful
|
||||
data = "1,2\n3,4,5"
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None, names=range(50))
|
||||
expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
|
||||
columns=range(50)
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_tokenize_CR_with_quoting(c_parser_only):
|
||||
# see gh-3453
|
||||
parser = c_parser_only
|
||||
data = ' a,b,c\r"a,b","e,d","f,f"'
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None)
|
||||
expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
|
||||
def test_grow_boundary_at_cap(c_parser_only, count):
|
||||
# See gh-12494
|
||||
#
|
||||
# Cause of error was that the C parser
|
||||
# was not increasing the buffer size when
|
||||
# the desired space would fill the buffer
|
||||
# to capacity, which would later cause a
|
||||
# buffer overflow error when checking the
|
||||
# EOF terminator of the CSV stream.
|
||||
# 3 * 2^n commas were observed to break the parser
|
||||
parser = c_parser_only
|
||||
|
||||
with StringIO("," * count) as s:
|
||||
expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
|
||||
df = parser.read_csv(s)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||||
def test_parse_trim_buffers(c_parser_only, encoding):
|
||||
# This test is part of a bugfix for gh-13703. It attempts to
|
||||
# stress the system memory allocator, to cause it to move the
|
||||
# stream buffer and either let the OS reclaim the region, or let
|
||||
# other memory requests of parser otherwise modify the contents
|
||||
# of memory space, where it was formerly located.
|
||||
# This test is designed to cause a `segfault` with unpatched
|
||||
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
|
||||
# times it fails due to memory corruption, which causes the
|
||||
# loaded DataFrame to differ from the expected one.
|
||||
|
||||
# Also force 'utf-8' encoding, so that `_string_convert` would take
|
||||
# a different execution branch.
|
||||
|
||||
parser = c_parser_only
|
||||
|
||||
# Generate a large mixed-type CSV file on-the-fly (one record is
|
||||
# approx 1.5KiB).
|
||||
record_ = (
|
||||
"""9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
|
||||
"""ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
|
||||
"""ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
|
||||
"""99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
|
||||
"""9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
|
||||
"""99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
|
||||
"""99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
|
||||
"""ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
|
||||
"""ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
|
||||
"""ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
|
||||
"""9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
|
||||
"""999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
|
||||
""",,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
|
||||
""",9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
|
||||
"""999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
|
||||
""",9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
|
||||
"""ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
|
||||
""",999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
|
||||
""",,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
|
||||
"""9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
|
||||
""".99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
|
||||
""",,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
|
||||
"""99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
|
||||
"""ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
|
||||
"""-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
|
||||
"""ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
|
||||
""",9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
|
||||
""",99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
|
||||
""".99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
|
||||
)
|
||||
|
||||
# Set the number of lines so that a call to `parser_trim_buffers`
|
||||
# is triggered: after a couple of full chunks are consumed a
|
||||
# relatively small 'residual' chunk would cause reallocation
|
||||
# within the parser.
|
||||
chunksize, n_lines = 128, 2 * 128 + 15
|
||||
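# 2 * 128 + 15 = 271 records: two full 128-row chunks followed by a 15-row
# residual chunk, which is what triggers the buffer-trimming path described
# above.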
csv_data = "\n".join([record_] * n_lines) + "\n"
|
||||
|
||||
# We will use StringIO to load the CSV from this text buffer.
|
||||
# pd.read_csv() will iterate over the file in chunks and will
|
||||
# finally read a residual chunk of really small size.
|
||||
|
||||
# Generate the expected output: manually create the dataframe
|
||||
# by splitting by comma and repeating the `n_lines` times.
|
||||
row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
|
||||
expected = DataFrame(
|
||||
[row for _ in range(n_lines)], dtype=object, columns=None, index=None
|
||||
)
|
||||
|
||||
# Iterate over the CSV file in chunks of `chunksize` lines
|
||||
with parser.read_csv(
|
||||
StringIO(csv_data),
|
||||
header=None,
|
||||
dtype=object,
|
||||
chunksize=chunksize,
|
||||
encoding=encoding,
|
||||
) as chunks_:
|
||||
result = concat(chunks_, axis=0, ignore_index=True)
|
||||
|
||||
# Check for data corruption if there was no segfault
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_internal_null_byte(c_parser_only):
|
||||
# see gh-14012
|
||||
#
|
||||
# The null byte ('\x00') should not be used as a
|
||||
# true line terminator, escape character, or comment
|
||||
# character, only as a placeholder to indicate that
|
||||
# none was specified.
|
||||
#
|
||||
# This test should be moved to test_common.py ONLY when
|
||||
# Python's csv class supports parsing '\x00'.
|
||||
parser = c_parser_only
|
||||
|
||||
names = ["a", "b", "c"]
|
||||
data = "1,2,3\n4,\x00,6\n7,8,9"
|
||||
expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_nrows_large(c_parser_only):
|
||||
# gh-7626 - Read only nrows of data for large inputs (>262144 bytes)
|
||||
parser = c_parser_only
|
||||
header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
|
||||
data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
|
||||
header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
|
||||
data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
|
||||
test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2
|
||||
|
||||
df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)
|
||||
|
||||
assert df.size == 1010 * 10
|
||||
|
||||
|
||||
def test_float_precision_round_trip_with_text(c_parser_only):
|
||||
# see gh-15140
|
||||
parser = c_parser_only
|
||||
df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
|
||||
tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
|
||||
|
||||
|
||||
def test_large_difference_in_columns(c_parser_only):
|
||||
# see gh-14125
|
||||
parser = c_parser_only
|
||||
|
||||
count = 10000
|
||||
large_row = ("X," * count)[:-1] + "\n"
|
||||
normal_row = "XXXXXX XXXXXX,111111111111111\n"
|
||||
test_input = (large_row + normal_row * 6)[:-1]
|
||||
|
||||
result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
|
||||
rows = test_input.split("\n")
|
||||
|
||||
expected = DataFrame([row.split(",")[0] for row in rows])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_after_quote(c_parser_only):
|
||||
# see gh-15910
|
||||
parser = c_parser_only
|
||||
|
||||
data = 'a\n1\n"b"a'
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame({"a": ["1", "ba"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_whitespace_delimited(c_parser_only):
|
||||
parser = c_parser_only
|
||||
test_input = """\
|
||||
1 2
|
||||
2 2 3
|
||||
3 2 3 # 3 fields
|
||||
4 2 3# 3 fields
|
||||
5 2 # 2 fields
|
||||
6 2# 2 fields
|
||||
7 # 1 field, NaN
|
||||
8# 1 field, NaN
|
||||
9 2 3 # skipped line
|
||||
# comment"""
|
||||
with tm.assert_produces_warning(
|
||||
ParserWarning, match="Skipping line", check_stacklevel=False
|
||||
):
|
||||
df = parser.read_csv(
|
||||
StringIO(test_input),
|
||||
comment="#",
|
||||
header=None,
|
||||
delimiter="\\s+",
|
||||
skiprows=0,
|
||||
on_bad_lines="warn",
|
||||
)
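    # Rows 2, 3, 4 and 9 have three whitespace-delimited fields (one more
    # than the two columns inferred from the first row), so they are
    # skipped with a "Skipping line" warning; rows 7 and 8 have a single
    # field, leaving NaN in the second column.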
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
def test_file_like_no_next(c_parser_only):
|
||||
# gh-16530: the file-like need not have a "next" or "__next__"
|
||||
# attribute despite having an "__iter__" attribute.
|
||||
#
|
||||
# NOTE: This is only true for the C engine, not Python engine.
|
||||
class NoNextBuffer(StringIO):
|
||||
def __next__(self):
|
||||
raise AttributeError("No next method")
|
||||
|
||||
next = __next__
|
||||
|
||||
parser = c_parser_only
|
||||
data = "a\n1"
|
||||
|
||||
expected = DataFrame({"a": [1]})
|
||||
result = parser.read_csv(NoNextBuffer(data))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
|
||||
# see gh-22748
|
||||
t = BytesIO(b"\xB0")
|
||||
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
|
||||
msg = "'utf-8' codec can't encode character"
|
||||
with pytest.raises(UnicodeError, match=msg):
|
||||
c_parser_only.read_csv(t, encoding="UTF-8")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
|
||||
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
|
||||
# see gh-16530
|
||||
#
|
||||
# Unfortunately, Python's CSV library can't handle
|
||||
# tarfile objects (expects string, not bytes when
|
||||
# iterating through a file-like).
|
||||
parser = c_parser_only
|
||||
tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
|
||||
|
||||
with tarfile.open(tar_path, "r") as tar:
|
||||
data_file = tar.extractfile("tar_data.csv")
|
||||
|
||||
out = parser.read_csv(data_file)
|
||||
expected = DataFrame({"a": [1]})
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
|
||||
def test_chunk_whitespace_on_boundary(c_parser_only):
|
||||
# see gh-9735: this issue is C parser-specific (bug when
|
||||
# parsing whitespace and characters at chunk boundary)
|
||||
#
|
||||
# This test case has a field too large for the Python parser / CSV library.
|
||||
parser = c_parser_only
|
||||
|
||||
chunk1 = "a" * (1024 * 256 - 2) + "\na"
|
||||
chunk2 = "\n a"
|
||||
result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)
|
||||
|
||||
expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_file_handles_mmap(c_parser_only, csv1):
|
||||
# gh-14418
|
||||
#
|
||||
# Don't close user provided file handles.
|
||||
parser = c_parser_only
|
||||
|
||||
with open(csv1, encoding="utf-8") as f:
|
||||
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
|
||||
parser.read_csv(m)
|
||||
assert not m.closed
|
||||
|
||||
|
||||
def test_file_binary_mode(c_parser_only):
|
||||
# see gh-23779
|
||||
parser = c_parser_only
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write("1,2,3\n4,5,6")
|
||||
|
||||
with open(path, "rb") as f:
|
||||
result = parser.read_csv(f, header=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_unix_style_breaks(c_parser_only):
|
||||
# GH 11020
|
||||
parser = c_parser_only
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "w", newline="\n", encoding="utf-8") as f:
|
||||
f.write("blah\n\ncol_1,col_2,col_3\n\n")
|
||||
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
|
||||
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
|
||||
@pytest.mark.parametrize(
|
||||
"data,thousands,decimal",
|
||||
[
|
||||
(
|
||||
"""A|B|C
|
||||
1|2,334.01|5
|
||||
10|13|10.
|
||||
""",
|
||||
",",
|
||||
".",
|
||||
),
|
||||
(
|
||||
"""A|B|C
|
||||
1|2.334,01|5
|
||||
10|13|10,
|
||||
""",
|
||||
".",
|
||||
",",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_1000_sep_with_decimal(
|
||||
c_parser_only, data, thousands, decimal, float_precision
|
||||
):
|
||||
parser = c_parser_only
|
||||
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
sep="|",
|
||||
thousands=thousands,
|
||||
decimal=decimal,
|
||||
float_precision=float_precision,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_float_precision_options(c_parser_only):
|
||||
# GH 17154, 36228
|
||||
parser = c_parser_only
|
||||
s = "foo\n243.164\n"
|
||||
df = parser.read_csv(StringIO(s))
|
||||
df2 = parser.read_csv(StringIO(s), float_precision="high")
|
||||
|
||||
tm.assert_frame_equal(df, df2)
|
||||
|
||||
df3 = parser.read_csv(StringIO(s), float_precision="legacy")
|
||||
|
||||
assert not df.iloc[0, 0] == df3.iloc[0, 0]
|
||||
|
||||
msg = "Unrecognized float_precision option: junk"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(s), float_precision="junk")
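

# Illustrative sketch, not part of the original test module: the three
# float_precision modes exercised above. "high" is the default C-engine
# converter, "legacy" the older and less precise one, and "round_trip"
# uses Python's own float parsing. The helper name and sample value are
# assumptions for demonstration only.
def _sketch_float_precision():
    from io import StringIO

    import pandas as pd

    s = "foo\n243.164\n"
    high = pd.read_csv(StringIO(s), float_precision="high")
    legacy = pd.read_csv(StringIO(s), float_precision="legacy")
    round_trip = pd.read_csv(StringIO(s), float_precision="round_trip")
    # "high" matches the default result; "legacy" may differ in the last bits.
    return high, legacy, round_trip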
@ -0,0 +1,227 @@
|
||||
"""
|
||||
Tests that comments are properly handled during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_values", [None, ["NaN"]])
|
||||
def test_comment(all_parsers, na_values):
|
||||
parser = all_parsers
|
||||
data = """A,B,C
|
||||
1,2.,4.#hello world
|
||||
5.,NaN,10.0
|
||||
"""
|
||||
expected = DataFrame(
|
||||
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), comment="#", na_values=na_values)
|
||||
return
|
||||
result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
|
||||
)
|
||||
def test_line_comment(all_parsers, read_kwargs, request):
|
||||
parser = all_parsers
|
||||
data = """# empty
|
||||
A,B,C
|
||||
1,2.,4.#hello world
|
||||
#ignore this line
|
||||
5.,NaN,10.0
|
||||
"""
|
||||
warn = None
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
|
||||
if read_kwargs.get("delim_whitespace"):
|
||||
data = data.replace(",", " ")
|
||||
warn = FutureWarning
|
||||
elif read_kwargs.get("lineterminator"):
|
||||
data = data.replace("\n", read_kwargs.get("lineterminator"))
|
||||
|
||||
read_kwargs["comment"] = "#"
|
||||
if parser.engine == "pyarrow":
|
||||
if "lineterminator" in read_kwargs:
|
||||
msg = (
|
||||
"The 'lineterminator' option is not supported with the 'pyarrow' engine"
|
||||
)
|
||||
else:
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
warn, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data), **read_kwargs)
|
||||
return
|
||||
elif parser.engine == "python" and read_kwargs.get("lineterminator"):
|
||||
msg = r"Custom line terminators not supported in python parser \(yet\)"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
warn, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(StringIO(data), **read_kwargs)
|
||||
return
|
||||
|
||||
with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
|
||||
result = parser.read_csv(StringIO(data), **read_kwargs)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_skiprows(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """# empty
|
||||
random line
|
||||
# second empty line
|
||||
1,2,3
|
||||
A,B,C
|
||||
1,2.,4.
|
||||
5.,NaN,10.0
|
||||
"""
|
||||
# This should ignore the first four lines (including comments).
|
||||
expected = DataFrame(
|
||||
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), comment="#", skiprows=4)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_header(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """# empty
|
||||
# second empty line
|
||||
1,2,3
|
||||
A,B,C
|
||||
1,2.,4.
|
||||
5.,NaN,10.0
|
||||
"""
|
||||
# Header should begin at the second non-comment line.
|
||||
expected = DataFrame(
|
||||
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), comment="#", header=1)
|
||||
return
|
||||
result = parser.read_csv(StringIO(data), comment="#", header=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_skiprows_header(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """# empty
|
||||
# second empty line
|
||||
# third empty line
|
||||
X,Y,Z
|
||||
1,2,3
|
||||
A,B,C
|
||||
1,2.,4.
|
||||
5.,NaN,10.0
|
||||
"""
|
||||
    # skiprows=4 drops the first four lines (the three comments and "X,Y,Z");
    # header=1 then takes the header from the second remaining line, "A,B,C",
    # so only the two numeric rows below it become data.
|
||||
expected = DataFrame(
|
||||
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
|
||||
def test_custom_comment_char(all_parsers, comment_char):
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data.replace("#", comment_char)), comment=comment_char
|
||||
)
|
||||
return
|
||||
result = parser.read_csv(
|
||||
StringIO(data.replace("#", comment_char)), comment=comment_char
|
||||
)
|
||||
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", ["infer", None])
|
||||
def test_comment_first_line(all_parsers, header):
|
||||
# see gh-4623
|
||||
parser = all_parsers
|
||||
data = "# notes\na,b,c\n# more notes\n1,2,3"
|
||||
|
||||
if header is None:
|
||||
expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
|
||||
else:
|
||||
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), comment="#", header=header)
|
||||
return
|
||||
result = parser.read_csv(StringIO(data), comment="#", header=header)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_char_in_default_value(all_parsers, request):
|
||||
# GH#34002
|
||||
if all_parsers.engine == "c":
|
||||
reason = "see gh-34002: works on the python engine but not the c engine"
|
||||
# NA value containing comment char is interpreted as comment
|
||||
request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError))
|
||||
parser = all_parsers
|
||||
|
||||
data = (
|
||||
"# this is a comment\n"
|
||||
"col1,col2,col3,col4\n"
|
||||
"1,2,3,4#inline comment\n"
|
||||
"4,5#,6,10\n"
|
||||
"7,8,#N/A,11\n"
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
|
||||
return
|
||||
result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"col1": [1, 4, 7],
|
||||
"col2": [2, 5, 8],
|
||||
"col3": [3.0, np.nan, np.nan],
|
||||
"col4": [4.0, np.nan, 11.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,211 @@
|
||||
"""
|
||||
Tests compressed data parsing functionality for all
|
||||
of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import tarfile
|
||||
import zipfile
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def buffer(request):
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def parser_and_data(all_parsers, csv1):
|
||||
parser = all_parsers
|
||||
|
||||
with open(csv1, "rb") as f:
|
||||
data = f.read()
|
||||
expected = parser.read_csv(csv1)
|
||||
|
||||
return parser, data, expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
|
||||
def test_zip(parser_and_data, compression):
|
||||
parser, data, expected = parser_and_data
|
||||
|
||||
with tm.ensure_clean("test_file.zip") as path:
|
||||
with zipfile.ZipFile(path, mode="w") as tmp:
|
||||
tmp.writestr("test_file", data)
|
||||
|
||||
if compression == "zip2":
|
||||
with open(path, "rb") as f:
|
||||
result = parser.read_csv(f, compression="zip")
|
||||
else:
|
||||
result = parser.read_csv(path, compression=compression)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression", ["zip", "infer"])
|
||||
def test_zip_error_multiple_files(parser_and_data, compression):
|
||||
parser, data, expected = parser_and_data
|
||||
|
||||
with tm.ensure_clean("combined_zip.zip") as path:
|
||||
inner_file_names = ["test_file", "second_file"]
|
||||
|
||||
with zipfile.ZipFile(path, mode="w") as tmp:
|
||||
for file_name in inner_file_names:
|
||||
tmp.writestr(file_name, data)
|
||||
|
||||
with pytest.raises(ValueError, match="Multiple files"):
|
||||
parser.read_csv(path, compression=compression)
|
||||
|
||||
|
||||
def test_zip_error_no_files(parser_and_data):
|
||||
parser, _, _ = parser_and_data
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with zipfile.ZipFile(path, mode="w"):
|
||||
pass
|
||||
|
||||
with pytest.raises(ValueError, match="Zero files"):
|
||||
parser.read_csv(path, compression="zip")
|
||||
|
||||
|
||||
def test_zip_error_invalid_zip(parser_and_data):
|
||||
parser, _, _ = parser_and_data
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "rb") as f:
|
||||
with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"):
|
||||
parser.read_csv(f, compression="zip")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [None, "test.{ext}"])
|
||||
def test_compression(
|
||||
request,
|
||||
parser_and_data,
|
||||
compression_only,
|
||||
buffer,
|
||||
filename,
|
||||
compression_to_extension,
|
||||
):
|
||||
parser, data, expected = parser_and_data
|
||||
compress_type = compression_only
|
||||
|
||||
ext = compression_to_extension[compress_type]
|
||||
filename = filename if filename is None else filename.format(ext=ext)
|
||||
|
||||
if filename and buffer:
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(
|
||||
reason="Cannot deduce compression from buffer of compressed data."
|
||||
)
|
||||
)
|
||||
|
||||
with tm.ensure_clean(filename=filename) as path:
|
||||
tm.write_to_compressed(compress_type, path, data)
|
||||
compression = "infer" if filename else compress_type
|
||||
|
||||
if buffer:
|
||||
with open(path, "rb") as f:
|
||||
result = parser.read_csv(f, compression=compression)
|
||||
else:
|
||||
result = parser.read_csv(path, compression=compression)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
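

# Illustrative sketch, not part of the original test module: compression can
# only be inferred from a filename extension, so for an in-memory buffer the
# codec has to be named explicitly (the buffer xfail above documents this).
# The helper name is an assumption.
def _sketch_buffer_needs_explicit_compression():
    import gzip
    from io import BytesIO

    import pandas as pd

    buf = BytesIO(gzip.compress(b"a,b\n1,2\n"))
    # compression="infer" falls back to no compression for a nameless buffer,
    # so pass the codec explicitly.
    return pd.read_csv(buf, compression="gzip")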
|
||||
|
||||
@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
|
||||
def test_infer_compression(all_parsers, csv1, buffer, ext):
|
||||
# see gh-9770
|
||||
parser = all_parsers
|
||||
kwargs = {"index_col": 0, "parse_dates": True}
|
||||
|
||||
expected = parser.read_csv(csv1, **kwargs)
|
||||
kwargs["compression"] = "infer"
|
||||
|
||||
if buffer:
|
||||
with open(csv1, encoding="utf-8") as f:
|
||||
result = parser.read_csv(f, **kwargs)
|
||||
else:
|
||||
ext = "." + ext if ext else ""
|
||||
result = parser.read_csv(csv1 + ext, **kwargs)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
|
||||
# see gh-18071, gh-24130
|
||||
parser = all_parsers
|
||||
encoding = encoding_fmt.format(utf_value)
|
||||
path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")
|
||||
|
||||
result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Country": ["Venezuela", "Venezuela"],
|
||||
"Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
|
||||
def test_invalid_compression(all_parsers, invalid_compression):
|
||||
parser = all_parsers
|
||||
compress_kwargs = {"compression": invalid_compression}
|
||||
|
||||
msg = f"Unrecognized compression type: {invalid_compression}"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv("test_file.zip", **compress_kwargs)
|
||||
|
||||
|
||||
def test_compression_tar_archive(all_parsers, csv_dir_path):
|
||||
parser = all_parsers
|
||||
path = os.path.join(csv_dir_path, "tar_csv.tar.gz")
|
||||
df = parser.read_csv(path)
|
||||
assert list(df.columns) == ["a"]
|
||||
|
||||
|
||||
def test_ignore_compression_extension(all_parsers):
|
||||
parser = all_parsers
|
||||
df = DataFrame({"a": [0, 1]})
|
||||
with tm.ensure_clean("test.csv") as path_csv:
|
||||
with tm.ensure_clean("test.csv.zip") as path_zip:
|
||||
            # make sure to create an uncompressed file with a zip extension
|
||||
df.to_csv(path_csv, index=False)
|
||||
Path(path_zip).write_text(
|
||||
Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8"
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df)
|
||||
|
||||
|
||||
def test_writes_tar_gz(all_parsers):
|
||||
parser = all_parsers
|
||||
data = DataFrame(
|
||||
{
|
||||
"Country": ["Venezuela", "Venezuela"],
|
||||
"Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
|
||||
}
|
||||
)
|
||||
with tm.ensure_clean("test.tar.gz") as tar_path:
|
||||
data.to_csv(tar_path, index=False)
|
||||
|
||||
# test that read_csv infers .tar.gz to gzip:
|
||||
tm.assert_frame_equal(parser.read_csv(tar_path), data)
|
||||
|
||||
# test that file is indeed gzipped:
|
||||
with tarfile.open(tar_path, "r:gz") as tar:
|
||||
result = parser.read_csv(
|
||||
tar.extractfile(tar.getnames()[0]), compression="infer"
|
||||
)
|
||||
tm.assert_frame_equal(result, data)
|
||||
@ -0,0 +1,36 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import DtypeWarning
|
||||
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import ArrowExtensionArray
|
||||
|
||||
from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
|
||||
|
||||
|
||||
def test_concatenate_chunks_pyarrow():
|
||||
# GH#51876
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
chunks = [
|
||||
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
|
||||
{0: ArrowExtensionArray(pa.array([1, 2]))},
|
||||
]
|
||||
result = _concatenate_chunks(chunks)
|
||||
expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
|
||||
tm.assert_extension_array_equal(result[0], expected)
|
||||
|
||||
|
||||
def test_concatenate_chunks_pyarrow_strings():
|
||||
# GH#51876
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
chunks = [
|
||||
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
|
||||
{0: ArrowExtensionArray(pa.array(["a", "b"]))},
|
||||
]
|
||||
with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
|
||||
result = _concatenate_chunks(chunks)
|
||||
expected = np.concatenate(
|
||||
[np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
|
||||
)
|
||||
tm.assert_numpy_array_equal(result[0], expected)
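

# Illustrative sketch, not part of the original test module: the mixed-type
# concatenation tested above also surfaces through the public API when the
# C parser reads a large file in internal chunks (low_memory=True, the
# default) and a column parses to different dtypes in different chunks;
# pandas may then emit a DtypeWarning and fall back to object dtype. The
# helper name and row count are assumptions chosen to span several chunks.
def _sketch_low_memory_mixed_dtypes():
    from io import StringIO

    import pandas as pd

    data = "a\n" + "1\n" * 400_000 + "x\n"
    # May warn: "Columns (0) have mixed types."
    return pd.read_csv(StringIO(data), low_memory=True)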
@ -0,0 +1,263 @@
|
||||
"""
|
||||
Tests column conversion functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
from dateutil.parser import parse
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_converters_type_must_be_dict(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
"""
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), converters=0)
|
||||
return
|
||||
with pytest.raises(TypeError, match="Type converters.+"):
|
||||
parser.read_csv(StringIO(data), converters=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("column", [3, "D"])
|
||||
@pytest.mark.parametrize(
|
||||
"converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer.
|
||||
)
|
||||
def test_converters(all_parsers, column, converter):
|
||||
parser = all_parsers
|
||||
data = """A,B,C,D
|
||||
a,1,2,01/01/2009
|
||||
b,3,4,01/02/2009
|
||||
c,4,5,01/03/2009
|
||||
"""
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), converters={column: converter})
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), converters={column: converter})
|
||||
|
||||
expected = parser.read_csv(StringIO(data))
|
||||
expected["D"] = expected["D"].map(converter)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_converters_no_implicit_conv(all_parsers):
|
||||
# see gh-2184
|
||||
parser = all_parsers
|
||||
data = """000102,1.2,A\n001245,2,B"""
|
||||
|
||||
converters = {0: lambda x: x.strip()}
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=None, converters=converters)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None, converters=converters)
|
||||
|
||||
    # Column 0 should not be cast to numeric and should remain as object.
|
||||
expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_converters_euro_decimal_format(all_parsers):
|
||||
# see gh-583
|
||||
converters = {}
|
||||
parser = all_parsers
|
||||
|
||||
data = """Id;Number1;Number2;Text1;Text2;Number3
|
||||
1;1521,1541;187101,9543;ABC;poi;4,7387
|
||||
2;121,12;14897,76;DEF;uyt;0,3773
|
||||
3;878,158;108013,434;GHI;rez;2,7356"""
|
||||
converters["Number1"] = converters["Number2"] = converters[
|
||||
"Number3"
|
||||
] = lambda x: float(x.replace(",", "."))
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), sep=";", converters=converters)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=";", converters=converters)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
|
||||
[2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
|
||||
[3, 878.158, 108013.434, "GHI", "rez", 2.7356],
|
||||
],
|
||||
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
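

# Illustrative sketch, not part of the original test module: the same
# European number format can be parsed without converters by using the
# decimal keyword. The helper name is an assumption.
def _sketch_decimal_comma():
    from io import StringIO

    import pandas as pd

    data = "Id;Number1\n1;1521,1541\n2;121,12\n"
    return pd.read_csv(StringIO(data), sep=";", decimal=",")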
|
||||
|
||||
def test_converters_corner_with_nans(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """id,score,days
|
||||
1,2,12
|
||||
2,2-5,
|
||||
3,,14+
|
||||
4,6-12,2"""
|
||||
|
||||
# Example converters.
|
||||
def convert_days(x):
|
||||
x = x.strip()
|
||||
|
||||
if not x:
|
||||
return np.nan
|
||||
|
||||
is_plus = x.endswith("+")
|
||||
|
||||
if is_plus:
|
||||
x = int(x[:-1]) + 1
|
||||
else:
|
||||
x = int(x)
|
||||
|
||||
return x
|
||||
|
||||
def convert_days_sentinel(x):
|
||||
x = x.strip()
|
||||
|
||||
if not x:
|
||||
return np.nan
|
||||
|
||||
is_plus = x.endswith("+")
|
||||
|
||||
if is_plus:
|
||||
x = int(x[:-1]) + 1
|
||||
else:
|
||||
x = int(x)
|
||||
|
||||
return x
|
||||
|
||||
def convert_score(x):
|
||||
x = x.strip()
|
||||
|
||||
if not x:
|
||||
return np.nan
|
||||
|
||||
if x.find("-") > 0:
|
||||
val_min, val_max = map(int, x.split("-"))
|
||||
val = 0.5 * (val_min + val_max)
|
||||
else:
|
||||
val = float(x)
|
||||
|
||||
return val
|
||||
|
||||
results = []
|
||||
|
||||
for day_converter in [convert_days, convert_days_sentinel]:
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
converters={"score": convert_score, "days": day_converter},
|
||||
na_values=["", None],
|
||||
)
|
||||
continue
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
converters={"score": convert_score, "days": day_converter},
|
||||
na_values=["", None],
|
||||
)
|
||||
assert pd.isna(result["days"][1])
|
||||
results.append(result)
|
||||
|
||||
if parser.engine != "pyarrow":
|
||||
tm.assert_frame_equal(results[0], results[1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
|
||||
def test_converter_index_col_bug(all_parsers, conv_f):
|
||||
# see gh-1835 , GH#40589
|
||||
parser = all_parsers
|
||||
data = "A;B\n1;2\n3;4"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
|
||||
)
|
||||
return
|
||||
|
||||
rs = parser.read_csv(
|
||||
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
|
||||
)
|
||||
|
||||
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
|
||||
def test_converter_identity_object(all_parsers):
|
||||
# GH#40589
|
||||
parser = all_parsers
|
||||
data = "A,B\n1,2\n3,4"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), converters={"A": lambda x: x})
|
||||
return
|
||||
|
||||
rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})
|
||||
|
||||
xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
|
||||
def test_converter_multi_index(all_parsers):
|
||||
# GH 42446
|
||||
parser = all_parsers
|
||||
data = "A,B,B\nX,Y,Z\n1,2,3"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
header=list(range(2)),
|
||||
converters={
|
||||
("A", "X"): np.int32,
|
||||
("B", "Y"): np.int32,
|
||||
("B", "Z"): np.float32,
|
||||
},
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
header=list(range(2)),
|
||||
converters={
|
||||
("A", "X"): np.int32,
|
||||
("B", "Y"): np.int32,
|
||||
("B", "Z"): np.float32,
|
||||
},
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
("A", "X"): np.int32([1]),
|
||||
("B", "Y"): np.int32([2]),
|
||||
("B", "Z"): np.float32([3]),
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,195 @@
|
||||
"""
|
||||
Tests that dialects are properly handled during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.errors import ParserWarning
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def custom_dialect():
|
||||
dialect_name = "weird"
|
||||
dialect_kwargs = {
|
||||
"doublequote": False,
|
||||
"escapechar": "~",
|
||||
"delimiter": ":",
|
||||
"skipinitialspace": False,
|
||||
"quotechar": "`",
|
||||
"quoting": 3,
|
||||
}
|
||||
return dialect_name, dialect_kwargs
|
||||
|
||||
|
||||
def test_dialect(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
label1,label2,label3
|
||||
index1,"a,c,e
|
||||
index2,b,d,f
|
||||
"""
|
||||
|
||||
dia = csv.excel()
|
||||
dia.quoting = csv.QUOTE_NONE
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dialect=dia)
|
||||
return
|
||||
|
||||
df = parser.read_csv(StringIO(data), dialect=dia)
|
||||
|
||||
data = """\
|
||||
label1,label2,label3
|
||||
index1,a,c,e
|
||||
index2,b,d,f
|
||||
"""
|
||||
exp = parser.read_csv(StringIO(data))
|
||||
exp.replace("a", '"a', inplace=True)
|
||||
tm.assert_frame_equal(df, exp)
|
||||
|
||||
|
||||
def test_dialect_str(all_parsers):
|
||||
dialect_name = "mydialect"
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
fruit:vegetable
|
||||
apple:broccoli
|
||||
pear:tomato
|
||||
"""
|
||||
exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})
|
||||
|
||||
with tm.with_csv_dialect(dialect_name, delimiter=":"):
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dialect=dialect_name)
|
||||
return
|
||||
|
||||
df = parser.read_csv(StringIO(data), dialect=dialect_name)
|
||||
tm.assert_frame_equal(df, exp)
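

# Illustrative sketch, not part of the original test module: outside the
# tm.with_csv_dialect helper, the same effect comes from registering a
# dialect with the standard-library csv module and passing its name to
# read_csv. The dialect name is an assumption.
def _sketch_registered_dialect():
    import csv
    from io import StringIO

    import pandas as pd

    csv.register_dialect("colon_sep", delimiter=":")
    try:
        return pd.read_csv(
            StringIO("fruit:vegetable\napple:broccoli\n"), dialect="colon_sep"
        )
    finally:
        csv.unregister_dialect("colon_sep")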
|
||||
|
||||
def test_invalid_dialect(all_parsers):
|
||||
class InvalidDialect:
|
||||
pass
|
||||
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
msg = "Invalid dialect"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dialect=InvalidDialect)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arg",
|
||||
[None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
|
||||
)
|
||||
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
|
||||
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
|
||||
# see gh-23761.
|
||||
dialect_name, dialect_kwargs = custom_dialect
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
data = "a:b\n1:2"
|
||||
|
||||
warning_klass = None
|
||||
kwds = {}
|
||||
|
||||
# arg=None tests when we pass in the dialect without any other arguments.
|
||||
if arg is not None:
|
||||
if value == "dialect": # No conflict --> no warning.
|
||||
kwds[arg] = dialect_kwargs[arg]
|
||||
elif value == "default": # Default --> no warning.
|
||||
from pandas.io.parsers.base_parser import parser_defaults
|
||||
|
||||
kwds[arg] = parser_defaults[arg]
|
||||
else: # Non-default + conflict with dialect --> warning.
|
||||
warning_klass = ParserWarning
|
||||
kwds[arg] = "blah"
|
||||
|
||||
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv_check_warnings(
|
||||
# No warning bc we raise
|
||||
None,
|
||||
"Conflicting values for",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwds,
|
||||
)
|
||||
return
|
||||
result = parser.read_csv_check_warnings(
|
||||
warning_klass,
|
||||
"Conflicting values for",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwds,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,warning_klass",
|
||||
[
|
||||
({"sep": ","}, None), # sep is default --> sep_override=True
|
||||
({"sep": "."}, ParserWarning), # sep isn't default --> sep_override=False
|
||||
({"delimiter": ":"}, None), # No conflict
|
||||
({"delimiter": None}, None), # Default arguments --> sep_override=True
|
||||
({"delimiter": ","}, ParserWarning), # Conflict
|
||||
({"delimiter": "."}, ParserWarning), # Conflict
|
||||
],
|
||||
ids=[
|
||||
"sep-override-true",
|
||||
"sep-override-false",
|
||||
"delimiter-no-conflict",
|
||||
"delimiter-default-arg",
|
||||
"delimiter-conflict",
|
||||
"delimiter-conflict2",
|
||||
],
|
||||
)
|
||||
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
|
||||
# see gh-23761.
|
||||
dialect_name, dialect_kwargs = custom_dialect
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
data = "a:b\n1:2"
|
||||
|
||||
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv_check_warnings(
|
||||
# no warning bc we raise
|
||||
None,
|
||||
"Conflicting values for 'delimiter'",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwargs,
|
||||
)
|
||||
return
|
||||
result = parser.read_csv_check_warnings(
|
||||
warning_klass,
|
||||
"Conflicting values for 'delimiter'",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwargs,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,337 @@
|
||||
"""
|
||||
Tests encoding functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import (
|
||||
BytesIO,
|
||||
TextIOWrapper,
|
||||
)
|
||||
import os
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
read_csv,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
def test_bytes_io_input(all_parsers):
|
||||
encoding = "cp1255"
|
||||
parser = all_parsers
|
||||
|
||||
data = BytesIO("שלום:1234\n562:123".encode(encoding))
|
||||
result = parser.read_csv(data, sep=":", encoding=encoding)
|
||||
|
||||
expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_read_csv_unicode(all_parsers):
|
||||
parser = all_parsers
|
||||
data = BytesIO("\u0141aski, Jan;1".encode())
|
||||
|
||||
result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
|
||||
expected = DataFrame([["\u0141aski, Jan", 1]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
|
||||
@pytest.mark.parametrize("sep", [",", "\t"])
|
||||
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
|
||||
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
|
||||
# see gh-2298
|
||||
parser = all_parsers
|
||||
data = """skip this
|
||||
skip this too
|
||||
A,B,C
|
||||
1,2,3
|
||||
4,5,6""".replace(
|
||||
",", sep
|
||||
)
|
||||
path = f"__{uuid.uuid4()}__.csv"
|
||||
kwargs = {"sep": sep, "skiprows": 2}
|
||||
utf8 = "utf-8"
|
||||
|
||||
with tm.ensure_clean(path) as path:
|
||||
bytes_data = data.encode(encoding)
|
||||
|
||||
with open(path, "wb") as f:
|
||||
f.write(bytes_data)
|
||||
|
||||
with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
|
||||
result = parser.read_csv(path, encoding=encoding, **kwargs)
|
||||
expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_utf16_example(all_parsers, csv_dir_path):
|
||||
path = os.path.join(csv_dir_path, "utf16_ex.txt")
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(path, encoding="utf-16", sep="\t")
|
||||
assert len(result) == 50
|
||||
|
||||
|
||||
def test_unicode_encoding(all_parsers, csv_dir_path):
|
||||
path = os.path.join(csv_dir_path, "unicode_series.csv")
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(path, header=None, encoding="latin-1")
|
||||
result = result.set_index(0)
|
||||
got = result[1][1632]
|
||||
|
||||
expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
|
||||
assert got == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
# Basic test
|
||||
("a\n1", {}, DataFrame({"a": [1]})),
|
||||
# "Regular" quoting
|
||||
('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),
|
||||
# Test in a data row instead of header
|
||||
("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),
|
||||
# Test in empty data row with skipping
|
||||
("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
|
||||
# Test in empty data row without skipping
|
||||
(
|
||||
"\n1",
|
||||
{"names": ["a"], "skip_blank_lines": False},
|
||||
DataFrame({"a": [np.nan, 1]}),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
|
||||
# see gh-4793
|
||||
parser = all_parsers
|
||||
bom = "\ufeff"
|
||||
utf8 = "utf-8"
|
||||
|
||||
def _encode_data_with_bom(_data):
|
||||
bom_data = (bom + _data).encode(utf8)
|
||||
return BytesIO(bom_data)
|
||||
|
||||
if (
|
||||
parser.engine == "pyarrow"
|
||||
and data == "\n1"
|
||||
and kwargs.get("skip_blank_lines", True)
|
||||
):
|
||||
# CSV parse error: Empty CSV file or block: cannot infer number of columns
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
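

# Illustrative sketch, not part of the original test module: instead of
# relying on BOM handling under plain "utf-8", the "utf-8-sig" codec strips
# a leading BOM explicitly. The helper name is an assumption.
def _sketch_utf8_sig():
    from io import BytesIO

    import pandas as pd

    raw = "\ufeffa\n1".encode("utf-8")
    # The BOM is removed, so the only column is "a".
    return pd.read_csv(BytesIO(raw), encoding="utf-8-sig")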
|
||||
|
||||
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
|
||||
# see gh-13549
|
||||
expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
|
||||
parser = all_parsers
|
||||
|
||||
encoding = encoding_fmt.format(utf_value)
|
||||
data = "mb_num,multibyte\n4.8,test".encode(encoding)
|
||||
|
||||
result = parser.read_csv(BytesIO(data), encoding=encoding)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"file_path,encoding",
|
||||
[
|
||||
(("io", "data", "csv", "test1.csv"), "utf-8"),
|
||||
(("io", "parser", "data", "unicode_series.csv"), "latin-1"),
|
||||
(("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"),
|
||||
],
|
||||
)
|
||||
def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
|
||||
# gh-23779: Python csv engine shouldn't error on files opened in binary.
|
||||
# gh-31575: Python csv engine shouldn't error on files opened in raw binary.
|
||||
parser = all_parsers
|
||||
|
||||
fpath = datapath(*file_path)
|
||||
expected = parser.read_csv(fpath, encoding=encoding)
|
||||
|
||||
with open(fpath, encoding=encoding) as fa:
|
||||
result = parser.read_csv(fa)
|
||||
assert not fa.closed
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
with open(fpath, mode="rb") as fb:
|
||||
result = parser.read_csv(fb, encoding=encoding)
|
||||
assert not fb.closed
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
with open(fpath, mode="rb", buffering=0) as fb:
|
||||
result = parser.read_csv(fb, encoding=encoding)
|
||||
assert not fb.closed
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pass_encoding", [True, False])
|
||||
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
|
||||
# see gh-24130
|
||||
parser = all_parsers
|
||||
encoding = encoding_fmt.format(utf_value)
|
||||
|
||||
if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]:
|
||||
# FIXME: this is bad!
|
||||
pytest.skip("These cases freeze")
|
||||
|
||||
expected = DataFrame({"foo": ["bar"]})
|
||||
|
||||
with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
|
||||
f.write("foo\nbar")
|
||||
f.seek(0)
|
||||
|
||||
result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_encoding_named_temp_file(all_parsers):
|
||||
# see gh-31819
|
||||
parser = all_parsers
|
||||
encoding = "shift-jis"
|
||||
|
||||
title = "てすと"
|
||||
data = "こむ"
|
||||
|
||||
expected = DataFrame({title: [data]})
|
||||
|
||||
with tempfile.NamedTemporaryFile() as f:
|
||||
f.write(f"{title}\n{data}".encode(encoding))
|
||||
|
||||
f.seek(0)
|
||||
|
||||
result = parser.read_csv(f, encoding=encoding)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert not f.closed
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
|
||||
)
|
||||
def test_parse_encoded_special_characters(encoding):
|
||||
# GH16218 Verify parsing of data with encoded special characters
|
||||
# Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
|
||||
data = "a\tb\n:foo\t0\nbar\t1\nbaz\t2" # noqa: RUF001
|
||||
encoded_data = BytesIO(data.encode(encoding))
|
||||
result = read_csv(encoded_data, delimiter="\t", encoding=encoding)
|
||||
|
||||
expected = DataFrame(
|
||||
data=[[":foo", 0], ["bar", 1], ["baz", 2]], # noqa: RUF001
|
||||
columns=["a", "b"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
|
||||
def test_encoding_memory_map(all_parsers, encoding):
|
||||
# GH40986
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
{
|
||||
"name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
|
||||
"mask": ["red", "purple", "orange", "blue"],
|
||||
"weapon": ["sai", "bo staff", "nunchunk", "katana"],
|
||||
}
|
||||
)
|
||||
with tm.ensure_clean() as file:
|
||||
expected.to_csv(file, index=False, encoding=encoding)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(file, encoding=encoding, memory_map=True)
|
||||
return
|
||||
|
||||
df = parser.read_csv(file, encoding=encoding, memory_map=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
def test_chunk_splits_multibyte_char(all_parsers):
|
||||
"""
|
||||
Chunk splits a multibyte character with memory_map=True
|
||||
|
||||
GH 43540
|
||||
"""
|
||||
parser = all_parsers
|
||||
# DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
|
||||
df = DataFrame(data=["a" * 127] * 2048)
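    # Each row written by to_csv is 127 "a" characters plus a newline, i.e.
    # 128 bytes, so 2048 rows fill exactly one DEFAULT_CHUNKSIZE
    # (2048 * 128 == 262144); appending the two-byte "ą" to the last row
    # pushes its second byte just past that chunk boundary.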
|
||||
# Put two-bytes utf-8 encoded character "ą" at the end of chunk
|
||||
# utf-8 encoding of "ą" is b'\xc4\x85'
|
||||
df.iloc[2047] = "a" * 127 + "ą"
|
||||
with tm.ensure_clean("bug-gh43540.csv") as fname:
|
||||
df.to_csv(fname, index=False, header=False, encoding="utf-8")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(fname, header=None, memory_map=True)
|
||||
return
|
||||
|
||||
dfr = parser.read_csv(fname, header=None, memory_map=True)
|
||||
tm.assert_frame_equal(dfr, df)
|
||||
|
||||
|
||||
def test_readcsv_memmap_utf8(all_parsers):
|
||||
"""
|
||||
GH 43787
|
||||
|
||||
Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
|
||||
"""
|
||||
lines = []
|
||||
line_length = 128
|
||||
start_char = " "
|
||||
end_char = "\U00010080"
|
||||
# This for loop creates a list of 128-char strings
|
||||
# consisting of consecutive Unicode chars
|
||||
for lnum in range(ord(start_char), ord(end_char), line_length):
|
||||
line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
|
||||
try:
|
||||
line.encode("utf-8")
|
||||
except UnicodeEncodeError:
|
||||
continue
|
||||
lines.append(line)
|
||||
parser = all_parsers
|
||||
df = DataFrame(lines)
|
||||
with tm.ensure_clean("utf8test.csv") as fname:
|
||||
df.to_csv(fname, index=False, header=False, encoding="utf-8")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
|
||||
return
|
||||
|
||||
dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
|
||||
tm.assert_frame_equal(df, dfr)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("pyarrow_xfail")
|
||||
@pytest.mark.parametrize("mode", ["w+b", "w+t"])
|
||||
def test_not_readable(all_parsers, mode):
|
||||
# GH43439
|
||||
parser = all_parsers
|
||||
content = b"abcd"
|
||||
if "t" in mode:
|
||||
content = "abcd"
|
||||
with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
|
||||
handle.write(content)
|
||||
handle.seek(0)
|
||||
df = parser.read_csv(handle)
|
||||
expected = DataFrame([], columns=["abcd"])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
@ -0,0 +1,733 @@
|
||||
"""
|
||||
Tests that the file header is properly handled or inferred
|
||||
during parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from collections import namedtuple
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_read_with_bad_header(all_parsers):
|
||||
parser = all_parsers
|
||||
msg = r"but only \d+ lines in file"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
s = StringIO(",,")
|
||||
parser.read_csv(s, header=[10])
|
||||
|
||||
|
||||
def test_negative_header(all_parsers):
|
||||
# see gh-27779
|
||||
parser = all_parsers
|
||||
data = """1,2,3,4,5
|
||||
6,7,8,9,10
|
||||
11,12,13,14,15
|
||||
"""
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="Passing negative integer to header is invalid. "
|
||||
"For no header, use header=None instead",
|
||||
):
|
||||
parser.read_csv(StringIO(data), header=-1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
|
||||
def test_negative_multi_index_header(all_parsers, header):
|
||||
# see gh-27779
|
||||
parser = all_parsers
|
||||
data = """1,2,3,4,5
|
||||
6,7,8,9,10
|
||||
11,12,13,14,15
|
||||
"""
|
||||
with pytest.raises(
|
||||
ValueError, match="cannot specify multi-index header with negative integers"
|
||||
):
|
||||
parser.read_csv(StringIO(data), header=header)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [True, False])
|
||||
def test_bool_header_arg(all_parsers, header):
|
||||
# see gh-6114
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
MyColumn
|
||||
a
|
||||
b
|
||||
a
|
||||
b"""
|
||||
msg = "Passing a bool to header is invalid"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=header)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame are different
|
||||
def test_header_with_index_col(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """foo,1,2,3
|
||||
bar,4,5,6
|
||||
baz,7,8,9
|
||||
"""
|
||||
names = ["A", "B", "C"]
|
||||
result = parser.read_csv(StringIO(data), names=names)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["foo", "bar", "baz"],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_header_not_first_line(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """got,to,ignore,this,line
|
||||
got,to,ignore,this,line
|
||||
index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
"""
|
||||
data2 = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=2, index_col=0)
|
||||
expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_multi_index(all_parsers):
|
||||
parser = all_parsers
|
||||
|
||||
data = """\
|
||||
C0,,C_l0_g0,C_l0_g1,C_l0_g2
|
||||
|
||||
C1,,C_l1_g0,C_l1_g1,C_l1_g2
|
||||
C2,,C_l2_g0,C_l2_g1,C_l2_g2
|
||||
C3,,C_l3_g0,C_l3_g1,C_l3_g2
|
||||
R0,R1,,,
|
||||
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
|
||||
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
|
||||
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
|
||||
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
|
||||
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
|
||||
data_gen_f = lambda r, c: f"R{r}C{c}"
|
||||
|
||||
data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)]
|
||||
index = MultiIndex.from_arrays(
|
||||
[[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]],
|
||||
names=["R0", "R1"],
|
||||
)
|
||||
columns = MultiIndex.from_arrays(
|
||||
[
|
||||
[f"C_l0_g{i}" for i in range(3)],
|
||||
[f"C_l1_g{i}" for i in range(3)],
|
||||
[f"C_l2_g{i}" for i in range(3)],
|
||||
[f"C_l3_g{i}" for i in range(3)],
|
||||
],
|
||||
names=["C0", "C1", "C2", "C3"],
|
||||
)
|
||||
expected = DataFrame(data, columns=columns, index=index)
|
||||
tm.assert_frame_equal(result, expected)
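

# Illustrative sketch, not part of the original test module: a smaller
# version of the multi-level header layout above, read with header=[0, 1]
# and a single index column. The helper name and data are assumptions.
def _sketch_two_level_header():
    from io import StringIO

    import pandas as pd

    data = ",a,a,b\n,x,y,z\nr0,1,2,3\n"
    df = pd.read_csv(StringIO(data), header=[0, 1], index_col=0)
    # df.columns is a MultiIndex: [("a", "x"), ("a", "y"), ("b", "z")]
    return df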
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,msg",
|
||||
[
|
||||
(
|
||||
{"index_col": ["foo", "bar"]},
|
||||
(
|
||||
"index_col must only contain "
|
||||
"row numbers when specifying "
|
||||
"a multi-index header"
|
||||
),
|
||||
),
|
||||
(
|
||||
{"index_col": [0, 1], "names": ["foo", "bar"]},
|
||||
("cannot specify names when specifying a multi-index header"),
|
||||
),
|
||||
(
|
||||
{"index_col": [0, 1], "usecols": ["foo", "bar"]},
|
||||
("cannot specify usecols when specifying a multi-index header"),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
|
||||
data = """\
|
||||
C0,,C_l0_g0,C_l0_g1,C_l0_g2
|
||||
|
||||
C1,,C_l1_g0,C_l1_g1,C_l1_g2
|
||||
C2,,C_l2_g0,C_l2_g1,C_l2_g2
|
||||
C3,,C_l3_g0,C_l3_g1,C_l3_g2
|
||||
R0,R1,,,
|
||||
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
|
||||
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
|
||||
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
|
||||
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
|
||||
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
|
||||
"""
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
|
||||
|
||||
|
||||
_TestTuple = namedtuple("_TestTuple", ["first", "second"])
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"header": [0, 1]},
|
||||
{
|
||||
"skiprows": 3,
|
||||
"names": [
|
||||
("a", "q"),
|
||||
("a", "r"),
|
||||
("a", "s"),
|
||||
("b", "t"),
|
||||
("c", "u"),
|
||||
("c", "v"),
|
||||
],
|
||||
},
|
||||
{
|
||||
"skiprows": 3,
|
||||
"names": [
|
||||
_TestTuple("a", "q"),
|
||||
_TestTuple("a", "r"),
|
||||
_TestTuple("a", "s"),
|
||||
_TestTuple("b", "t"),
|
||||
_TestTuple("c", "u"),
|
||||
_TestTuple("c", "v"),
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
def test_header_multi_index_common_format1(all_parsers, kwargs):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
|
||||
index=["one", "two"],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
|
||||
),
|
||||
)
|
||||
data = """,a,a,a,b,c,c
|
||||
,q,r,s,t,u,v
|
||||
,,,,,,
|
||||
one,1,2,3,4,5,6
|
||||
two,7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"header": [0, 1]},
|
||||
{
|
||||
"skiprows": 2,
|
||||
"names": [
|
||||
("a", "q"),
|
||||
("a", "r"),
|
||||
("a", "s"),
|
||||
("b", "t"),
|
||||
("c", "u"),
|
||||
("c", "v"),
|
||||
],
|
||||
},
|
||||
{
|
||||
"skiprows": 2,
|
||||
"names": [
|
||||
_TestTuple("a", "q"),
|
||||
_TestTuple("a", "r"),
|
||||
_TestTuple("a", "s"),
|
||||
_TestTuple("b", "t"),
|
||||
_TestTuple("c", "u"),
|
||||
_TestTuple("c", "v"),
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
def test_header_multi_index_common_format2(all_parsers, kwargs):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
|
||||
index=["one", "two"],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
|
||||
),
|
||||
)
|
||||
data = """,a,a,a,b,c,c
|
||||
,q,r,s,t,u,v
|
||||
one,1,2,3,4,5,6
|
||||
two,7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs",
|
||||
[
|
||||
{"header": [0, 1]},
|
||||
{
|
||||
"skiprows": 2,
|
||||
"names": [
|
||||
("a", "q"),
|
||||
("a", "r"),
|
||||
("a", "s"),
|
||||
("b", "t"),
|
||||
("c", "u"),
|
||||
("c", "v"),
|
||||
],
|
||||
},
|
||||
{
|
||||
"skiprows": 2,
|
||||
"names": [
|
||||
_TestTuple("a", "q"),
|
||||
_TestTuple("a", "r"),
|
||||
_TestTuple("a", "s"),
|
||||
_TestTuple("b", "t"),
|
||||
_TestTuple("c", "u"),
|
||||
_TestTuple("c", "v"),
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
def test_header_multi_index_common_format3(all_parsers, kwargs):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
|
||||
index=["one", "two"],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
|
||||
),
|
||||
)
|
||||
expected = expected.reset_index(drop=True)
|
||||
data = """a,a,a,b,c,c
|
||||
q,r,s,t,u,v
|
||||
1,2,3,4,5,6
|
||||
7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_multi_index_common_format_malformed1(all_parsers):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
|
||||
index=Index([1, 7]),
|
||||
columns=MultiIndex(
|
||||
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
|
||||
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
|
||||
names=["a", "q"],
|
||||
),
|
||||
)
|
||||
data = """a,a,a,b,c,c
|
||||
q,r,s,t,u,v
|
||||
1,2,3,4,5,6
|
||||
7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_multi_index_common_format_malformed2(all_parsers):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
|
||||
index=Index([1, 7]),
|
||||
columns=MultiIndex(
|
||||
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
|
||||
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
|
||||
names=[None, "q"],
|
||||
),
|
||||
)
|
||||
|
||||
data = """,a,a,b,c,c
|
||||
q,r,s,t,u,v
|
||||
1,2,3,4,5,6
|
||||
7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_multi_index_common_format_malformed3(all_parsers):
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
|
||||
index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
|
||||
columns=MultiIndex(
|
||||
levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
|
||||
codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
|
||||
names=[None, "q"],
|
||||
),
|
||||
)
|
||||
data = """,a,a,b,c,c
|
||||
q,r,s,t,u,v
|
||||
1,2,3,4,5,6
|
||||
7,8,9,10,11,12"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
|
||||
tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index_blank_line(all_parsers):
    # GH 40442
    parser = all_parsers
    data = [[None, None], [1, 2], [3, 4]]
    columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
    expected = DataFrame(data, columns=columns)
    data = "a,b\nA,B\n,\n1,2\n3,4"
    result = parser.read_csv(StringIO(data), header=[0, 1])
    tm.assert_frame_equal(expected, result)

@pytest.mark.parametrize(
|
||||
"data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
|
||||
)
|
||||
def test_header_names_backward_compat(all_parsers, data, header, request):
|
||||
# see gh-2539
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and header is not None:
|
||||
mark = pytest.mark.xfail(reason="DataFrame.columns are different")
|
||||
request.applymarker(mark)
|
||||
|
||||
expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow  # CSV parse error: Empty CSV file or block: cannot infer
@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
def test_read_only_header_no_rows(all_parsers, kwargs):
    # See gh-7773
    parser = all_parsers
    expected = DataFrame(columns=["a", "b", "c"])

    result = parser.read_csv(StringIO("a,b,c"), **kwargs)
    tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
|
||||
"kwargs,names",
|
||||
[
|
||||
({}, [0, 1, 2, 3, 4]),
|
||||
(
|
||||
{"names": ["foo", "bar", "baz", "quux", "panda"]},
|
||||
["foo", "bar", "baz", "quux", "panda"],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_no_header(all_parsers, kwargs, names):
|
||||
parser = all_parsers
|
||||
data = """1,2,3,4,5
|
||||
6,7,8,9,10
|
||||
11,12,13,14,15
|
||||
"""
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
|
||||
)
|
||||
result = parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
|
||||
def test_non_int_header(all_parsers, header):
|
||||
# see gh-16338
|
||||
msg = "header must be integer or list of integers"
|
||||
data = """1,2\n3,4"""
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=header)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_singleton_header(all_parsers):
|
||||
# see gh-7757
|
||||
data = """a,b,c\n0,1,2\n1,2,3"""
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
|
||||
result = parser.read_csv(StringIO(data), header=[0])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected",
|
||||
[
|
||||
(
|
||||
"A,A,A,B\none,one,one,two\n0,40,34,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
|
||||
),
|
||||
),
|
||||
),
|
||||
(
|
||||
"A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
|
||||
),
|
||||
),
|
||||
),
|
||||
(
|
||||
"A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[
|
||||
("A", "one"),
|
||||
("A", "one.1"),
|
||||
("A", "one.1.1"),
|
||||
("B", "two"),
|
||||
("B", "two.1"),
|
||||
]
|
||||
),
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_mangles_multi_index(all_parsers, data, expected):
|
||||
# see gh-18062
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow  # TypeError: an integer is required
|
||||
@pytest.mark.parametrize("index_col", [None, [0]])
|
||||
@pytest.mark.parametrize(
|
||||
"columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
|
||||
)
|
||||
def test_multi_index_unnamed(all_parsers, index_col, columns):
|
||||
# see gh-23687
|
||||
#
|
||||
# When specifying a multi-index header, make sure that
|
||||
# we don't error just because one of the rows in our header
|
||||
# has ALL column names containing the string "Unnamed". The
|
||||
# correct condition to check is whether the row contains
|
||||
# ALL columns that did not have names (and instead were given
|
||||
# placeholder ones).
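# For example, a blank name in the first header row comes back as a
# placeholder like "Unnamed: 0_level_0" rather than raising.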
|
||||
parser = all_parsers
|
||||
header = [0, 1]
|
||||
|
||||
if index_col is None:
|
||||
data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
|
||||
else:
|
||||
data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
|
||||
exp_columns = []
|
||||
|
||||
if columns is None:
|
||||
columns = ["", "", ""]
|
||||
|
||||
for i, col in enumerate(columns):
|
||||
if not col: # Unnamed.
|
||||
col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
|
||||
|
||||
exp_columns.append(col)
|
||||
|
||||
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
|
||||
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 2 columns, got 3
|
||||
def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
|
||||
# GH#38453
|
||||
parser = all_parsers
|
||||
data = """a, b
|
||||
1,2,3
|
||||
5,6,4
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"])
|
||||
expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_read_csv_multiindex_columns(all_parsers):
|
||||
# GH#6051
|
||||
parser = all_parsers
|
||||
|
||||
s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81"
|
||||
s2 = (
|
||||
"Male, Male, Male, Female, Female\n"
|
||||
"R, R, L, R, R\n"
|
||||
".86, .67, .88, .78, .81\n"
|
||||
".86, .67, .88, .78, .82"
|
||||
)
|
||||
|
||||
mi = MultiIndex.from_tuples(
|
||||
[
|
||||
("Male", "R"),
|
||||
(" Male", " R"),
|
||||
(" Male", " L"),
|
||||
(" Female", " R"),
|
||||
(" Female", " R.1"),
|
||||
]
|
||||
)
|
||||
expected = DataFrame(
|
||||
[[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi
|
||||
)
|
||||
|
||||
df1 = parser.read_csv(StringIO(s1), header=[0, 1])
|
||||
tm.assert_frame_equal(df1, expected.iloc[:1])
|
||||
df2 = parser.read_csv(StringIO(s2), header=[0, 1])
|
||||
tm.assert_frame_equal(df2, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_read_csv_multi_header_length_check(all_parsers):
|
||||
# GH#43102
|
||||
parser = all_parsers
|
||||
|
||||
case = """row11,row12,row13
|
||||
row21,row22, row23
|
||||
row31,row32
|
||||
"""
|
||||
|
||||
with pytest.raises(
|
||||
ParserError, match="Header rows must have an equal number of columns."
|
||||
):
|
||||
parser.read_csv(StringIO(case), header=[0, 2])
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 3 columns, got 2
|
||||
def test_header_none_and_implicit_index(all_parsers):
|
||||
# GH#22144
|
||||
parser = all_parsers
|
||||
data = "x,1,5\ny,2\nz,3\n"
|
||||
result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got "
|
||||
def test_header_none_and_implicit_index_in_second_row(all_parsers):
|
||||
# GH#22144
|
||||
parser = all_parsers
|
||||
data = "x,1\ny,2,5\nz,3\n"
|
||||
with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
|
||||
parser.read_csv(StringIO(data), names=["a", "b"], header=None)
|
||||
|
||||
|
||||
def test_header_none_and_on_bad_lines_skip(all_parsers):
|
||||
# GH#22144
|
||||
parser = all_parsers
|
||||
data = "x,1\ny,2,5\nz,3\n"
|
||||
result = parser.read_csv(
|
||||
StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
|
||||
)
|
||||
expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow  # TypeError: an integer is required
|
||||
def test_header_missing_rows(all_parsers):
|
||||
# GH#47400
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1,2
|
||||
"""
|
||||
msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=[0, 1, 2])
|
||||
|
||||
|
||||
# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
|
||||
@xfail_pyarrow
|
||||
def test_header_multiple_whitespaces(all_parsers):
|
||||
# GH#54931
|
||||
parser = all_parsers
|
||||
data = """aa bb(1,1) cc(1,1)
|
||||
0 2 3.5"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
|
||||
@xfail_pyarrow
|
||||
def test_header_delim_whitespace(all_parsers):
|
||||
# GH#54918
|
||||
parser = all_parsers
|
||||
data = """a,b
|
||||
1,2
|
||||
3,4
|
||||
"""
|
||||
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(StringIO(data), delim_whitespace=True)
|
||||
expected = DataFrame({"a,b": ["1,2", "3,4"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_no_header_pyarrow(pyarrow_parser_only):
|
||||
parser = pyarrow_parser_only
|
||||
data = """
|
||||
a,i,x
|
||||
b,j,y
|
||||
"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
usecols=[0, 1],
|
||||
dtype="string[pyarrow]",
|
||||
dtype_backend="pyarrow",
|
||||
engine="pyarrow",
|
||||
)
|
||||
expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,376 @@
"""
Tests that the specified index column (a.k.a "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")

@pytest.mark.parametrize("with_header", [True, False])
|
||||
def test_index_col_named(all_parsers, with_header):
|
||||
parser = all_parsers
|
||||
no_header = """\
|
||||
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
|
||||
header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
|
||||
|
||||
if with_header:
|
||||
data = header + no_header
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col="ID")
|
||||
expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
data = no_header
|
||||
msg = "Index ID invalid"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), index_col="ID")
|
||||
|
||||
|
||||
def test_index_col_named2(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
1,2,3,4,hello
|
||||
5,6,7,8,world
|
||||
9,10,11,12,foo
|
||||
"""
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
|
||||
index=Index(["hello", "world", "foo"], name="message"),
|
||||
)
|
||||
names = ["a", "b", "c", "d", "message"]
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_col_is_true(all_parsers):
|
||||
# see gh-9798
|
||||
data = "a,b\n1,2"
|
||||
parser = all_parsers
|
||||
|
||||
msg = "The value of index_col couldn't be 'True'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), index_col=True)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
|
||||
def test_infer_index_col(all_parsers):
|
||||
data = """A,B,C
|
||||
foo,1,2,3
|
||||
bar,4,5,6
|
||||
baz,7,8,9
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["foo", "bar", "baz"],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
@pytest.mark.parametrize(
|
||||
"index_col,kwargs",
|
||||
[
|
||||
(None, {"columns": ["x", "y", "z"]}),
|
||||
(False, {"columns": ["x", "y", "z"]}),
|
||||
(0, {"columns": ["y", "z"], "index": Index([], name="x")}),
|
||||
(1, {"columns": ["x", "z"], "index": Index([], name="y")}),
|
||||
("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
|
||||
("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
|
||||
(
|
||||
[0, 1],
|
||||
{
|
||||
"columns": ["z"],
|
||||
"index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
|
||||
},
|
||||
),
|
||||
(
|
||||
["x", "y"],
|
||||
{
|
||||
"columns": ["z"],
|
||||
"index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
|
||||
},
|
||||
),
|
||||
(
|
||||
[1, 0],
|
||||
{
|
||||
"columns": ["z"],
|
||||
"index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
|
||||
},
|
||||
),
|
||||
(
|
||||
["y", "x"],
|
||||
{
|
||||
"columns": ["z"],
|
||||
"index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_index_col_empty_data(all_parsers, index_col, kwargs):
|
||||
data = "x,y,z"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=index_col)
|
||||
|
||||
expected = DataFrame(**kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_empty_with_index_col_false(all_parsers):
|
||||
# see gh-10413
|
||||
data = "x,y"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=False)
|
||||
|
||||
expected = DataFrame(columns=["x", "y"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_names",
|
||||
[
|
||||
["", ""],
|
||||
["foo", ""],
|
||||
["", "bar"],
|
||||
["foo", "bar"],
|
||||
["NotReallyUnnamed", "Unnamed: 0"],
|
||||
],
|
||||
)
|
||||
def test_multi_index_naming(all_parsers, index_names, request):
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and "" in index_names:
|
||||
mark = pytest.mark.xfail(reason="One case raises, others are wrong")
|
||||
request.applymarker(mark)
|
||||
|
||||
# We don't want empty index names being replaced with "Unnamed: 0"
|
||||
data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
|
||||
result = parser.read_csv(StringIO(data), index_col=[0, 1])
|
||||
|
||||
expected = DataFrame(
|
||||
{"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
|
||||
)
|
||||
expected.index.names = [name if name else None for name in index_names]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: Found non-unique column index
|
||||
def test_multi_index_naming_not_all_at_beginning(all_parsers):
|
||||
parser = all_parsers
|
||||
data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
|
||||
result = parser.read_csv(StringIO(data), index_col=[0, 2])
|
||||
|
||||
expected = DataFrame(
|
||||
{"Unnamed: 2": ["c", "d", "c", "d"]},
|
||||
index=MultiIndex(
|
||||
levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: Found non-unique column index
|
||||
def test_no_multi_index_level_names_empty(all_parsers):
|
||||
# GH 10984
|
||||
parser = all_parsers
|
||||
midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
|
||||
expected = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((3, 3)),
|
||||
index=midx,
|
||||
columns=["x", "y", "z"],
|
||||
)
|
||||
with tm.ensure_clean() as path:
|
||||
expected.to_csv(path)
|
||||
result = parser.read_csv(path, index_col=[0, 1, 2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_header_with_index_col(all_parsers):
|
||||
# GH 33476
|
||||
parser = all_parsers
|
||||
data = """
|
||||
I11,A,A
|
||||
I12,B,B
|
||||
I2,1,3
|
||||
"""
|
||||
midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
|
||||
idx = Index(["I2"])
|
||||
expected = DataFrame([[1, 3]], index=idx, columns=midx)
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
col_idx = Index(["A", "A.1"])
|
||||
idx = Index(["I12", "I2"], name="I11")
|
||||
expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col="I11", header=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_index_col_large_csv(all_parsers, monkeypatch):
|
||||
# https://github.com/pandas-dev/pandas/issues/37094
|
||||
parser = all_parsers
|
||||
|
||||
ARR_LEN = 100
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": range(ARR_LEN + 1),
|
||||
"b": np.random.default_rng(2).standard_normal(ARR_LEN + 1),
|
||||
}
|
||||
)
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
df.to_csv(path, index=False)
|
||||
with monkeypatch.context() as m:
|
||||
m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN)
|
||||
result = parser.read_csv(path, index_col=[0])
|
||||
|
||||
tm.assert_frame_equal(result, df.set_index("a"))
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_index_col_multiindex_columns_no_data(all_parsers):
|
||||
# GH#38292
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(
|
||||
StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0
|
||||
)
|
||||
expected = DataFrame(
|
||||
[],
|
||||
index=Index([]),
|
||||
columns=MultiIndex.from_arrays(
|
||||
[["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_index_col_header_no_data(all_parsers):
|
||||
# GH#38292
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0)
|
||||
expected = DataFrame(
|
||||
[],
|
||||
columns=["a1", "a2"],
|
||||
index=Index([], name="a0"),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_multiindex_columns_no_data(all_parsers):
|
||||
# GH#38292
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1])
|
||||
expected = DataFrame(
|
||||
[], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]])
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_multiindex_columns_index_col_with_data(all_parsers):
|
||||
# GH#38292
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(
|
||||
StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0
|
||||
)
|
||||
expected = DataFrame(
|
||||
[["data", "data"]],
|
||||
columns=MultiIndex.from_arrays(
|
||||
[["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
|
||||
),
|
||||
index=Index(["data"]),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Empty CSV file or block
|
||||
def test_infer_types_boolean_sum(all_parsers):
|
||||
# GH#44079
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(
|
||||
StringIO("0,1"),
|
||||
names=["a", "b"],
|
||||
index_col=["a"],
|
||||
dtype={"a": "UInt8"},
|
||||
)
|
||||
expected = DataFrame(
|
||||
data={
|
||||
"a": [
|
||||
0,
|
||||
],
|
||||
"b": [1],
|
||||
}
|
||||
).set_index("a")
|
||||
# Not checking index type now, because the C parser will return an
|
||||
# index column of dtype 'object', and the Python parser will return an
|
||||
# index column of dtype 'int64'.
|
||||
tm.assert_frame_equal(result, expected, check_index_type=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
|
||||
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
|
||||
# GH#9435
|
||||
data = "a,b\n01,2"
|
||||
parser = all_parsers
|
||||
if dtype == object and parser.engine == "pyarrow":
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
|
||||
)
|
||||
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
|
||||
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
def test_multiindex_columns_not_leading_index_col(all_parsers):
|
||||
# GH#38549
|
||||
parser = all_parsers
|
||||
data = """a,b,c,d
|
||||
e,f,g,h
|
||||
x,y,1,2
|
||||
"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
header=[0, 1],
|
||||
index_col=1,
|
||||
)
|
||||
cols = MultiIndex.from_tuples(
|
||||
[("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"]
|
||||
)
|
||||
expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,182 @@
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""
from io import StringIO

import pytest

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

@xfail_pyarrow  # ValueError: Found non-unique column index
def test_basic(all_parsers):
    parser = all_parsers

    data = "a,a,b,b,b\n1,2,3,4,5"
    result = parser.read_csv(StringIO(data), sep=",")

    expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_basic_names(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "a,b,a\n0,1,2\n3,4,5"
    expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_basic_names_raise(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "0,1,2\n3,4,5"
    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=["a", "b", "a"])

@xfail_pyarrow # ValueError: Found non-unique column index
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected",
|
||||
[
|
||||
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
|
||||
(
|
||||
"a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
|
||||
DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6]],
|
||||
columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
|
||||
),
|
||||
),
|
||||
(
|
||||
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
|
||||
DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6, 7]],
|
||||
columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_thorough_mangle_columns(all_parsers, data, expected):
|
||||
# see gh-17060
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,names,expected",
|
||||
[
|
||||
(
|
||||
"a,b,b\n1,2,3",
|
||||
["a.1", "a.1", "a.1.1"],
|
||||
DataFrame(
|
||||
[["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
|
||||
),
|
||||
),
|
||||
(
|
||||
"a,b,c,d,e,f\n1,2,3,4,5,6",
|
||||
["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
|
||||
DataFrame(
|
||||
[["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
|
||||
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
|
||||
),
|
||||
),
|
||||
(
|
||||
"a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
|
||||
["a", "a", "a.3", "a.1", "a.2", "a", "a"],
|
||||
DataFrame(
|
||||
[
|
||||
["a", "b", "c", "d", "e", "f", "g"],
|
||||
["1", "2", "3", "4", "5", "6", "7"],
|
||||
],
|
||||
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_thorough_mangle_names(all_parsers, data, names, expected):
|
||||
# see gh-17095
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match="Duplicate names"):
|
||||
parser.read_csv(StringIO(data), names=names)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame.columns are different
|
||||
def test_mangled_unnamed_placeholders(all_parsers):
|
||||
# xref gh-13017
|
||||
orig_key = "0"
|
||||
parser = all_parsers
|
||||
|
||||
orig_value = [1, 2, 3]
|
||||
df = DataFrame({orig_key: orig_value})
|
||||
|
||||
# This test recursively updates `df`.
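# Each to_csv/read_csv round trip writes the previous index back out as an
# extra unnamed column, so round i expects the mangled placeholders
# "Unnamed: 0", "Unnamed: 0.1", ... ahead of the original column.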
|
||||
for i in range(3):
|
||||
expected = DataFrame(columns=Index([], dtype="str"))
|
||||
|
||||
for j in range(i + 1):
|
||||
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
|
||||
expected.insert(loc=0, column=col_name, value=[0, 1, 2])
|
||||
|
||||
expected[orig_key] = orig_value
|
||||
df = parser.read_csv(StringIO(df.to_csv()))
|
||||
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: Found non-unique column index
|
||||
def test_mangle_dupe_cols_already_exists(all_parsers):
|
||||
# GH#14704
|
||||
parser = all_parsers
|
||||
|
||||
data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6, 7]],
|
||||
columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: Found non-unique column index
|
||||
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
|
||||
# GH#14704
|
||||
parser = all_parsers
|
||||
|
||||
data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4]],
|
||||
columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")])
|
||||
def test_mangle_cols_names(all_parsers, usecol, engine):
|
||||
# GH 11823
|
||||
parser = all_parsers
|
||||
data = "1,2,3"
|
||||
names = ["A", "A", "B"]
|
||||
with pytest.raises(ValueError, match="Duplicate names"):
|
||||
parser.read_csv(StringIO(data), names=names, usecols=usecol, engine=engine)
|
||||
@ -0,0 +1,157 @@
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.util.version import Version

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

# We'll probably always skip these for pyarrow
# Maybe we'll add our own tests for pyarrow too
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.slow,
]

@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
|
||||
def test_multi_thread_string_io_read_csv(all_parsers, request):
|
||||
# see gh-11786
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
if Version(pa.__version__) < Version("16.0"):
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(reason="# ValueError: Found non-unique column index")
|
||||
)
|
||||
max_row_range = 100
|
||||
num_files = 10
|
||||
|
||||
bytes_to_df = (
|
||||
"\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
|
||||
for _ in range(num_files)
|
||||
)
|
||||
|
||||
# Read all files in many threads.
|
||||
with ExitStack() as stack:
|
||||
files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df]
|
||||
|
||||
pool = stack.enter_context(ThreadPool(8))
|
||||
|
||||
results = pool.map(parser.read_csv, files)
|
||||
first_result = results[0]
|
||||
|
||||
for result in results:
|
||||
tm.assert_frame_equal(first_result, result)
|
||||
|
||||
|
||||
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
|
||||
"""
|
||||
Generate a DataFrame via multi-thread.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
parser : BaseParser
|
||||
The parser object to use for reading the data.
|
||||
path : str
|
||||
The location of the CSV file to read.
|
||||
num_rows : int
|
||||
The number of rows to read per task.
|
||||
num_tasks : int
|
||||
The number of tasks to use for reading this DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
|
||||
def reader(arg):
|
||||
"""
|
||||
Create a reader for part of the CSV.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arg : tuple
|
||||
A tuple of the following:
|
||||
|
||||
* start : int
|
||||
The starting row to start for parsing CSV
|
||||
* nrows : int
|
||||
The number of rows to read.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
start, nrows = arg
|
||||
|
||||
if not start:
|
||||
return parser.read_csv(
|
||||
path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
|
||||
)
|
||||
|
||||
return parser.read_csv(
|
||||
path,
|
||||
index_col=0,
|
||||
header=None,
|
||||
skiprows=int(start) + 1,
|
||||
nrows=nrows,
|
||||
parse_dates=[9],
|
||||
)
|
||||
|
||||
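# Split the read into num_tasks contiguous (start_row, nrows) chunks,
# one chunk per worker thread.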
tasks = [
|
||||
(num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
|
||||
]
|
||||
|
||||
with ThreadPool(processes=num_tasks) as pool:
|
||||
results = pool.map(reader, tasks)
|
||||
|
||||
header = results[0].columns
|
||||
|
||||
for r in results[1:]:
|
||||
r.columns = header
|
||||
|
||||
final_dataframe = pd.concat(results)
|
||||
return final_dataframe
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: The 'nrows' option is not supported
|
||||
def test_multi_thread_path_multipart_read_csv(all_parsers):
|
||||
# see gh-11786
|
||||
num_tasks = 4
|
||||
num_rows = 48
|
||||
|
||||
parser = all_parsers
|
||||
file_name = "__thread_pool_reader__.csv"
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": np.random.default_rng(2).random(num_rows),
|
||||
"b": np.random.default_rng(2).random(num_rows),
|
||||
"c": np.random.default_rng(2).random(num_rows),
|
||||
"d": np.random.default_rng(2).random(num_rows),
|
||||
"e": np.random.default_rng(2).random(num_rows),
|
||||
"foo": ["foo"] * num_rows,
|
||||
"bar": ["bar"] * num_rows,
|
||||
"baz": ["baz"] * num_rows,
|
||||
"date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"),
|
||||
"int": np.arange(num_rows, dtype="int64"),
|
||||
}
|
||||
)
|
||||
|
||||
with tm.ensure_clean(file_name) as path:
|
||||
df.to_csv(path)
|
||||
|
||||
final_dataframe = _generate_multi_thread_dataframe(
|
||||
parser, path, num_rows, num_tasks
|
||||
)
|
||||
tm.assert_frame_equal(df, final_dataframe)
|
||||
@ -0,0 +1,780 @@
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas._libs.parsers import STR_NA_VALUES

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")

def test_string_nas(all_parsers):
    parser = all_parsers
    data = """A,B,C
a,b,c
d,,f
,g,h
"""
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
        columns=["A", "B", "C"],
    )
    if parser.engine == "pyarrow":
        expected.loc[2, "A"] = None
        expected.loc[1, "B"] = None
    tm.assert_frame_equal(result, expected)

def test_detect_string_na(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A,B
|
||||
foo,bar
|
||||
NA,baz
|
||||
NaN,nan
|
||||
"""
|
||||
expected = DataFrame(
|
||||
[["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
expected.loc[[1, 2], "A"] = None
|
||||
expected.loc[2, "B"] = None
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_values",
|
||||
[
|
||||
["-999.0", "-999"],
|
||||
[-999, -999.0],
|
||||
[-999.0, -999],
|
||||
["-999.0"],
|
||||
["-999"],
|
||||
[-999.0],
|
||||
[-999],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
"""A,B
|
||||
-999,1.2
|
||||
2,-999
|
||||
3,4.5
|
||||
""",
|
||||
"""A,B
|
||||
-999,1.200
|
||||
2,-999.000
|
||||
3,4.500
|
||||
""",
|
||||
],
|
||||
)
|
||||
def test_non_string_na_values(all_parsers, data, na_values, request):
|
||||
# see gh-3611: with an odd float format, we can't match
|
||||
# the string "999.0" exactly but still need float matching
|
||||
parser = all_parsers
|
||||
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])
|
||||
|
||||
if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values):
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values=na_values)
|
||||
return
|
||||
elif parser.engine == "pyarrow" and "-999.000" in data:
|
||||
# bc the pyarrow engine does not include the float-ified version
|
||||
# of "-999" -> -999, it does not match the entry with the trailing
|
||||
# zeros, so "-999.000" is not treated as null.
|
||||
mark = pytest.mark.xfail(
|
||||
reason="pyarrow engined does not recognize equivalent floats"
|
||||
)
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_values=na_values)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_default_na_values(all_parsers):
|
||||
_NA_VALUES = {
|
||||
"-1.#IND",
|
||||
"1.#QNAN",
|
||||
"1.#IND",
|
||||
"-1.#QNAN",
|
||||
"#N/A",
|
||||
"N/A",
|
||||
"n/a",
|
||||
"NA",
|
||||
"<NA>",
|
||||
"#NA",
|
||||
"NULL",
|
||||
"null",
|
||||
"NaN",
|
||||
"nan",
|
||||
"-NaN",
|
||||
"-nan",
|
||||
"#N/A N/A",
|
||||
"",
|
||||
"None",
|
||||
}
|
||||
assert _NA_VALUES == STR_NA_VALUES
|
||||
|
||||
parser = all_parsers
|
||||
nv = len(_NA_VALUES)
|
||||
|
||||
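# Build CSV row i: the NA token v goes in field i and the remaining
# nv - 1 fields are left empty, so the tokens fall on the diagonal.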
def f(i, v):
|
||||
if i == 0:
|
||||
buf = ""
|
||||
elif i > 0:
|
||||
buf = "".join([","] * i)
|
||||
|
||||
buf = f"{buf}{v}"
|
||||
|
||||
if i < nv - 1:
|
||||
joined = "".join([","] * (nv - i - 1))
|
||||
buf = f"{buf}{joined}"
|
||||
|
||||
return buf
|
||||
|
||||
data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
|
||||
expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
|
||||
|
||||
result = parser.read_csv(data, header=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
|
||||
def test_custom_na_values(all_parsers, na_values):
|
||||
parser = all_parsers
|
||||
data = """A,B,C
|
||||
ignore,this,row
|
||||
1,NA,3
|
||||
-1.#IND,5,baz
|
||||
7,8,NaN
|
||||
"""
|
||||
expected = DataFrame(
|
||||
[[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "skiprows argument must be an integer when using engine='pyarrow'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_bool_na_values(all_parsers):
|
||||
data = """A,B,C
|
||||
True,False,True
|
||||
NA,True,False
|
||||
False,NA,True"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": np.array([True, np.nan, False], dtype=object),
|
||||
"B": np.array([False, True, np.nan], dtype=object),
|
||||
"C": [True, False, True],
|
||||
}
|
||||
)
|
||||
if parser.engine == "pyarrow":
|
||||
expected.loc[1, "A"] = None
|
||||
expected.loc[2, "B"] = None
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_na_value_dict(all_parsers):
|
||||
data = """A,B,C
|
||||
foo,bar,NA
|
||||
bar,foo,foo
|
||||
foo,bar,NA
|
||||
bar,foo,foo"""
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
|
||||
return
|
||||
|
||||
df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [np.nan, "bar", np.nan, "bar"],
|
||||
"B": [np.nan, "foo", np.nan, "foo"],
|
||||
"C": [np.nan, "foo", np.nan, "foo"],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_col,expected",
|
||||
[
|
||||
(
|
||||
[0],
|
||||
DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
|
||||
),
|
||||
(
|
||||
[0, 2],
|
||||
DataFrame(
|
||||
{"b": [np.nan], "d": [5]},
|
||||
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
|
||||
),
|
||||
),
|
||||
(
|
||||
["a", "c"],
|
||||
DataFrame(
|
||||
{"b": [np.nan], "d": [5]},
|
||||
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
|
||||
data = """\
|
||||
a,b,c,d
|
||||
0,NA,1,5
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,expected",
|
||||
[
|
||||
(
|
||||
{},
|
||||
DataFrame(
|
||||
{
|
||||
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
{"na_values": {"A": [], "C": []}, "keep_default_na": False},
|
||||
DataFrame(
|
||||
{
|
||||
"A": ["a", "b", "", "d", "e", "nan", "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", "nan", "five", "", "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
{"na_values": ["a"], "keep_default_na": False},
|
||||
DataFrame(
|
||||
{
|
||||
"A": [np.nan, "b", "", "d", "e", "nan", "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", "nan", "five", "", "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
{"na_values": {"A": [], "C": []}},
|
||||
DataFrame(
|
||||
{
|
||||
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
|
||||
}
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_na_values_keep_default(
|
||||
all_parsers, kwargs, expected, request, using_infer_string
|
||||
):
|
||||
data = """\
|
||||
A,B,C
|
||||
a,1,one
|
||||
b,2,two
|
||||
,3,three
|
||||
d,4,nan
|
||||
e,5,five
|
||||
nan,6,
|
||||
g,7,seven
|
||||
"""
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
if "na_values" in kwargs and isinstance(kwargs["na_values"], dict):
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
return
|
||||
if not using_infer_string or "na_values" in kwargs:
|
||||
mark = pytest.mark.xfail()
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_na_values_no_keep_default(all_parsers):
|
||||
# see gh-4318: passing na_values=None and
|
||||
# keep_default_na=False yields 'None' as a na_value
|
||||
data = """\
|
||||
A,B,C
|
||||
a,1,None
|
||||
b,2,two
|
||||
,3,None
|
||||
d,4,nan
|
||||
e,5,five
|
||||
nan,6,
|
||||
g,7,seven
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), keep_default_na=False)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": ["a", "b", "", "d", "e", "nan", "g"],
|
||||
"B": [1, 2, 3, 4, 5, 6, 7],
|
||||
"C": ["None", "two", "None", "nan", "five", "", "seven"],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_keep_default_na_dict_na_values(all_parsers):
|
||||
# see gh-19227
|
||||
data = "a,b\n,2"
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
|
||||
)
|
||||
expected = DataFrame({"a": [""], "b": [np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
|
||||
# see gh-19227
|
||||
#
|
||||
# Scalar values shouldn't cause the parsing to crash or fail.
|
||||
data = "a,b\n1,2"
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
|
||||
return
|
||||
|
||||
df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
|
||||
expected = DataFrame({"a": [1], "b": [np.nan]})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
|
||||
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
|
||||
# see gh-19227
|
||||
data = """\
|
||||
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
|
||||
729639,"qwer","",asdfkj,466.681,,252.373
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: [np.nan, 729639.0],
|
||||
1: [np.nan, "qwer"],
|
||||
2: ["/blaha", np.nan],
|
||||
3: ["kjsdkj", "asdfkj"],
|
||||
4: [412.166, 466.681],
|
||||
5: ["225.874", ""],
|
||||
6: [np.nan, 252.373],
|
||||
}
|
||||
)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
keep_default_na=False,
|
||||
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
keep_default_na=False,
|
||||
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_filter,row_data",
|
||||
[
|
||||
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
|
||||
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
|
||||
],
|
||||
)
|
||||
def test_na_values_na_filter_override(
|
||||
request, all_parsers, na_filter, row_data, using_infer_string
|
||||
):
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
# mismatched dtypes in both cases, FutureWarning in the True case
|
||||
if not (using_infer_string and na_filter):
|
||||
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
|
||||
request.applymarker(mark)
|
||||
data = """\
|
||||
A,B
|
||||
1,A
|
||||
nan,B
|
||||
3,C
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
|
||||
|
||||
expected = DataFrame(row_data, columns=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 8 columns, got 5:
|
||||
def test_na_trailing_columns(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
|
||||
2012-03-14,USD,AAPL,BUY,1000
|
||||
2012-05-12,USD,SBUX,SELL,500"""
|
||||
|
||||
# Trailing columns should be all NaN.
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[
|
||||
["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
|
||||
["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
|
||||
],
|
||||
columns=[
|
||||
"Date",
|
||||
"Currency",
|
||||
"Symbol",
|
||||
"Type",
|
||||
"Units",
|
||||
"UnitPrice",
|
||||
"Cost",
|
||||
"Tax",
|
||||
],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_values,row_data",
|
||||
[
|
||||
(1, [[np.nan, 2.0], [2.0, np.nan]]),
|
||||
({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
|
||||
],
|
||||
)
|
||||
def test_na_values_scalar(all_parsers, na_values, row_data):
|
||||
# see gh-12224
|
||||
parser = all_parsers
|
||||
names = ["a", "b"]
|
||||
data = "1,2\n2,1"
|
||||
|
||||
if parser.engine == "pyarrow" and isinstance(na_values, dict):
|
||||
if isinstance(na_values, dict):
|
||||
err = ValueError
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
else:
|
||||
err = TypeError
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(err, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
elif parser.engine == "pyarrow":
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
expected = DataFrame(row_data, columns=names)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_na_values_dict_aliasing(all_parsers):
|
||||
parser = all_parsers
|
||||
na_values = {"a": 2, "b": 1}
|
||||
na_values_copy = na_values.copy()
|
||||
|
||||
names = ["a", "b"]
|
||||
data = "1,2\n2,1"
|
||||
|
||||
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_dict_equal(na_values, na_values_copy)
|
||||
|
||||
|
||||
def test_na_values_dict_col_index(all_parsers):
|
||||
# see gh-14203
|
||||
data = "a\nfoo\n1"
|
||||
parser = all_parsers
|
||||
na_values = {0: "foo"}
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_values=na_values)
|
||||
expected = DataFrame({"a": [np.nan, 1]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
(
|
||||
str(2**63) + "\n" + str(2**63 + 1),
|
||||
{"na_values": [2**63]},
|
||||
DataFrame([str(2**63), str(2**63 + 1)]),
|
||||
),
|
||||
(str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
|
||||
(str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
|
||||
],
|
||||
)
|
||||
def test_na_values_uint64(all_parsers, data, kwargs, expected, request):
|
||||
# see gh-14983
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and "na_values" in kwargs:
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
return
|
||||
elif parser.engine == "pyarrow":
|
||||
mark = pytest.mark.xfail(reason="Returns float64 instead of object")
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_na_values_no_default_with_index(all_parsers):
|
||||
# see gh-15835
|
||||
data = "a,1\nb,2"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
|
||||
)
|
||||
def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request):
|
||||
# see gh-5239
|
||||
#
|
||||
# Don't parse NA-values in index unless na_filter=True
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,,3\n4,5,6"
|
||||
|
||||
if parser.engine == "pyarrow" and na_filter is False:
|
||||
mark = pytest.mark.xfail(reason="mismatched index result")
|
||||
request.applymarker(mark)
|
||||
|
||||
expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
|
||||
result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_inf_na_values_with_int_index(all_parsers):
|
||||
# see gh-17128
|
||||
parser = all_parsers
|
||||
data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
|
||||
|
||||
# Don't fail with OverflowError with inf's and integer index column.
|
||||
out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
|
||||
expected = DataFrame(
|
||||
{"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
|
||||
)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # mismatched shape
|
||||
@pytest.mark.parametrize("na_filter", [True, False])
|
||||
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
|
||||
# see gh-20377
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,,3\n4,5,6"
|
||||
|
||||
# na_filter=True --> missing value becomes NaN.
|
||||
# na_filter=False --> missing value remains empty string.
|
||||
empty = np.nan if na_filter else ""
|
||||
expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # mismatched exception message
|
||||
@pytest.mark.parametrize(
|
||||
"data, na_values",
|
||||
[
|
||||
("false,1\n,1\ntrue", None),
|
||||
("false,1\nnull,1\ntrue", None),
|
||||
("false,1\nnan,1\ntrue", None),
|
||||
("false,1\nfoo,1\ntrue", "foo"),
|
||||
("false,1\nfoo,1\ntrue", ["foo"]),
|
||||
("false,1\nfoo,1\ntrue", {"a": "foo"}),
|
||||
],
|
||||
)
|
||||
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
|
||||
parser = all_parsers
|
||||
msg = "|".join(
|
||||
[
|
||||
"Bool column has NA values in column [0a]",
|
||||
"cannot safely convert passed user dtype of "
|
||||
"bool for object dtyped data in column 0",
|
||||
]
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
names=["a", "b"],
|
||||
dtype={"a": "bool"},
|
||||
na_values=na_values,
|
||||
)
|
||||
|
||||
|
||||
# TODO: this test isn't about the na_values keyword, it is about the empty entries
|
||||
# being returned with NaN entries, whereas the pyarrow engine returns "nan"
|
||||
@xfail_pyarrow # mismatched shapes
|
||||
def test_str_nan_dropped(all_parsers):
|
||||
# see gh-21131
|
||||
parser = all_parsers
|
||||
|
||||
data = """File: small.csv,,
|
||||
10010010233,0123,654
|
||||
foo,,bar
|
||||
01001000155,4530,898"""
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
names=["col1", "col2", "col3"],
|
||||
dtype={"col1": str, "col2": str, "col3": str},
|
||||
).dropna()
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"col1": ["10010010233", "01001000155"],
|
||||
"col2": ["0123", "4530"],
|
||||
"col3": ["654", "898"],
|
||||
},
|
||||
index=[1, 3],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nan_multi_index(all_parsers):
|
||||
# GH 42446
|
||||
parser = all_parsers
|
||||
data = "A,B,B\nX,Y,Z\n1,2,inf"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
|
||||
)
|
||||
return
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
("A", "X"): [1],
|
||||
("B", "Y"): [2],
|
||||
("B", "Z"): [np.nan],
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # Failed: DID NOT RAISE <class 'ValueError'>; it casts the NaN to False
|
||||
def test_bool_and_nan_to_bool(all_parsers):
|
||||
# GH#42808
|
||||
parser = all_parsers
|
||||
data = """0
|
||||
NaN
|
||||
True
|
||||
False
|
||||
"""
|
||||
with pytest.raises(ValueError, match="NA values"):
|
||||
parser.read_csv(StringIO(data), dtype="bool")
|
||||
|
||||
|
||||
def test_bool_and_nan_to_int(all_parsers):
|
||||
# GH#42808
|
||||
parser = all_parsers
|
||||
data = """0
|
||||
NaN
|
||||
True
|
||||
False
|
||||
"""
|
||||
with pytest.raises(ValueError, match="convert|NoneType"):
|
||||
parser.read_csv(StringIO(data), dtype="int")
|
||||
|
||||
|
||||
def test_bool_and_nan_to_float(all_parsers):
|
||||
# GH#42808
|
||||
parser = all_parsers
|
||||
data = """0
|
||||
NaN
|
||||
True
|
||||
False
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), dtype="float")
|
||||
expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,327 @@
|
||||
"""
|
||||
Tests parsers' ability to read and parse non-local files,
|
||||
which therefore require a network connection to be read.
|
||||
"""
|
||||
from io import BytesIO
|
||||
import logging
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.feather_format import read_feather
|
||||
from pandas.io.parsers import read_csv
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.network
|
||||
@pytest.mark.single_cpu
|
||||
@pytest.mark.parametrize("mode", ["explicit", "infer"])
|
||||
@pytest.mark.parametrize("engine", ["python", "c"])
|
||||
def test_compressed_urls(
|
||||
httpserver,
|
||||
datapath,
|
||||
salaries_table,
|
||||
mode,
|
||||
engine,
|
||||
compression_only,
|
||||
compression_to_extension,
|
||||
):
|
||||
# test reading compressed urls with various engines and
|
||||
# extension inference
|
||||
if compression_only == "tar":
|
||||
pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data")
|
||||
|
||||
extension = compression_to_extension[compression_only]
|
||||
with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f:
|
||||
httpserver.serve_content(content=f.read())
|
||||
|
||||
url = httpserver.url + "/salaries.csv" + extension
|
||||
|
||||
if mode != "explicit":
|
||||
compression_only = mode
|
||||
|
||||
url_table = read_csv(url, sep="\t", compression=compression_only, engine=engine)
|
||||
tm.assert_frame_equal(url_table, salaries_table)
|
||||
|
||||
|
||||
@pytest.mark.network
|
||||
@pytest.mark.single_cpu
|
||||
def test_url_encoding_csv(httpserver, datapath):
|
||||
"""
|
||||
read_csv should honor the requested encoding for URLs.
|
||||
|
||||
GH 10424
|
||||
"""
|
||||
with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f:
|
||||
httpserver.serve_content(content=f.read())
|
||||
df = read_csv(httpserver.url, encoding="latin-1", header=None)
|
||||
assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tips_df(datapath):
|
||||
"""DataFrame with the tips dataset."""
|
||||
return read_csv(datapath("io", "data", "csv", "tips.csv"))
|
||||
|
||||
|
||||
@pytest.mark.single_cpu
|
||||
@pytest.mark.usefixtures("s3_resource")
|
||||
@td.skip_if_not_us_locale()
|
||||
class TestS3:
|
||||
def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
|
||||
# more of an integration test due to the not-public contents portion
|
||||
# can probably mock this though.
|
||||
pytest.importorskip("s3fs")
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
|
||||
compression=comp,
|
||||
storage_options=s3so,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so):
|
||||
# Read public file from bucket with not-public contents
|
||||
pytest.importorskip("s3fs")
|
||||
df = read_csv(
|
||||
f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
|
||||
# Read from AWS s3 as "s3n" URL
|
||||
df = read_csv(
|
||||
f"s3n://{s3_public_bucket_with_data.name}/tips.csv",
|
||||
nrows=10,
|
||||
storage_options=s3so,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
|
||||
# Read from AWS s3 as "s3a" URL
|
||||
df = read_csv(
|
||||
f"s3a://{s3_public_bucket_with_data.name}/tips.csv",
|
||||
nrows=10,
|
||||
storage_options=s3so,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3_bucket_nrows(
|
||||
self, s3_public_bucket_with_data, tips_df, s3so
|
||||
):
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
|
||||
nrows=10,
|
||||
compression=comp,
|
||||
storage_options=s3so,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3_bucket_chunked(
|
||||
self, s3_public_bucket_with_data, tips_df, s3so
|
||||
):
|
||||
# Read with a chunksize
|
||||
chunksize = 5
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
with read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
|
||||
chunksize=chunksize,
|
||||
compression=comp,
|
||||
storage_options=s3so,
|
||||
) as df_reader:
|
||||
assert df_reader.chunksize == chunksize
|
||||
for i_chunk in [0, 1, 2]:
|
||||
# Read a couple of chunks and make sure we see them
|
||||
# properly.
|
||||
df = df_reader.get_chunk()
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
true_df = tips_df.iloc[
|
||||
chunksize * i_chunk : chunksize * (i_chunk + 1)
|
||||
]
|
||||
tm.assert_frame_equal(true_df, df)
|
||||
|
||||
def test_parse_public_s3_bucket_chunked_python(
|
||||
self, s3_public_bucket_with_data, tips_df, s3so
|
||||
):
|
||||
# Read with a chunksize using the Python parser
|
||||
chunksize = 5
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
with read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
|
||||
chunksize=chunksize,
|
||||
compression=comp,
|
||||
engine="python",
|
||||
storage_options=s3so,
|
||||
) as df_reader:
|
||||
assert df_reader.chunksize == chunksize
|
||||
for i_chunk in [0, 1, 2]:
|
||||
# Read a couple of chunks and make sure we see them properly.
|
||||
df = df_reader.get_chunk()
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
true_df = tips_df.iloc[
|
||||
chunksize * i_chunk : chunksize * (i_chunk + 1)
|
||||
]
|
||||
tm.assert_frame_equal(true_df, df)
|
||||
|
||||
def test_parse_public_s3_bucket_python(
|
||||
self, s3_public_bucket_with_data, tips_df, s3so
|
||||
):
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
|
||||
engine="python",
|
||||
compression=comp,
|
||||
storage_options=s3so,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so):
|
||||
for ext in ["", ".gz", ".bz2"]:
|
||||
df = read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
|
||||
engine="python",
|
||||
compression="infer",
|
||||
storage_options=s3so,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_parse_public_s3_bucket_nrows_python(
|
||||
self, s3_public_bucket_with_data, tips_df, s3so
|
||||
):
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
|
||||
engine="python",
|
||||
nrows=10,
|
||||
compression=comp,
|
||||
storage_options=s3so,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_read_s3_fails(self, s3so):
|
||||
msg = "The specified bucket does not exist"
|
||||
with pytest.raises(OSError, match=msg):
|
||||
read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)
|
||||
|
||||
def test_read_s3_fails_private(self, s3_private_bucket, s3so):
|
||||
msg = "The specified bucket does not exist"
|
||||
# Receive a permission error when trying to read a private bucket.
|
||||
# It's irrelevant here that this isn't actually a table.
|
||||
with pytest.raises(OSError, match=msg):
|
||||
read_csv(f"s3://{s3_private_bucket.name}/file.csv")
|
||||
|
||||
@pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
|
||||
def test_write_s3_csv_fails(self, tips_df, s3so):
|
||||
# GH 32486
|
||||
# Attempting to write to an invalid S3 path should raise
|
||||
import botocore
|
||||
|
||||
# GH 34087
|
||||
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
|
||||
# Catch a ClientError since AWS Service Errors are defined dynamically
|
||||
error = (FileNotFoundError, botocore.exceptions.ClientError)
|
||||
|
||||
with pytest.raises(error, match="The specified bucket does not exist"):
|
||||
tips_df.to_csv(
|
||||
"s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
|
||||
)
|
||||
|
||||
@pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
|
||||
def test_write_s3_parquet_fails(self, tips_df, s3so):
|
||||
# GH 27679
|
||||
# Attempting to write to an invalid S3 path should raise
|
||||
pytest.importorskip("pyarrow")
|
||||
import botocore
|
||||
|
||||
# GH 34087
|
||||
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
|
||||
# Catch a ClientError since AWS Service Errors are defined dynamically
|
||||
error = (FileNotFoundError, botocore.exceptions.ClientError)
|
||||
|
||||
with pytest.raises(error, match="The specified bucket does not exist"):
|
||||
tips_df.to_parquet(
|
||||
"s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
|
||||
storage_options=s3so,
|
||||
)
|
||||
|
||||
@pytest.mark.single_cpu
|
||||
def test_read_csv_handles_boto_s3_object(
|
||||
self, s3_public_bucket_with_data, tips_file
|
||||
):
|
||||
# see gh-16135
|
||||
|
||||
s3_object = s3_public_bucket_with_data.Object("tips.csv")
|
||||
|
||||
with BytesIO(s3_object.get()["Body"].read()) as buffer:
|
||||
result = read_csv(buffer, encoding="utf8")
|
||||
assert isinstance(result, DataFrame)
|
||||
assert not result.empty
|
||||
|
||||
expected = read_csv(tips_file)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.single_cpu
|
||||
def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
|
||||
# 8 MB, S3FS uses 5MB chunks
|
||||
df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
|
||||
with BytesIO(df.to_csv().encode("utf-8")) as buf:
|
||||
s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
|
||||
uri = f"{s3_public_bucket.name}/large-file.csv"
|
||||
match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
|
||||
with caplog.at_level(logging.DEBUG, logger="s3fs"):
|
||||
read_csv(
|
||||
f"s3://{uri}",
|
||||
nrows=5,
|
||||
storage_options=s3so,
|
||||
)
|
||||
for log in caplog.messages:
|
||||
if match := re.match(match_re, log):
|
||||
# Less than 8 MB
|
||||
assert int(match.group("stop")) < 8000000
|
||||
|
||||
def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
|
||||
# GH 25945
|
||||
result = read_csv(
|
||||
f"s3://{s3_public_bucket_with_data.name}/tips#1.csv", storage_options=s3so
|
||||
)
|
||||
tm.assert_frame_equal(tips_df, result)
|
||||
|
||||
def test_read_feather_s3_file_path(
|
||||
self, s3_public_bucket_with_data, feather_file, s3so
|
||||
):
|
||||
# GH 29055
|
||||
pytest.importorskip("pyarrow")
|
||||
expected = read_feather(feather_file)
|
||||
res = read_feather(
|
||||
f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather",
|
||||
storage_options=s3so,
|
||||
)
|
||||
tm.assert_frame_equal(expected, res)
|
||||
File diff suppressed because it is too large
@ -0,0 +1,566 @@
|
||||
"""
|
||||
Tests that apply specifically to the Python parser. Unless specifically
|
||||
stated as a Python-specific issue, the goal is to eventually move as many of
|
||||
these tests as possible out of this module once the C parser can accept further
|
||||
arguments when parsing.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
TextIOWrapper,
|
||||
)
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import (
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator
|
||||
|
||||
|
||||
def test_default_separator(python_parser_only):
|
||||
# see gh-17333
|
||||
#
|
||||
# csv.Sniffer in Python treats "o" as separator.
|
||||
data = "aob\n1o2\n3o4"
|
||||
parser = python_parser_only
|
||||
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
|
||||
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
|
||||
# see gh-15925 (comment)
|
||||
data = "a\n1\n2"
|
||||
parser = python_parser_only
|
||||
msg = "skipfooter must be an integer"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=skipfooter)
|
||||
|
||||
|
||||
def test_invalid_skipfooter_negative(python_parser_only):
|
||||
# see gh-15925 (comment)
|
||||
data = "a\n1\n2"
|
||||
parser = python_parser_only
|
||||
msg = "skipfooter cannot be negative"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=-1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
|
||||
def test_sniff_delimiter(python_parser_only, kwargs):
|
||||
data = """index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=["A", "B", "C"],
|
||||
index=Index(["foo", "bar", "baz"], name="index"),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_sniff_delimiter_comment(python_parser_only):
|
||||
data = """# comment line
|
||||
index|A|B|C
|
||||
# comment line
|
||||
foo|1|2|3 # ignore | this
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#")
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=["A", "B", "C"],
|
||||
index=Index(["foo", "bar", "baz"], name="index"),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||||
def test_sniff_delimiter_encoding(python_parser_only, encoding):
|
||||
parser = python_parser_only
|
||||
data = """ignore this
|
||||
ignore this too
|
||||
index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
|
||||
if encoding is not None:
|
||||
data = data.encode(encoding)
|
||||
data = BytesIO(data)
|
||||
data = TextIOWrapper(data, encoding=encoding)
|
||||
else:
|
||||
data = StringIO(data)
|
||||
|
||||
result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding)
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=["A", "B", "C"],
|
||||
index=Index(["foo", "bar", "baz"], name="index"),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_single_line(python_parser_only):
|
||||
# see gh-6607: sniff separator
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)
|
||||
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
|
||||
def test_skipfooter(python_parser_only, kwargs):
|
||||
# see gh-6607
|
||||
data = """A,B,C
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
want to skip this
|
||||
also also skip this
|
||||
"""
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
|
||||
)
|
||||
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
|
||||
# see gh-6607
|
||||
parser = python_parser_only
|
||||
|
||||
with open(csv1, "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
data = data.replace(b",", b"::")
|
||||
expected = parser.read_csv(csv1)
|
||||
|
||||
module = pytest.importorskip(compression)
|
||||
klass = getattr(module, klass)
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with klass(path, mode="wb") as tmp:
|
||||
tmp.write(data)
|
||||
|
||||
result = parser.read_csv(path, sep="::", compression=compression)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_buglet_4x_multi_index(python_parser_only):
|
||||
# see gh-6607
|
||||
data = """ A B C D E
|
||||
one two three four
|
||||
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
|
||||
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
|
||||
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
|
||||
parser = python_parser_only
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
|
||||
[0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
|
||||
[-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
|
||||
],
|
||||
columns=["A", "B", "C", "D", "E"],
|
||||
index=MultiIndex.from_tuples(
|
||||
[("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
|
||||
names=["one", "two", "three", "four"],
|
||||
),
|
||||
)
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
|
||||
# see gh-6893
|
||||
data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
|
||||
parser = python_parser_only
|
||||
|
||||
expected = DataFrame.from_records(
|
||||
[(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
|
||||
columns=list("abcABC"),
|
||||
index=list("abc"),
|
||||
)
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("add_footer", [True, False])
|
||||
def test_skipfooter_with_decimal(python_parser_only, add_footer):
|
||||
# see gh-6971
|
||||
data = "1#2\n3#4"
|
||||
parser = python_parser_only
|
||||
expected = DataFrame({"a": [1.2, 3.4]})
|
||||
|
||||
if add_footer:
|
||||
# The stray footer line should not mess with the
|
||||
# casting of the first two lines if we skip it.
|
||||
kwargs = {"skipfooter": 1}
|
||||
data += "\nFooter"
|
||||
else:
|
||||
kwargs = {}
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
|
||||
)
|
||||
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
|
||||
# see gh-3404
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
parser = python_parser_only
|
||||
|
||||
data = "1" + sep + "2"
|
||||
encoded_data = data.encode(encoding)
|
||||
|
||||
result = parser.read_csv(
|
||||
BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
|
||||
def test_multi_char_sep_quotes(python_parser_only, quoting):
|
||||
# see gh-13374
|
||||
kwargs = {"sep": ",,"}
|
||||
parser = python_parser_only
|
||||
|
||||
data = 'a,,b\n1,,a\n2,,"2,,b"'
|
||||
|
||||
if quoting == csv.QUOTE_NONE:
|
||||
msg = "Expected 2 fields in line 3, saw 3"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
|
||||
else:
|
||||
msg = "ignored when a multi-char delimiter is used"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
|
||||
|
||||
|
||||
def test_none_delimiter(python_parser_only):
|
||||
# see gh-13374 and gh-17465
|
||||
parser = python_parser_only
|
||||
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
|
||||
expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
|
||||
|
||||
# We expect the third line in the data to be
|
||||
# skipped because it is malformed, but we do
|
||||
# not expect any errors to occur.
|
||||
with tm.assert_produces_warning(
|
||||
ParserWarning, match="Skipping line 3", check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(
|
||||
StringIO(data), header=0, sep=None, on_bad_lines="warn"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
|
||||
@pytest.mark.parametrize("skipfooter", [0, 1])
|
||||
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
|
||||
# see gh-13879 and gh-15910
|
||||
parser = python_parser_only
|
||||
if skipfooter:
|
||||
msg = "parsing errors in the skipped footer rows"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=skipfooter)
|
||||
else:
|
||||
msg = "unexpected end of data|expected after"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=skipfooter)
|
||||
|
||||
|
||||
def test_malformed_skipfooter(python_parser_only):
|
||||
parser = python_parser_only
|
||||
data = """ignore
|
||||
A,B,C
|
||||
1,2,3 # comment
|
||||
1,2,3,4,5
|
||||
2,3,4
|
||||
footer
|
||||
"""
|
||||
msg = "Expected 3 fields in line 4, saw 5"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
|
||||
|
||||
|
||||
def test_python_engine_file_no_next(python_parser_only):
|
||||
parser = python_parser_only
|
||||
|
||||
class NoNextBuffer:
|
||||
def __init__(self, csv_data) -> None:
|
||||
self.data = csv_data
|
||||
|
||||
def __iter__(self) -> Iterator:
|
||||
return self.data.__iter__()
|
||||
|
||||
def read(self):
|
||||
return self.data
|
||||
|
||||
def readline(self):
|
||||
return self.data
|
||||
|
||||
parser.read_csv(NoNextBuffer("a\n1"))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
|
||||
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
|
||||
# GH 5686
|
||||
parser = python_parser_only
|
||||
data = """a,b
|
||||
1,2
|
||||
2,3,4,5,6
|
||||
3,4
|
||||
"""
|
||||
bad_sio = StringIO(data)
|
||||
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
|
||||
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
|
||||
# GH 5686
|
||||
parser = python_parser_only
|
||||
data = """a,b
|
||||
1,2
|
||||
2,3,4,5,6
|
||||
3,4
|
||||
"""
|
||||
bad_sio = StringIO(data)
|
||||
lst = []
|
||||
|
||||
def bad_line_func(bad_line: list[str]) -> list[str]:
|
||||
lst.append(bad_line)
|
||||
return ["2", "3"]
|
||||
|
||||
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
|
||||
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert lst == [["2", "3", "4", "5", "6"]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
|
||||
@pytest.mark.parametrize("sep", [",", "111"])
|
||||
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
|
||||
# GH 5686
|
||||
# iterator=True has a separate code path from iterator=False
|
||||
parser = python_parser_only
|
||||
data = f"""
|
||||
0{sep}1
|
||||
hi{sep}there
|
||||
foo{sep}bar{sep}baz
|
||||
good{sep}bye
|
||||
"""
|
||||
bad_sio = StringIO(data)
|
||||
result_iter = parser.read_csv(
|
||||
bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
|
||||
)
|
||||
expecteds = [
|
||||
{"0": "hi", "1": "there"},
|
||||
{"0": "foo", "1": "bar"},
|
||||
{"0": "good", "1": "bye"},
|
||||
]
|
||||
for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
|
||||
expected = DataFrame(expected, index=range(i, i + 1))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
|
||||
# GH 5686
|
||||
parser = python_parser_only
|
||||
data = """a,b
|
||||
1,2
|
||||
2,3,4,5,6
|
||||
3,4
|
||||
"""
|
||||
bad_sio = StringIO(data)
|
||||
msg = "This function is buggy."
|
||||
|
||||
def bad_line_func(bad_line):
|
||||
raise ValueError(msg)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
|
||||
|
||||
|
||||
def test_on_bad_lines_callable_not_expected_length(python_parser_only):
|
||||
# GH 5686
|
||||
parser = python_parser_only
|
||||
data = """a,b
|
||||
1,2
|
||||
2,3,4,5,6
|
||||
3,4
|
||||
"""
|
||||
bad_sio = StringIO(data)
|
||||
|
||||
result = parser.read_csv_check_warnings(
|
||||
ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
|
||||
)
|
||||
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_on_bad_lines_callable_returns_none(python_parser_only):
|
||||
# GH 5686
|
||||
parser = python_parser_only
|
||||
data = """a,b
|
||||
1,2
|
||||
2,3,4,5,6
|
||||
3,4
|
||||
"""
|
||||
bad_sio = StringIO(data)
|
||||
|
||||
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
|
||||
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_on_bad_lines_index_col_inferred(python_parser_only):
|
||||
# GH 5686
|
||||
parser = python_parser_only
|
||||
data = """a,b
|
||||
1,2,3
|
||||
4,5,6
|
||||
"""
|
||||
bad_sio = StringIO(data)
|
||||
|
||||
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
|
||||
expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_col_false_and_header_none(python_parser_only):
|
||||
# GH#46955
|
||||
parser = python_parser_only
|
||||
data = """
|
||||
0.5,0.03
|
||||
0.1,0.2,0.3,2
|
||||
"""
|
||||
result = parser.read_csv_check_warnings(
|
||||
ParserWarning,
|
||||
"Length of header",
|
||||
StringIO(data),
|
||||
sep=",",
|
||||
header=None,
|
||||
index_col=False,
|
||||
)
|
||||
expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only):
|
||||
# GH#46569
|
||||
parser = python_parser_only
|
||||
data = StringIO("a\na,b\nc,d,e\nf,g,h")
|
||||
result = parser.read_csv_check_warnings(
|
||||
ParserWarning, "Length of header", data, engine="python", index_col=False
|
||||
)
|
||||
expected = DataFrame({"a": ["a", "c", "f"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
|
||||
)
|
||||
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
|
||||
# GH#50270
|
||||
parser = python_parser_only
|
||||
data = """\
|
||||
a;b;c
|
||||
0000.7995;16.000;0
|
||||
3.03.001.00514;0;4.000
|
||||
4923.600.041;23.000;131"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
sep=";",
|
||||
dtype=dtype,
|
||||
thousands=".",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
|
||||
"b": [16000, 0, 23000],
|
||||
"c": [0, 4000, 131],
|
||||
}
|
||||
)
|
||||
if dtype["a"] == object:
|
||||
expected["a"] = expected["a"].astype(object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype,expected",
|
||||
[
|
||||
(
|
||||
{"a": str, "b": np.float64, "c": np.int64},
|
||||
DataFrame(
|
||||
{
|
||||
"b": [16000.1, 0, 23000],
|
||||
"c": [0, 4001, 131],
|
||||
}
|
||||
),
|
||||
),
|
||||
(
|
||||
str,
|
||||
DataFrame(
|
||||
{
|
||||
"b": ["16,000.1", "0", "23,000"],
|
||||
"c": ["0", "4,001", "131"],
|
||||
}
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
|
||||
# GH#50270
|
||||
parser = python_parser_only
|
||||
data = """a;b;c
|
||||
0000,7995;16,000.1;0
|
||||
3,03,001,00514;0;4,001
|
||||
4923,600,041;23,000;131
|
||||
"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
sep=";",
|
||||
dtype=dtype,
|
||||
thousands=",",
|
||||
)
|
||||
expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,183 @@
|
||||
"""
|
||||
Tests that quoting specifications are properly handled
|
||||
during parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PY311
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,msg",
|
||||
[
|
||||
({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'),
|
||||
(
|
||||
{"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
|
||||
"quotechar must be set if quoting enabled",
|
||||
),
|
||||
({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'),
|
||||
],
|
||||
)
|
||||
@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block
|
||||
def test_bad_quote_char(all_parsers, kwargs, msg):
|
||||
data = "1,2,3"
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"quoting,msg",
|
||||
[
|
||||
("foo", '"quoting" must be an integer|Argument'),
|
||||
(10, 'bad "quoting" value'), # quoting must be in the range [0, 3]
|
||||
],
|
||||
)
|
||||
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
|
||||
def test_bad_quoting(all_parsers, quoting, msg):
|
||||
data = "1,2,3"
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), quoting=quoting)
|
||||
|
||||
|
||||
def test_quote_char_basic(all_parsers):
|
||||
parser = all_parsers
|
||||
data = 'a,b,c\n1,2,"cat"'
|
||||
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
|
||||
|
||||
result = parser.read_csv(StringIO(data), quotechar='"')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
|
||||
def test_quote_char_various(all_parsers, quote_char):
|
||||
parser = all_parsers
|
||||
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
|
||||
|
||||
data = 'a,b,c\n1,2,"cat"'
|
||||
new_data = data.replace('"', quote_char)
|
||||
|
||||
result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
|
||||
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
|
||||
@pytest.mark.parametrize("quote_char", ["", None])
|
||||
def test_null_quote_char(all_parsers, quoting, quote_char):
|
||||
kwargs = {"quotechar": quote_char, "quoting": quoting}
|
||||
data = "a,b,c\n1,2,3"
|
||||
parser = all_parsers
|
||||
|
||||
if quoting != csv.QUOTE_NONE:
|
||||
# Sanity checking.
|
||||
msg = (
|
||||
'"quotechar" must be a 1-character string'
|
||||
if PY311 and all_parsers.engine == "python" and quote_char == ""
|
||||
else "quotechar must be set if quoting enabled"
|
||||
)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
elif not (PY311 and all_parsers.engine == "python"):
|
||||
# Python 3.11+ doesn't support null/blank quote chars in its csv parser
|
||||
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,exp_data",
|
||||
[
|
||||
({}, [[1, 2, "foo"]]), # Test default.
|
||||
# QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
|
||||
({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),
|
||||
# QUOTE_ALL only applies to CSV writing, so no effect on reading.
|
||||
({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),
|
||||
# QUOTE_NONE tells the reader to do no special handling
|
||||
# of quote characters and leave them alone.
|
||||
({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),
|
||||
# QUOTE_NONNUMERIC tells the reader to cast
|
||||
# all non-quoted fields to float
|
||||
({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]),
|
||||
],
|
||||
)
|
||||
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
|
||||
def test_quoting_various(all_parsers, kwargs, exp_data):
|
||||
data = '1,2,"foo"'
|
||||
parser = all_parsers
|
||||
columns = ["a", "b", "c"]
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=columns, **kwargs)
|
||||
expected = DataFrame(exp_data, columns=columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
|
||||
)
|
||||
def test_double_quote(all_parsers, doublequote, exp_data, request):
|
||||
parser = all_parsers
|
||||
data = 'a,b\n3,"4 "" 5"'
|
||||
|
||||
if parser.engine == "pyarrow" and not doublequote:
|
||||
mark = pytest.mark.xfail(reason="Mismatched result")
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote)
|
||||
expected = DataFrame(exp_data, columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
|
||||
def test_quotechar_unicode(all_parsers, quotechar):
|
||||
# see gh-14477
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), quotechar=quotechar)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("balanced", [True, False])
|
||||
def test_unbalanced_quoting(all_parsers, balanced, request):
|
||||
# see gh-22789.
|
||||
parser = all_parsers
|
||||
data = 'a,b,c\n1,2,"3'
|
||||
|
||||
if parser.engine == "pyarrow" and not balanced:
|
||||
mark = pytest.mark.xfail(reason="Mismatched result")
|
||||
request.applymarker(mark)
|
||||
|
||||
if balanced:
|
||||
# Re-balance the quoting and read in without errors.
|
||||
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
|
||||
result = parser.read_csv(StringIO(data + '"'))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
msg = (
|
||||
"EOF inside string starting at row 1"
|
||||
if parser.engine == "c"
|
||||
else "unexpected end of data"
|
||||
)
|
||||
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data))
|
||||
1034
lib/python3.11/site-packages/pandas/tests/io/parser/test_read_fwf.py
Normal file
1034
lib/python3.11/site-packages/pandas/tests/io/parser/test_read_fwf.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,334 @@
|
||||
"""
|
||||
Tests that skipped rows are properly handled during
|
||||
parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import EmptyDataError
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
|
||||
def test_skip_rows_bug(all_parsers, skiprows):
|
||||
# see gh-505
|
||||
parser = all_parsers
|
||||
text = """#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
1/1/2000,1.,2.,3.
|
||||
1/2/2000,4,5,6
|
||||
1/3/2000,7,8,9
|
||||
"""
|
||||
result = parser.read_csv(
|
||||
StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
|
||||
)
|
||||
index = Index(
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_deep_skip_rows(all_parsers):
|
||||
# see gh-4382
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n" + "\n".join(
|
||||
[",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
|
||||
)
|
||||
condensed_data = "a,b,c\n" + "\n".join(
|
||||
[",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
|
||||
)
|
||||
|
||||
result = parser.read_csv(StringIO(data), skiprows=[6, 8])
|
||||
condensed_result = parser.read_csv(StringIO(condensed_data))
|
||||
tm.assert_frame_equal(result, condensed_result)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame are different
|
||||
def test_skip_rows_blank(all_parsers):
|
||||
# see gh-9832
|
||||
parser = all_parsers
|
||||
text = """#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
|
||||
1/1/2000,1.,2.,3.
|
||||
1/2/2000,4,5,6
|
||||
1/3/2000,7,8,9
|
||||
"""
|
||||
data = parser.read_csv(
|
||||
StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
|
||||
)
|
||||
index = Index(
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
|
||||
)
|
||||
tm.assert_frame_equal(data, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
(
|
||||
"""id,text,num_lines
|
||||
1,"line 11
|
||||
line 12",2
|
||||
2,"line 21
|
||||
line 22",2
|
||||
3,"line 31",1""",
|
||||
{"skiprows": [1]},
|
||||
DataFrame(
|
||||
[[2, "line 21\nline 22", 2], [3, "line 31", 1]],
|
||||
columns=["id", "text", "num_lines"],
|
||||
),
|
||||
),
|
||||
(
|
||||
"a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
|
||||
{"quotechar": "~", "skiprows": [2]},
|
||||
DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
|
||||
),
|
||||
(
|
||||
(
|
||||
"Text,url\n~example\n "
|
||||
"sentence\n one~,url1\n~"
|
||||
"example\n sentence\n two~,url2\n~"
|
||||
"example\n sentence\n three~,url3"
|
||||
),
|
||||
{"quotechar": "~", "skiprows": [1, 3]},
|
||||
DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
|
||||
),
|
||||
],
|
||||
)
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
|
||||
# see gh-12775 and gh-10911
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_skip_row_with_quote(all_parsers):
|
||||
# see gh-12775 and gh-10911
|
||||
parser = all_parsers
|
||||
data = """id,text,num_lines
|
||||
1,"line '11' line 12",2
|
||||
2,"line '21' line 22",2
|
||||
3,"line '31' line 32",1"""
|
||||
|
||||
exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
|
||||
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
|
||||
|
||||
result = parser.read_csv(StringIO(data), skiprows=[1])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,exp_data",
|
||||
[
|
||||
(
|
||||
"""id,text,num_lines
|
||||
1,"line \n'11' line 12",2
|
||||
2,"line \n'21' line 22",2
|
||||
3,"line \n'31' line 32",1""",
|
||||
[[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
|
||||
),
|
||||
(
|
||||
"""id,text,num_lines
|
||||
1,"line '11\n' line 12",2
|
||||
2,"line '21\n' line 22",2
|
||||
3,"line '31\n' line 32",1""",
|
||||
[[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
|
||||
),
|
||||
(
|
||||
"""id,text,num_lines
|
||||
1,"line '11\n' \r\tline 12",2
|
||||
2,"line '21\n' \r\tline 22",2
|
||||
3,"line '31\n' \r\tline 32",1""",
|
||||
[[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
|
||||
),
|
||||
],
|
||||
)
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
|
||||
# see gh-12775 and gh-10911
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), skiprows=[1])
|
||||
|
||||
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported
|
||||
@pytest.mark.parametrize(
|
||||
"lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR"
|
||||
)
|
||||
def test_skiprows_lineterminator(all_parsers, lineterminator, request):
|
||||
# see gh-9079
|
||||
parser = all_parsers
|
||||
data = "\n".join(
|
||||
[
|
||||
"SMOSMANIA ThetaProbe-ML2X ",
|
||||
"2007/01/01 01:00 0.2140 U M ",
|
||||
"2007/01/01 02:00 0.2141 M O ",
|
||||
"2007/01/01 04:00 0.2142 D M ",
|
||||
]
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
["2007/01/01", "01:00", 0.2140, "U", "M"],
|
||||
["2007/01/01", "02:00", 0.2141, "M", "O"],
|
||||
["2007/01/01", "04:00", 0.2142, "D", "M"],
|
||||
],
|
||||
columns=["date", "time", "var", "flag", "oflag"],
|
||||
)
|
||||
|
||||
if parser.engine == "python" and lineterminator == "\r":
|
||||
mark = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet")
|
||||
request.applymarker(mark)
|
||||
|
||||
data = data.replace("\n", lineterminator)
|
||||
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
skiprows=1,
|
||||
delim_whitespace=True,
|
||||
names=["date", "time", "var", "flag", "oflag"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # AssertionError: DataFrame are different
|
||||
def test_skiprows_infield_quote(all_parsers):
|
||||
# see gh-14459
|
||||
parser = all_parsers
|
||||
data = 'a"\nb"\na\n1'
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), skiprows=2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,expected",
|
||||
[
|
||||
({}, DataFrame({"1": [3, 5]})),
|
||||
({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
|
||||
],
|
||||
)
|
||||
def test_skip_rows_callable(all_parsers, kwargs, expected):
|
||||
parser = all_parsers
|
||||
data = "a\n1\n2\n3\n4\n5"
|
||||
|
||||
result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_skip_rows_callable_not_in(all_parsers):
|
||||
parser = all_parsers
|
||||
data = "0,a\n1,b\n2,c\n3,d\n4,e"
|
||||
expected = DataFrame([[1, "b"], [3, "d"]])
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_skip_rows_skip_all(all_parsers):
|
||||
parser = all_parsers
|
||||
data = "a\n1\n2\n3\n4\n5"
|
||||
msg = "No columns to parse from file"
|
||||
|
||||
with pytest.raises(EmptyDataError, match=msg):
|
||||
parser.read_csv(StringIO(data), skiprows=lambda x: True)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_skip_rows_bad_callable(all_parsers):
|
||||
msg = "by zero"
|
||||
parser = all_parsers
|
||||
data = "a\n1\n2\n3\n4\n5"
|
||||
|
||||
with pytest.raises(ZeroDivisionError, match=msg):
|
||||
parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
|
||||
|
||||
|
||||
@xfail_pyarrow # ValueError: skiprows argument must be an integer
|
||||
def test_skip_rows_and_n_rows(all_parsers):
|
||||
# GH#44021
|
||||
data = """a,b
|
||||
1,a
|
||||
2,b
|
||||
3,c
|
||||
4,d
|
||||
5,e
|
||||
6,f
|
||||
7,g
|
||||
8,h
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
|
||||
expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow
|
||||
def test_skip_rows_with_chunks(all_parsers):
|
||||
# GH 55677
|
||||
data = """col_a
|
||||
10
|
||||
20
|
||||
30
|
||||
40
|
||||
50
|
||||
60
|
||||
70
|
||||
80
|
||||
90
|
||||
100
|
||||
"""
|
||||
parser = all_parsers
|
||||
reader = parser.read_csv(
|
||||
StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4
|
||||
)
|
||||
df1 = next(reader)
|
||||
df2 = next(reader)
|
||||
|
||||
tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
|
||||
tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))
|
||||
@ -0,0 +1,342 @@
|
||||
"""
|
||||
Tests the TextReader class in parsers.pyx, which
|
||||
is integral to the C engine in parsers.py
|
||||
"""
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas._libs.parsers as parser
|
||||
from pandas._libs.parsers import TextReader
|
||||
from pandas.errors import ParserWarning
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.parsers import (
|
||||
TextFileReader,
|
||||
read_csv,
|
||||
)
|
||||
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
|
||||
|
||||
|
||||
class TestTextReader:
|
||||
@pytest.fixture
|
||||
def csv_path(self, datapath):
|
||||
return datapath("io", "data", "csv", "test1.csv")
|
||||
|
||||
def test_file_handle(self, csv_path):
|
||||
with open(csv_path, "rb") as f:
|
||||
reader = TextReader(f)
|
||||
reader.read()
|
||||
|
||||
def test_file_handle_mmap(self, csv_path):
|
||||
# this was never using memory_map=True
|
||||
with open(csv_path, "rb") as f:
|
||||
reader = TextReader(f, header=None)
|
||||
reader.read()
|
||||
|
||||
def test_StringIO(self, csv_path):
|
||||
with open(csv_path, "rb") as f:
|
||||
text = f.read()
|
||||
src = BytesIO(text)
|
||||
reader = TextReader(src, header=None)
|
||||
reader.read()
|
||||
|
||||
def test_string_factorize(self):
|
||||
# should this be optional?
|
||||
data = "a\nb\na\nb\na"
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
assert len(set(map(id, result[0]))) == 2
|
||||
|
||||
def test_skipinitialspace(self):
|
||||
data = "a, b\na, b\na, b\na, b"
|
||||
|
||||
reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
|
||||
result = reader.read()
|
||||
|
||||
tm.assert_numpy_array_equal(
|
||||
result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
|
||||
)
|
||||
tm.assert_numpy_array_equal(
|
||||
result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
|
||||
)
|
||||
|
||||
def test_parse_booleans(self):
|
||||
data = "True\nFalse\nTrue\nTrue"
|
||||
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
|
||||
assert result[0].dtype == np.bool_
|
||||
|
||||
def test_delimit_whitespace(self):
|
||||
data = 'a b\na\t\t "b"\n"a"\t \t b'
|
||||
|
||||
reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
|
||||
result = reader.read()
|
||||
|
||||
tm.assert_numpy_array_equal(
|
||||
result[0], np.array(["a", "a", "a"], dtype=np.object_)
|
||||
)
|
||||
tm.assert_numpy_array_equal(
|
||||
result[1], np.array(["b", "b", "b"], dtype=np.object_)
|
||||
)
|
||||
|
||||
def test_embedded_newline(self):
|
||||
data = 'a\n"hello\nthere"\nthis'
|
||||
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
|
||||
tm.assert_numpy_array_equal(result[0], expected)
|
||||
|
||||
def test_euro_decimal(self):
|
||||
data = "12345,67\n345,678"
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array([12345.67, 345.678])
|
||||
tm.assert_almost_equal(result[0], expected)
|
||||
|
||||
def test_integer_thousands(self):
|
||||
data = "123,456\n12,500"
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array([123456, 12500], dtype=np.int64)
|
||||
tm.assert_almost_equal(result[0], expected)
|
||||
|
||||
def test_integer_thousands_alt(self):
|
||||
data = "123.456\n12.500"
|
||||
|
||||
reader = TextFileReader(
|
||||
StringIO(data), delimiter=":", thousands=".", header=None
|
||||
)
|
||||
result = reader.read()
|
||||
|
||||
expected = DataFrame([123456, 12500])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
    def test_skip_bad_lines(self):
        # too many lines, see #2430 for why
        data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"

        reader = TextReader(StringIO(data), delimiter=":", header=None)
        msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
        with pytest.raises(parser.ParserError, match=msg):
            reader.read()

        reader = TextReader(
            StringIO(data), delimiter=":", header=None, on_bad_lines=2  # Skip
        )
        result = reader.read()
        expected = {
            0: np.array(["a", "d", "g", "l"], dtype=object),
            1: np.array(["b", "e", "h", "m"], dtype=object),
            2: np.array(["c", "f", "i", "n"], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

        with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
            reader = TextReader(
                StringIO(data), delimiter=":", header=None, on_bad_lines=1  # Warn
            )
            reader.read()

    def test_header_not_enough_lines(self):
        data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"

        reader = TextReader(StringIO(data), delimiter=",", header=2)
        header = reader.header
        expected = [["a", "b", "c"]]
        assert header == expected

        recs = reader.read()
        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array([2, 5], dtype=np.int64),
            2: np.array([3, 6], dtype=np.int64),
        }
        assert_array_dicts_equal(recs, expected)

    def test_escapechar(self):
        data = '\\"hello world"\n\\"hello world"\n\\"hello world"'

        reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
        result = reader.read()
        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(result, expected)

    def test_eof_has_eol(self):
        # handling of new line at EOF
        pass

    def test_na_substitution(self):
        pass

    def test_numpy_string_dtype(self):
        data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

        def _make_reader(**kwds):
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", header=None, **kwds)

        reader = _make_reader(dtype="S5,i4")
        result = reader.read()

        assert result[0].dtype == "S5"

        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "i4"

        reader = _make_reader(dtype="S4")
        result = reader.read()
        assert result[0].dtype == "S4"
        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "S4"

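    # dtype may be keyed either by column name ("one") or by position (1);
    # both spellings select the same parsed column.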
    def test_pass_dtype(self):
        data = """\
one,two
1,a
2,b
3,c
4,d"""

        def _make_reader(**kwds):
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(dtype={"one": "u1", 1: "S1"})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "S1"

        reader = _make_reader(dtype={"one": np.uint8, 1: object})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

        reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

    def test_usecols(self):
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(usecols=(1, 2))
        result = reader.read()

        exp = _make_reader().read()
        assert len(result) == 2
        assert (result[1] == exp[1]).all()
        assert (result[2] == exp[2]).all()

    @pytest.mark.parametrize(
        "text, kwargs",
        [
            ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
            (
                "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12",
                {"delim_whitespace": True},
            ),
            ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
            (
                (
                    "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
                    "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
                    ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
                ),
                {"delimiter": ","},
            ),
            ("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}),
            ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
        ],
    )
    def test_cr_delimited(self, text, kwargs):
        nice_text = text.replace("\r", "\r\n")
        result = TextReader(StringIO(text), **kwargs).read()
        expected = TextReader(StringIO(nice_text), **kwargs).read()
        assert_array_dicts_equal(result, expected)

    def test_empty_field_eof(self):
        data = "a,b,c\n1,2,3\n4,,"

        result = TextReader(StringIO(data), delimiter=",").read()

        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array(["2", ""], dtype=object),
            2: np.array(["3", ""], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

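    # The repeat parameter reruns the same case several times so the GH5664
    # memory-access bug is more likely to surface if it regresses.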
@pytest.mark.parametrize("repeat", range(10))
|
||||
def test_empty_field_eof_mem_access_bug(self, repeat):
|
||||
# GH5664
|
||||
a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
|
||||
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
|
||||
c = DataFrame(
|
||||
[
|
||||
[1, 2, 3, 4],
|
||||
[6, np.nan, np.nan, np.nan],
|
||||
[8, 9, 10, 11],
|
||||
[13, 14, np.nan, np.nan],
|
||||
],
|
||||
columns=list("abcd"),
|
||||
index=[0, 5, 7, 12],
|
||||
)
|
||||
|
||||
df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
|
||||
tm.assert_frame_equal(df, a)
|
||||
|
||||
df = read_csv(
|
||||
StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
|
||||
)
|
||||
tm.assert_frame_equal(df, b)
|
||||
|
||||
df = read_csv(
|
||||
StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
|
||||
names=list("abcd"),
|
||||
engine="c",
|
||||
)
|
||||
tm.assert_frame_equal(df, c)
|
||||
|
||||
def test_empty_csv_input(self):
|
||||
# GH14867
|
||||
with read_csv(
|
||||
StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
|
||||
) as df:
|
||||
assert isinstance(df, TextFileReader)
|
||||
|
||||
|
||||
def assert_array_dicts_equal(left, right):
|
||||
for k, v in left.items():
|
||||
tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))
|
||||
@ -0,0 +1,226 @@
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.

Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
from io import StringIO
import os
from pathlib import Path

import pytest

from pandas.errors import ParserError

import pandas._testing as tm

from pandas.io.parsers import read_csv
import pandas.io.parsers.readers as parsers

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    return request.param


class TestUnsupportedFeatures:
    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        data = "a b c\n1 2 3"

        for engine in ("c", "python"):
            with pytest.raises(TypeError, match="unexpected keyword"):
                read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)

    def test_c_engine(self):
        # see gh-6607
        data = "a b c\n1 2 3"
        msg = "does not support"

        depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

        # specify C engine with unsupported options (raise)
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match=depr_msg):
                read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep=r"\s")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)):
            read_csv(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=r"\s")
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep="\t", quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), skipfooter=1)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = "Error tokenizing data"

        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), sep="\\s+")
        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), engine="c", sep="\\s+")

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands=",,")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands="")

        msg = "Only length-1 line terminators supported"
        data = "a,b,c~~1,2,3~~4,5,6"
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator="~~")

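    # _python_unsupported (imported below) lists the read_csv keywords the
    # python engine rejects; passing any of them should raise ValueError.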
    def test_python_engine(self, python_engine):
        from pandas.io.parsers.readers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in py_unsupported:
            msg = (
                f"The {repr(default)} option is not "
                f"supported with the {repr(python_engine)} engine"
            )

            kwargs = {default: object()}
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=python_engine, **kwargs)

    def test_python_engine_file_no_iter(self, python_engine):
        # see gh-16530
        class NoNextBuffer:
            def __init__(self, csv_data) -> None:
                self.data = csv_data

            def __next__(self):
                return self.data.__next__()

            def read(self):
                return self.data

            def readline(self):
                return self.data

        data = "a\n1"
        msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"

        with pytest.raises(TypeError, match=msg):
            read_csv(NoNextBuffer(data), engine=python_engine)

    def test_pyarrow_engine(self):
        from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in pa_unsupported:
            msg = (
                f"The {repr(default)} option is not "
                f"supported with the 'pyarrow' engine"
            )
            kwargs = {default: object()}
            default_needs_bool = {"warn_bad_lines", "error_bad_lines"}
            if default == "dialect":
                kwargs[default] = "excel"  # test a random dialect
            elif default in default_needs_bool:
                kwargs[default] = True
            elif default == "on_bad_lines":
                kwargs[default] = "warn"

            warn = None
            depr_msg = None
            if "delim_whitespace" in kwargs:
                depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
                warn = FutureWarning
            if "verbose" in kwargs:
                depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
                warn = FutureWarning

            with pytest.raises(ValueError, match=msg):
                with tm.assert_produces_warning(warn, match=depr_msg):
                    read_csv(StringIO(data), engine="pyarrow", **kwargs)

    def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
        # GH 5686
        # GH 54643
        sio = StringIO("a,b\n1,2")
        bad_lines_func = lambda x: x
        parser = all_parsers
        if all_parsers.engine not in ["python", "pyarrow"]:
            msg = (
                "on_bad_line can only be a callable "
                "function if engine='python' or 'pyarrow'"
            )
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(sio, on_bad_lines=bad_lines_func)
        else:
            parser.read_csv(sio, on_bad_lines=bad_lines_func)


def test_close_file_handle_on_invalid_usecols(all_parsers):
    # GH 45384
    parser = all_parsers

    error = ValueError
    if parser.engine == "pyarrow":
        # Raises pyarrow.lib.ArrowKeyError
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with tm.ensure_clean("test.csv") as fname:
        Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
        with tm.assert_produces_warning(False):
            with pytest.raises(error, match="col3"):
                parser.read_csv(fname, usecols=["col1", "col2", "col3"])
        # unlink fails on windows if file handles still point to it
        os.unlink(fname)


def test_invalid_file_inputs(request, all_parsers):
    # GH#45957
    parser = all_parsers
    if parser.engine == "python":
        request.applymarker(
            pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
        )

    with pytest.raises(ValueError, match="Invalid"):
        parser.read_csv([])


def test_invalid_dtype_backend(all_parsers):
    parser = all_parsers
    msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv("test", dtype_backend="numpy")
@ -0,0 +1,102 @@
import numpy as np
import pytest

from pandas._libs.parsers import (
    _maybe_upcast,
    na_values,
)

import pandas as pd
from pandas import NA
import pandas._testing as tm
from pandas.core.arrays import (
    ArrowStringArray,
    BooleanArray,
    FloatingArray,
    IntegerArray,
    StringArray,
)


def test_maybe_upcast(any_real_numpy_dtype):
    # GH#36712

    dtype = np.dtype(any_real_numpy_dtype)
    na_value = na_values[dtype]
    arr = np.array([1, 2, na_value], dtype=dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, True])
    if issubclass(dtype.type, np.integer):
        expected = IntegerArray(arr, mask=expected_mask)
    else:
        expected = FloatingArray(arr, mask=expected_mask)

    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_no_na(any_real_numpy_dtype):
    # GH#36712
    arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, False])
    if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer):
        expected = IntegerArray(arr, mask=expected_mask)
    else:
        expected = FloatingArray(arr, mask=expected_mask)

    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcaste_bool():
    # GH#36712
    dtype = np.bool_
    na_value = na_values[dtype]
    arr = np.array([True, False, na_value], dtype="uint8").view(dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, True])
    expected = BooleanArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcaste_bool_no_nan():
    # GH#36712
    dtype = np.bool_
    arr = np.array([True, False, False], dtype="uint8").view(dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, False])
    expected = BooleanArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcaste_all_nan():
    # GH#36712
    dtype = np.int64
    na_value = na_values[dtype]
    arr = np.array([na_value, na_value], dtype=dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([True, True])
    expected = IntegerArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


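# The string_storage fixture switches the string backend: with "python" the
# result is a StringArray, otherwise an ArrowStringArray backed by pyarrow.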
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
|
||||
def test_maybe_upcast_object(val, string_storage):
|
||||
# GH#36712
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
|
||||
with pd.option_context("mode.string_storage", string_storage):
|
||||
arr = np.array(["a", "b", val], dtype=np.object_)
|
||||
result = _maybe_upcast(arr, use_dtype_backend=True)
|
||||
|
||||
if string_storage == "python":
|
||||
exp_val = "c" if val == "c" else NA
|
||||
expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
|
||||
else:
|
||||
exp_val = "c" if val == "c" else None
|
||||
expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
|
||||
tm.assert_extension_array_equal(result, expected)
|
||||
@ -0,0 +1,194 @@
|
||||
"""
|
||||
Tests the usecols functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
_msg_pyarrow_requires_names = (
|
||||
"The pyarrow engine does not allow 'usecols' to be integer column "
|
||||
"positions. Pass a list of string column names instead."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
|
||||
def test_usecols_with_parse_dates(all_parsers, usecols):
|
||||
# see gh-9755
|
||||
data = """a,b,c,d,e
|
||||
0,1,2014-01-01,09:00,4
|
||||
0,1,2014-01-02,10:00,4"""
|
||||
parser = all_parsers
|
||||
parse_dates = [[1, 2]]
|
||||
|
||||
depr_msg = (
|
||||
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
|
||||
)
|
||||
|
||||
cols = {
|
||||
"a": [0, 0],
|
||||
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
|
||||
}
|
||||
expected = DataFrame(cols, columns=["c_d", "a"])
|
||||
if parser.engine == "pyarrow":
|
||||
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(
|
||||
StringIO(data), usecols=usecols, parse_dates=parse_dates
|
||||
)
|
||||
return
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(
|
||||
StringIO(data), usecols=usecols, parse_dates=parse_dates
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns
|
||||
def test_usecols_with_parse_dates2(all_parsers):
|
||||
# see gh-13604
|
||||
parser = all_parsers
|
||||
data = """2008-02-07 09:40,1032.43
|
||||
2008-02-07 09:50,1042.54
|
||||
2008-02-07 10:00,1051.65"""
|
||||
|
||||
names = ["date", "values"]
|
||||
usecols = names[:]
|
||||
parse_dates = [0]
|
||||
|
||||
index = Index(
|
||||
[
|
||||
Timestamp("2008-02-07 09:40"),
|
||||
Timestamp("2008-02-07 09:50"),
|
||||
Timestamp("2008-02-07 10:00"),
|
||||
],
|
||||
name="date",
|
||||
)
|
||||
cols = {"values": [1032.43, 1042.54, 1051.65]}
|
||||
expected = DataFrame(cols, index=index)
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
parse_dates=parse_dates,
|
||||
index_col=0,
|
||||
usecols=usecols,
|
||||
header=None,
|
||||
names=names,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_with_parse_dates3(all_parsers):
|
||||
# see gh-14792
|
||||
parser = all_parsers
|
||||
data = """a,b,c,d,e,f,g,h,i,j
|
||||
2016/09/21,1,1,2,3,4,5,6,7,8"""
|
||||
|
||||
usecols = list("abcdefghij")
|
||||
parse_dates = [0]
|
||||
|
||||
cols = {
|
||||
"a": Timestamp("2016-09-21").as_unit("ns"),
|
||||
"b": [1],
|
||||
"c": [1],
|
||||
"d": [2],
|
||||
"e": [3],
|
||||
"f": [4],
|
||||
"g": [5],
|
||||
"h": [6],
|
||||
"i": [7],
|
||||
"j": [8],
|
||||
}
|
||||
expected = DataFrame(cols, columns=usecols)
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_with_parse_dates4(all_parsers):
|
||||
data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
|
||||
usecols = list("abcdefghij")
|
||||
parse_dates = [[0, 1]]
|
||||
parser = all_parsers
|
||||
|
||||
cols = {
|
||||
"a_b": "2016/09/21 1",
|
||||
"c": [1],
|
||||
"d": [2],
|
||||
"e": [3],
|
||||
"f": [4],
|
||||
"g": [5],
|
||||
"h": [6],
|
||||
"i": [7],
|
||||
"j": [8],
|
||||
}
|
||||
expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
|
||||
|
||||
depr_msg = (
|
||||
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
|
||||
)
|
||||
with tm.assert_produces_warning(
|
||||
(FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
usecols=usecols,
|
||||
parse_dates=parse_dates,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
|
||||
@pytest.mark.parametrize(
|
||||
"names",
|
||||
[
|
||||
list("abcde"), # Names span all columns in original data.
|
||||
list("acd"), # Names span only the selected columns.
|
||||
],
|
||||
)
|
||||
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request):
|
||||
# see gh-9755
|
||||
s = """0,1,2014-01-01,09:00,4
|
||||
0,1,2014-01-02,10:00,4"""
|
||||
parse_dates = [[1, 2]]
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0):
|
||||
mark = pytest.mark.xfail(
|
||||
reason="Length mismatch in some cases, UserWarning in other"
|
||||
)
|
||||
request.applymarker(mark)
|
||||
|
||||
cols = {
|
||||
"a": [0, 0],
|
||||
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
|
||||
}
|
||||
expected = DataFrame(cols, columns=["c_d", "a"])
|
||||
|
||||
depr_msg = (
|
||||
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
|
||||
)
|
||||
with tm.assert_produces_warning(
|
||||
(FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(
|
||||
StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,96 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import pytest

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


def test_usecols_with_unicode_strings(all_parsers):
    # see gh-13219
    data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "AAA": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "BBB": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_single_byte_unicode_strings(all_parsers):
    # see gh-13219
    data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "A": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "B": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=["A", "B"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
|
||||
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
|
||||
data = """AAA,BBB,CCC,DDD
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
_msg_validate_usecols_arg = (
|
||||
"'usecols' must either be list-like "
|
||||
"of all strings, all unicode, all "
|
||||
"integers or a callable."
|
||||
)
|
||||
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
|
||||
parser.read_csv(StringIO(data), usecols=usecols)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
|
||||
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
|
||||
data = """あああ,いい,ううう,ええええ
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
exp_data = {
|
||||
"あああ": {
|
||||
0: 0.056674972999999997,
|
||||
1: 2.6132309819999997,
|
||||
2: 3.5689350380000002,
|
||||
},
|
||||
"いい": {0: 8, 1: 2, 2: 7},
|
||||
}
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,563 @@
|
||||
"""
|
||||
Tests the usecols functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
array,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
_msg_validate_usecols_arg = (
|
||||
"'usecols' must either be list-like "
|
||||
"of all strings, all unicode, all "
|
||||
"integers or a callable."
|
||||
)
|
||||
_msg_validate_usecols_names = (
|
||||
"Usecols do not match columns, columns expected but not found: {0}"
|
||||
)
|
||||
_msg_pyarrow_requires_names = (
|
||||
"The pyarrow engine does not allow 'usecols' to be integer column "
|
||||
"positions. Pass a list of string column names instead."
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
def test_raise_on_mixed_dtype_usecols(all_parsers):
|
||||
# See gh-12678
|
||||
data = """a,b,c
|
||||
1000,2000,3000
|
||||
4000,5000,6000
|
||||
"""
|
||||
usecols = [0, "b", 2]
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
|
||||
parser.read_csv(StringIO(data), usecols=usecols)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
|
||||
def test_usecols(all_parsers, usecols, request):
|
||||
data = """\
|
||||
a,b,c
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
10,11,12"""
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
|
||||
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
|
||||
parser.read_csv(StringIO(data), usecols=usecols)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
|
||||
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_with_names(all_parsers):
|
||||
data = """\
|
||||
a,b,c
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
10,11,12"""
|
||||
parser = all_parsers
|
||||
names = ["foo", "bar"]
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
|
||||
parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
|
||||
|
||||
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
|
||||
)
|
||||
def test_usecols_relative_to_names(all_parsers, names, usecols):
|
||||
data = """\
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
10,11,12"""
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow" and not isinstance(usecols[0], int):
|
||||
# ArrowKeyError: Column 'fb' in include_columns does not exist
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)
|
||||
|
||||
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_relative_to_names2(all_parsers):
|
||||
# see gh-5766
|
||||
data = """\
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
10,11,12"""
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
|
||||
)
|
||||
|
||||
expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# regex mismatch: "Length mismatch: Expected axis has 1 elements"
|
||||
@xfail_pyarrow
|
||||
def test_usecols_name_length_conflict(all_parsers):
|
||||
data = """\
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
10,11,12"""
|
||||
parser = all_parsers
|
||||
msg = "Number of passed names did not match number of header fields in the file"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])
|
||||
|
||||
|
||||
def test_usecols_single_string(all_parsers):
|
||||
# see gh-20558
|
||||
parser = all_parsers
|
||||
data = """foo, bar, baz
|
||||
1000, 2000, 3000
|
||||
4000, 5000, 6000"""
|
||||
|
||||
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
|
||||
parser.read_csv(StringIO(data), usecols="foo")
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error in one case, AttributeError in another
|
||||
@pytest.mark.parametrize(
|
||||
"data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
|
||||
)
|
||||
def test_usecols_index_col_false(all_parsers, data):
|
||||
# see gh-9082
|
||||
parser = all_parsers
|
||||
usecols = ["a", "c", "d"]
|
||||
expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col", ["b", 0])
|
||||
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
|
||||
def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request):
|
||||
# see gh-4201: test that index_col as integer reflects usecols
|
||||
parser = all_parsers
|
||||
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
|
||||
|
||||
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
|
||||
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
|
||||
parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
|
||||
return
|
||||
|
||||
expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_index_col_conflict2(all_parsers):
|
||||
# see gh-4201: test that index_col as integer reflects usecols
|
||||
parser = all_parsers
|
||||
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
|
||||
|
||||
expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
|
||||
expected = expected.set_index(["b", "c"])
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
|
||||
def test_usecols_implicit_index_col(all_parsers):
|
||||
# see gh-2654
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=["a", "b"])
|
||||
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_index_col_middle(all_parsers):
|
||||
# GH#9098
|
||||
parser = all_parsers
|
||||
data = """a,b,c,d
|
||||
1,2,3,4
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c")
|
||||
expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_index_col_end(all_parsers):
|
||||
# GH#9098
|
||||
parser = all_parsers
|
||||
data = """a,b,c,d
|
||||
1,2,3,4
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d")
|
||||
expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_regex_sep(all_parsers):
|
||||
# see gh-2733
|
||||
parser = all_parsers
|
||||
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "the 'pyarrow' engine does not support regex separators"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
|
||||
|
||||
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_with_whitespace(all_parsers):
|
||||
parser = all_parsers
|
||||
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
|
||||
|
||||
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
parser.read_csv(
|
||||
StringIO(data), delim_whitespace=True, usecols=("a", "b")
|
||||
)
|
||||
return
|
||||
|
||||
with tm.assert_produces_warning(
|
||||
FutureWarning, match=depr_msg, check_stacklevel=False
|
||||
):
|
||||
result = parser.read_csv(
|
||||
StringIO(data), delim_whitespace=True, usecols=("a", "b")
|
||||
)
|
||||
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"usecols,expected",
|
||||
[
|
||||
# Column selection by index.
|
||||
([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
|
||||
# Column selection by name.
|
||||
(
|
||||
["0", "1"],
|
||||
DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request):
|
||||
parser = all_parsers
|
||||
data = """2,0,1
|
||||
1000,2000,3000
|
||||
4000,5000,6000"""
|
||||
|
||||
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
|
||||
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
|
||||
parser.read_csv(StringIO(data), usecols=usecols)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # mismatched shape
|
||||
def test_empty_usecols(all_parsers):
|
||||
data = "a,b,c\n1,2,3\n4,5,6"
|
||||
expected = DataFrame(columns=Index([]))
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=set())
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_np_array_usecols(all_parsers):
|
||||
# see gh-12546
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,2,3"
|
||||
usecols = np.array(["a", "b"])
|
||||
|
||||
expected = DataFrame([[1, 2]], columns=usecols)
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"usecols,expected",
|
||||
[
|
||||
(
|
||||
lambda x: x.upper() in ["AAA", "BBB", "DDD"],
|
||||
DataFrame(
|
||||
{
|
||||
"AaA": {
|
||||
0: 0.056674972999999997,
|
||||
1: 2.6132309819999997,
|
||||
2: 3.5689350380000002,
|
||||
},
|
||||
"bBb": {0: 8, 1: 2, 2: 7},
|
||||
"ddd": {0: "a", 1: "b", 2: "a"},
|
||||
}
|
||||
),
|
||||
),
|
||||
(lambda x: False, DataFrame(columns=Index([]))),
|
||||
],
|
||||
)
|
||||
def test_callable_usecols(all_parsers, usecols, expected):
|
||||
# see gh-14154
|
||||
data = """AaA,bBb,CCC,ddd
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), usecols=usecols)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file
|
||||
@skip_pyarrow
|
||||
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
|
||||
def test_incomplete_first_row(all_parsers, usecols):
|
||||
# see gh-6710
|
||||
data = "1,2\n1,2,3"
|
||||
parser = all_parsers
|
||||
names = ["a", "b", "c"]
|
||||
expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
|
||||
@pytest.mark.parametrize(
|
||||
"data,usecols,kwargs,expected",
|
||||
[
|
||||
# see gh-8985
|
||||
(
|
||||
"19,29,39\n" * 2 + "10,20,30,40",
|
||||
[0, 1, 2],
|
||||
{"header": None},
|
||||
DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
|
||||
),
|
||||
# see gh-9549
|
||||
(
|
||||
("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
|
||||
["A", "B", "C"],
|
||||
{},
|
||||
DataFrame(
|
||||
{
|
||||
"A": [1, 3, 1, 1, 1, 5],
|
||||
"B": [2, 4, 2, 2, 2, 6],
|
||||
"C": [3, 5, 4, 3, 3, 7],
|
||||
}
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
|
||||
# see gh-8985
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"usecols,kwargs,expected,msg",
|
||||
[
|
||||
(
|
||||
["a", "b", "c", "d"],
|
||||
{},
|
||||
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
|
||||
None,
|
||||
),
|
||||
(
|
||||
["a", "b", "c", "f"],
|
||||
{},
|
||||
None,
|
||||
_msg_validate_usecols_names.format(r"\['f'\]"),
|
||||
),
|
||||
(["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
|
||||
(
|
||||
["a", "b", "f", "g"],
|
||||
{},
|
||||
None,
|
||||
_msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
|
||||
),
|
||||
# see gh-14671
|
||||
(
|
||||
None,
|
||||
{"header": 0, "names": ["A", "B", "C", "D"]},
|
||||
DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
|
||||
None,
|
||||
),
|
||||
(
|
||||
["A", "B", "C", "f"],
|
||||
{"header": 0, "names": ["A", "B", "C", "D"]},
|
||||
None,
|
||||
_msg_validate_usecols_names.format(r"\['f'\]"),
|
||||
),
|
||||
(
|
||||
["A", "B", "f"],
|
||||
{"names": ["A", "B", "C", "D"]},
|
||||
None,
|
||||
_msg_validate_usecols_names.format(r"\['f'\]"),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_raises_on_usecols_names_mismatch(
|
||||
all_parsers, usecols, kwargs, expected, msg, request
|
||||
):
|
||||
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
|
||||
kwargs.update(usecols=usecols)
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and not (
|
||||
usecols is not None and expected is not None
|
||||
):
|
||||
# everything but the first case
|
||||
# ArrowKeyError: Column 'f' in include_columns does not exist in CSV file
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
if expected is None:
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
else:
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
|
||||
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
|
||||
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
|
||||
names = ["A", "B", "C", "D"]
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
if isinstance(usecols[0], int):
|
||||
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
|
||||
parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
|
||||
return
|
||||
# "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist"
|
||||
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
|
||||
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("names", [None, ["a", "b"]])
|
||||
def test_usecols_indices_out_of_bounds(all_parsers, names):
|
||||
# GH#25623 & GH 41130; enforced in 2.0
|
||||
parser = all_parsers
|
||||
data = """
|
||||
a,b
|
||||
1,2
|
||||
"""
|
||||
|
||||
err = ParserError
|
||||
msg = "Defining usecols with out-of-bounds"
|
||||
if parser.engine == "pyarrow":
|
||||
err = ValueError
|
||||
msg = _msg_pyarrow_requires_names
|
||||
|
||||
with pytest.raises(err, match=msg):
|
||||
parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
|
||||
|
||||
|
||||
def test_usecols_additional_columns(all_parsers):
|
||||
# GH#46997
|
||||
parser = all_parsers
|
||||
usecols = lambda header: header.strip() in ["a", "b", "c"]
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
|
||||
return
|
||||
result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
|
||||
expected = DataFrame({"a": ["x"], "b": "y"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_additional_columns_integer_columns(all_parsers):
|
||||
# GH#46997
|
||||
parser = all_parsers
|
||||
usecols = lambda header: header.strip() in ["0", "1"]
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
|
||||
return
|
||||
result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
|
||||
expected = DataFrame({"0": ["x"], "1": "y"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_dtype(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """
|
||||
col1,col2,col3
|
||||
a,1,x
|
||||
b,2,y
|
||||
"""
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
usecols=["col1", "col2"],
|
||||
dtype={"col1": "string", "col2": "uint8", "col3": "string"},
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,50 @@
from collections.abc import Generator
from contextlib import contextmanager
import pathlib
import tempfile

import pytest

from pandas.io.pytables import HDFStore

tables = pytest.importorskip("tables")
# set these parameters so we don't have file sharing
tables.parameters.MAX_NUMEXPR_THREADS = 1
tables.parameters.MAX_BLOSC_THREADS = 1
tables.parameters.MAX_THREADS = 1


def safe_close(store):
    try:
        if store is not None:
            store.close()
    except OSError:
        pass


# contextmanager to ensure the file cleanup
@contextmanager
def ensure_clean_store(
    path, mode="a", complevel=None, complib=None, fletcher32=False
) -> Generator[HDFStore, None, None]:
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = pathlib.Path(tmpdirname, path)
        with HDFStore(
            tmp_path,
            mode=mode,
            complevel=complevel,
            complib=complib,
            fletcher32=fletcher32,
        ) as store:
            yield store


def _maybe_remove(store, key):
    """
    For tests using tables, try removing the table to be sure there is
    no content from previous tests using the same table name.
    """
    try:
        store.remove(key)
    except (ValueError, KeyError):
        pass
@ -0,0 +1,9 @@
import uuid

import pytest


@pytest.fixture
def setup_path():
    """Fixture for setup path"""
    return f"tmp.__{uuid.uuid4()}__.h5"
1015
lib/python3.11/site-packages/pandas/tests/io/pytables/test_append.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,214 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
_testing as tm,
|
||||
concat,
|
||||
read_hdf,
|
||||
)
|
||||
from pandas.tests.io.pytables.common import (
|
||||
_maybe_remove,
|
||||
ensure_clean_store,
|
||||
)
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_categorical(setup_path):
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
# Basic
|
||||
_maybe_remove(store, "s")
|
||||
s = Series(
|
||||
Categorical(
|
||||
["a", "b", "b", "a", "a", "c"],
|
||||
categories=["a", "b", "c", "d"],
|
||||
ordered=False,
|
||||
)
|
||||
)
|
||||
store.append("s", s, format="table")
|
||||
result = store.select("s")
|
||||
tm.assert_series_equal(s, result)
|
||||
|
||||
_maybe_remove(store, "s_ordered")
|
||||
s = Series(
|
||||
Categorical(
|
||||
["a", "b", "b", "a", "a", "c"],
|
||||
categories=["a", "b", "c", "d"],
|
||||
ordered=True,
|
||||
)
|
||||
)
|
||||
store.append("s_ordered", s, format="table")
|
||||
result = store.select("s_ordered")
|
||||
tm.assert_series_equal(s, result)
|
||||
|
||||
_maybe_remove(store, "df")
|
||||
df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
|
||||
store.append("df", df, format="table")
|
||||
result = store.select("df")
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
# Dtypes
|
||||
_maybe_remove(store, "si")
|
||||
s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
|
||||
store.append("si", s)
|
||||
result = store.select("si")
|
||||
tm.assert_series_equal(result, s)
|
||||
|
||||
_maybe_remove(store, "si2")
|
||||
s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
|
||||
store.append("si2", s)
|
||||
result = store.select("si2")
|
||||
tm.assert_series_equal(result, s)
|
||||
|
||||
# Multiple
|
||||
_maybe_remove(store, "df2")
|
||||
df2 = df.copy()
|
||||
df2["s2"] = Series(list("abcdefg")).astype("category")
|
||||
store.append("df2", df2)
|
||||
result = store.select("df2")
|
||||
tm.assert_frame_equal(result, df2)
|
||||
|
||||
# Make sure the metadata is OK
|
||||
info = store.info()
|
||||
assert "/df2 " in info
|
||||
# df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
|
||||
assert "/df2/meta/values_block_0/meta" in info
|
||||
assert "/df2/meta/values_block_2/meta" in info
|
||||
|
||||
# unordered
|
||||
_maybe_remove(store, "s2")
|
||||
s = Series(
|
||||
Categorical(
|
||||
["a", "b", "b", "a", "a", "c"],
|
||||
categories=["a", "b", "c", "d"],
|
||||
ordered=False,
|
||||
)
|
||||
)
|
||||
store.append("s2", s, format="table")
|
||||
result = store.select("s2")
|
||||
tm.assert_series_equal(result, s)
|
||||
|
||||
# Query
|
||||
_maybe_remove(store, "df3")
|
||||
store.append("df3", df, data_columns=["s"])
|
||||
expected = df[df.s.isin(["b", "c"])]
|
||||
result = store.select("df3", where=['s in ["b","c"]'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = df[df.s.isin(["b", "c"])]
|
||||
result = store.select("df3", where=['s = ["b","c"]'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = df[df.s.isin(["d"])]
|
||||
result = store.select("df3", where=['s in ["d"]'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = df[df.s.isin(["f"])]
|
||||
result = store.select("df3", where=['s in ["f"]'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Appending with same categories is ok
|
||||
store.append("df3", df)
|
||||
|
||||
df = concat([df, df])
|
||||
expected = df[df.s.isin(["b", "c"])]
|
||||
result = store.select("df3", where=['s in ["b","c"]'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Appending must have the same categories
|
||||
df3 = df.copy()
|
||||
df3["s"] = df3["s"].cat.remove_unused_categories()
|
||||
|
||||
msg = "cannot append a categorical with different categories to the existing"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
store.append("df3", df3)
|
||||
|
||||
# Remove, and make sure meta data is removed (its a recursive
|
||||
# removal so should be).
|
||||
result = store.select("df3/meta/s/meta")
|
||||
assert result is not None
|
||||
store.remove("df3")
|
||||
|
||||
with pytest.raises(
|
||||
KeyError, match="'No object named df3/meta/s/meta in the file'"
|
||||
):
|
||||
store.select("df3/meta/s/meta")
|
||||
|
||||
|
||||
def test_categorical_conversion(tmp_path, setup_path):
|
||||
# GH13322
|
||||
# Check that read_hdf with categorical columns doesn't return rows if
|
||||
# where criteria isn't met.
|
||||
obsids = ["ESP_012345_6789", "ESP_987654_3210"]
|
||||
imgids = ["APF00006np", "APF0001imm"]
|
||||
data = [4.3, 9.8]
|
||||
|
||||
# Test without categories
|
||||
df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})
|
||||
|
||||
# We are expecting an empty DataFrame matching types of df
|
||||
expected = df.iloc[[], :]
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", format="table", data_columns=True)
|
||||
result = read_hdf(path, "df", where="obsids=B")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Test with categories
|
||||
df.obsids = df.obsids.astype("category")
|
||||
df.imgids = df.imgids.astype("category")
|
||||
|
||||
# We are expecting an empty DataFrame matching types of df
|
||||
expected = df.iloc[[], :]
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", format="table", data_columns=True)
|
||||
result = read_hdf(path, "df", where="obsids=B")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_nan_only_columns(tmp_path, setup_path):
|
||||
# GH18413
|
||||
# Check that read_hdf with categorical columns with NaN-only values can
|
||||
# be read back.
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": ["a", "b", "c", np.nan],
|
||||
"b": [np.nan, np.nan, np.nan, np.nan],
|
||||
"c": [1, 2, 3, 4],
|
||||
"d": Series([None] * 4, dtype=object),
|
||||
}
|
||||
)
|
||||
df["a"] = df.a.astype("category")
|
||||
df["b"] = df.b.astype("category")
|
||||
df["d"] = df.b.astype("category")
|
||||
expected = df
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", format="table", data_columns=True)
|
||||
result = read_hdf(path, "df")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"where, df, expected",
|
||||
[
|
||||
('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
|
||||
('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
|
||||
],
|
||||
)
|
||||
def test_convert_value(
|
||||
tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame
|
||||
):
|
||||
# GH39420
|
||||
# Check that read_hdf with categorical columns can filter by where condition.
|
||||
df.col = df.col.astype("category")
|
||||
max_widths = {"col": 1}
|
||||
categorical_values = sorted(df.col.unique())
|
||||
expected.col = expected.col.astype("category")
|
||||
expected.col = expected.col.cat.set_categories(categorical_values)
|
||||
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", format="table", min_itemsize=max_widths)
|
||||
result = read_hdf(path, where=where)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,75 @@
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
tables = pytest.importorskip("tables")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def pytables_hdf5_file(tmp_path):
|
||||
"""
|
||||
Use PyTables to create a simple HDF5 file.
|
||||
"""
|
||||
table_schema = {
|
||||
"c0": tables.Time64Col(pos=0),
|
||||
"c1": tables.StringCol(5, pos=1),
|
||||
"c2": tables.Int64Col(pos=2),
|
||||
}
|
||||
|
||||
t0 = 1_561_105_000.0
|
||||
|
||||
testsamples = [
|
||||
{"c0": t0, "c1": "aaaaa", "c2": 1},
|
||||
{"c0": t0 + 1, "c1": "bbbbb", "c2": 2},
|
||||
{"c0": t0 + 2, "c1": "ccccc", "c2": 10**5},
|
||||
{"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295},
|
||||
]
|
||||
|
||||
objname = "pandas_test_timeseries"
|
||||
|
||||
path = tmp_path / "written_with_pytables.h5"
|
||||
with tables.open_file(path, mode="w") as f:
|
||||
t = f.create_table("/", name=objname, description=table_schema)
|
||||
for sample in testsamples:
|
||||
for key, value in sample.items():
|
||||
t.row[key] = value
|
||||
t.row.append()
|
||||
|
||||
yield path, objname, pd.DataFrame(testsamples)
|
||||
|
||||
|
||||
class TestReadPyTablesHDF5:
|
||||
"""
|
||||
A group of tests which covers reading HDF5 files written by plain PyTables
|
||||
(not written by pandas).
|
||||
|
||||
Was introduced for regression-testing issue 11188.
|
||||
"""
|
||||
|
||||
def test_read_complete(self, pytables_hdf5_file):
|
||||
path, objname, df = pytables_hdf5_file
|
||||
result = pd.read_hdf(path, key=objname)
|
||||
expected = df
|
||||
tm.assert_frame_equal(result, expected, check_index_type=True)
|
||||
|
||||
def test_read_with_start(self, pytables_hdf5_file):
|
||||
path, objname, df = pytables_hdf5_file
|
||||
# This is a regression test for pandas-dev/pandas/issues/11188
|
||||
result = pd.read_hdf(path, key=objname, start=1)
|
||||
expected = df[1:].reset_index(drop=True)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=True)
|
||||
|
||||
def test_read_with_stop(self, pytables_hdf5_file):
|
||||
path, objname, df = pytables_hdf5_file
|
||||
# This is a regression test for pandas-dev/pandas/issues/11188
|
||||
result = pd.read_hdf(path, key=objname, stop=1)
|
||||
expected = df[:1].reset_index(drop=True)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=True)
|
||||
|
||||
def test_read_with_startstop(self, pytables_hdf5_file):
|
||||
path, objname, df = pytables_hdf5_file
|
||||
# This is a regression test for pandas-dev/pandas/issues/11188
|
||||
result = pd.read_hdf(path, key=objname, start=1, stop=2)
|
||||
expected = df[1:2].reset_index(drop=True)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=True)
|
||||
@ -0,0 +1,195 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.io.pytables.common import ensure_clean_store
|
||||
|
||||
from pandas.io.pytables import read_hdf
|
||||
|
||||
|
||||
def test_complex_fixed(tmp_path, setup_path):
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((4, 5)).astype(np.complex64),
|
||||
index=list("abcd"),
|
||||
columns=list("ABCDE"),
|
||||
)
|
||||
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df")
|
||||
reread = read_hdf(path, "df")
|
||||
tm.assert_frame_equal(df, reread)
|
||||
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((4, 5)).astype(np.complex128),
|
||||
index=list("abcd"),
|
||||
columns=list("ABCDE"),
|
||||
)
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df")
|
||||
reread = read_hdf(path, "df")
|
||||
tm.assert_frame_equal(df, reread)
|
||||
|
||||
|
||||
def test_complex_table(tmp_path, setup_path):
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((4, 5)).astype(np.complex64),
|
||||
index=list("abcd"),
|
||||
columns=list("ABCDE"),
|
||||
)
|
||||
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", format="table")
|
||||
reread = read_hdf(path, key="df")
|
||||
tm.assert_frame_equal(df, reread)
|
||||
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((4, 5)).astype(np.complex128),
|
||||
index=list("abcd"),
|
||||
columns=list("ABCDE"),
|
||||
)
|
||||
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", format="table", mode="w")
|
||||
reread = read_hdf(path, "df")
|
||||
tm.assert_frame_equal(df, reread)
|
||||
|
||||
|
||||
def test_complex_mixed_fixed(tmp_path, setup_path):
|
||||
complex64 = np.array(
|
||||
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64
|
||||
)
|
||||
complex128 = np.array(
|
||||
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128
|
||||
)
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3, 4],
|
||||
"B": ["a", "b", "c", "d"],
|
||||
"C": complex64,
|
||||
"D": complex128,
|
||||
"E": [1.0, 2.0, 3.0, 4.0],
|
||||
},
|
||||
index=list("abcd"),
|
||||
)
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df")
|
||||
reread = read_hdf(path, "df")
|
||||
tm.assert_frame_equal(df, reread)
|
||||
|
||||
|
||||
def test_complex_mixed_table(tmp_path, setup_path):
|
||||
complex64 = np.array(
|
||||
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64
|
||||
)
|
||||
complex128 = np.array(
|
||||
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128
|
||||
)
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3, 4],
|
||||
"B": ["a", "b", "c", "d"],
|
||||
"C": complex64,
|
||||
"D": complex128,
|
||||
"E": [1.0, 2.0, 3.0, 4.0],
|
||||
},
|
||||
index=list("abcd"),
|
||||
)
|
||||
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
store.append("df", df, data_columns=["A", "B"])
|
||||
result = store.select("df", where="A>2")
|
||||
tm.assert_frame_equal(df.loc[df.A > 2], result)
|
||||
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", format="table")
|
||||
reread = read_hdf(path, "df")
|
||||
tm.assert_frame_equal(df, reread)
|
||||
|
||||
|
||||
def test_complex_across_dimensions_fixed(tmp_path, setup_path):
|
||||
complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
|
||||
s = Series(complex128, index=list("abcd"))
|
||||
df = DataFrame({"A": s, "B": s})
|
||||
|
||||
objs = [s, df]
|
||||
comps = [tm.assert_series_equal, tm.assert_frame_equal]
|
||||
for obj, comp in zip(objs, comps):
|
||||
path = tmp_path / setup_path
|
||||
obj.to_hdf(path, key="obj", format="fixed")
|
||||
reread = read_hdf(path, "obj")
|
||||
comp(obj, reread)
|
||||
|
||||
|
||||
def test_complex_across_dimensions(tmp_path, setup_path):
|
||||
complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
|
||||
s = Series(complex128, index=list("abcd"))
|
||||
df = DataFrame({"A": s, "B": s})
|
||||
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="obj", format="table")
|
||||
reread = read_hdf(path, "obj")
|
||||
tm.assert_frame_equal(df, reread)
|
||||
|
||||
|
||||
def test_complex_indexing_error(setup_path):
|
||||
complex128 = np.array(
|
||||
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128
|
||||
)
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128},
|
||||
index=list("abcd"),
|
||||
)
|
||||
|
||||
msg = (
|
||||
"Columns containing complex values can be stored "
|
||||
"but cannot be indexed when using table format. "
|
||||
"Either use fixed format, set index=False, "
|
||||
"or do not include the columns containing complex "
|
||||
"values to data_columns when initializing the table."
|
||||
)
|
||||
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
store.append("df", df, data_columns=["C"])
|
||||
|
||||
|
||||
def test_complex_series_error(tmp_path, setup_path):
|
||||
complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
|
||||
s = Series(complex128, index=list("abcd"))
|
||||
|
||||
msg = (
|
||||
"Columns containing complex values can be stored "
|
||||
"but cannot be indexed when using table format. "
|
||||
"Either use fixed format, set index=False, "
|
||||
"or do not include the columns containing complex "
|
||||
"values to data_columns when initializing the table."
|
||||
)
|
||||
|
||||
path = tmp_path / setup_path
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
s.to_hdf(path, key="obj", format="t")
|
||||
|
||||
path = tmp_path / setup_path
|
||||
s.to_hdf(path, key="obj", format="t", index=False)
|
||||
reread = read_hdf(path, "obj")
|
||||
tm.assert_series_equal(s, reread)
|
||||
|
||||
|
||||
def test_complex_append(setup_path):
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": np.random.default_rng(2).standard_normal(100).astype(np.complex128),
|
||||
"b": np.random.default_rng(2).standard_normal(100),
|
||||
}
|
||||
)
|
||||
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
store.append("df", df, data_columns=["b"])
|
||||
store.append("df", df)
|
||||
result = store.select("df")
|
||||
tm.assert_frame_equal(pd.concat([df, df], axis=0), result)
|
||||
@ -0,0 +1,256 @@
|
||||
import datetime
|
||||
from io import BytesIO
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
HDFStore,
|
||||
Index,
|
||||
MultiIndex,
|
||||
_testing as tm,
|
||||
date_range,
|
||||
read_hdf,
|
||||
)
|
||||
from pandas.tests.io.pytables.common import ensure_clean_store
|
||||
|
||||
from pandas.io.pytables import (
|
||||
Term,
|
||||
_maybe_adjust_name,
|
||||
)
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_pass_spec_to_storer(setup_path):
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
store.put("df", df)
|
||||
msg = (
|
||||
"cannot pass a column specification when reading a Fixed format "
|
||||
"store. this store must be selected in its entirety"
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
store.select("df", columns=["A"])
|
||||
msg = (
|
||||
"cannot pass a where specification when reading from a Fixed "
|
||||
"format store. this store must be selected in its entirety"
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
store.select("df", where=[("columns=A")])
|
||||
|
||||
|
||||
def test_table_index_incompatible_dtypes(setup_path):
|
||||
df1 = DataFrame({"a": [1, 2, 3]})
|
||||
df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3))
|
||||
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
store.put("frame", df1, format="table")
|
||||
msg = re.escape("incompatible kind in col [integer - datetime64[ns]]")
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
store.put("frame", df2, format="table", append=True)
|
||||
|
||||
|
||||
def test_unimplemented_dtypes_table_columns(setup_path):
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
dtypes = [("date", datetime.date(2001, 1, 2))]
|
||||
|
||||
# currently not supported dtypes ####
|
||||
for n, f in dtypes:
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
df[n] = f
|
||||
msg = re.escape(f"[{n}] is not implemented as a table column")
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
store.append(f"df1_{n}", df)
|
||||
|
||||
# frame
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
df["obj1"] = "foo"
|
||||
df["obj2"] = "bar"
|
||||
df["datetime1"] = datetime.date(2001, 1, 2)
|
||||
df = df._consolidate()
|
||||
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
# this fails because we have a date in the object block......
|
||||
msg = "|".join(
|
||||
[
|
||||
re.escape(
|
||||
"Cannot serialize the column [datetime1]\nbecause its data "
|
||||
"contents are not [string] but [date] object dtype"
|
||||
),
|
||||
re.escape("[date] is not implemented as a table column"),
|
||||
]
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
store.append("df_unimplemented", df)
|
||||
|
||||
|
||||
def test_invalid_terms(tmp_path, setup_path):
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=date_range("2000-01-01", periods=10, freq="B"),
|
||||
)
|
||||
df["string"] = "foo"
|
||||
df.loc[df.index[0:4], "string"] = "bar"
|
||||
|
||||
store.put("df", df, format="table")
|
||||
|
||||
# some invalid terms
|
||||
msg = re.escape("__init__() missing 1 required positional argument: 'where'")
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
Term()
|
||||
|
||||
# more invalid
|
||||
msg = re.escape(
|
||||
"cannot process expression [df.index[3]], "
|
||||
"[2000-01-06 00:00:00] is not a valid condition"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
store.select("df", "df.index[3]")
|
||||
|
||||
msg = "invalid syntax"
|
||||
with pytest.raises(SyntaxError, match=msg):
|
||||
store.select("df", "index>")
|
||||
|
||||
# from the docs
|
||||
path = tmp_path / setup_path
|
||||
dfq = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=list("ABCD"),
|
||||
index=date_range("20130101", periods=10),
|
||||
)
|
||||
dfq.to_hdf(path, key="dfq", format="table", data_columns=True)
|
||||
|
||||
# check ok
|
||||
read_hdf(path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']")
|
||||
read_hdf(path, "dfq", where="A>0 or C>0")
|
||||
|
||||
# catch the invalid reference
|
||||
path = tmp_path / setup_path
|
||||
dfq = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=list("ABCD"),
|
||||
index=date_range("20130101", periods=10),
|
||||
)
|
||||
dfq.to_hdf(path, key="dfq", format="table")
|
||||
|
||||
msg = (
|
||||
r"The passed where expression: A>0 or C>0\n\s*"
|
||||
r"contains an invalid variable reference\n\s*"
|
||||
r"all of the variable references must be a reference to\n\s*"
|
||||
r"an axis \(e.g. 'index' or 'columns'\), or a data_column\n\s*"
|
||||
r"The currently defined references are: index,columns\n"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
read_hdf(path, "dfq", where="A>0 or C>0")
|
||||
|
||||
|
||||
def test_append_with_diff_col_name_types_raises_value_error(setup_path):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((10, 1)))
|
||||
df2 = DataFrame({"a": np.random.default_rng(2).standard_normal(10)})
|
||||
df3 = DataFrame({(1, 2): np.random.default_rng(2).standard_normal(10)})
|
||||
df4 = DataFrame({("1", 2): np.random.default_rng(2).standard_normal(10)})
|
||||
df5 = DataFrame({("1", 2, object): np.random.default_rng(2).standard_normal(10)})
|
||||
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
name = "df_diff_valerror"
|
||||
store.append(name, df)
|
||||
|
||||
for d in (df2, df3, df4, df5):
|
||||
msg = re.escape(
|
||||
"cannot match existing table structure for [0] on appending data"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
store.append(name, d)
|
||||
|
||||
|
||||
def test_invalid_complib(setup_path):
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((4, 5)),
|
||||
index=list("abcd"),
|
||||
columns=list("ABCDE"),
|
||||
)
|
||||
with tm.ensure_clean(setup_path) as path:
|
||||
msg = r"complib only supports \[.*\] compression."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_hdf(path, key="df", complib="foolib")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"idx",
|
||||
[
|
||||
date_range("2019", freq="D", periods=3, tz="UTC"),
|
||||
CategoricalIndex(list("abc")),
|
||||
],
|
||||
)
|
||||
def test_to_hdf_multiindex_extension_dtype(idx, tmp_path, setup_path):
|
||||
# GH 7775
|
||||
mi = MultiIndex.from_arrays([idx, idx])
|
||||
df = DataFrame(0, index=mi, columns=["a"])
|
||||
path = tmp_path / setup_path
|
||||
with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
|
||||
df.to_hdf(path, key="df")
|
||||
|
||||
|
||||
def test_unsuppored_hdf_file_error(datapath):
|
||||
# GH 9539
|
||||
data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5")
|
||||
message = (
|
||||
r"Dataset\(s\) incompatible with Pandas data types, "
|
||||
"not table, or no datasets found in HDF5 file."
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=message):
|
||||
read_hdf(data_path)
|
||||
|
||||
|
||||
def test_read_hdf_errors(setup_path, tmp_path):
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).random((4, 5)),
|
||||
index=list("abcd"),
|
||||
columns=list("ABCDE"),
|
||||
)
|
||||
|
||||
path = tmp_path / setup_path
|
||||
msg = r"File [\S]* does not exist"
|
||||
with pytest.raises(OSError, match=msg):
|
||||
read_hdf(path, "key")
|
||||
|
||||
df.to_hdf(path, key="df")
|
||||
store = HDFStore(path, mode="r")
|
||||
store.close()
|
||||
|
||||
msg = "The HDFStore must be open for reading."
|
||||
with pytest.raises(OSError, match=msg):
|
||||
read_hdf(store, "df")
|
||||
|
||||
|
||||
def test_read_hdf_generic_buffer_errors():
|
||||
msg = "Support for generic buffers has not been implemented."
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
read_hdf(BytesIO(b""), "df")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"])
|
||||
def test_maybe_adjust_name_bad_version_raises(bad_version):
|
||||
msg = "Version is incorrect, expected sequence of 3 integers"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
_maybe_adjust_name("values_block_0", version=bad_version)
|
||||
@ -0,0 +1,517 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import (
|
||||
PY311,
|
||||
is_ci_environment,
|
||||
is_platform_linux,
|
||||
is_platform_little_endian,
|
||||
)
|
||||
from pandas.errors import (
|
||||
ClosedFileError,
|
||||
PossibleDataLossError,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
HDFStore,
|
||||
Index,
|
||||
Series,
|
||||
_testing as tm,
|
||||
date_range,
|
||||
read_hdf,
|
||||
)
|
||||
from pandas.tests.io.pytables.common import (
|
||||
_maybe_remove,
|
||||
ensure_clean_store,
|
||||
tables,
|
||||
)
|
||||
|
||||
from pandas.io import pytables
|
||||
from pandas.io.pytables import Term
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
|
||||
def test_mode(setup_path, tmp_path, mode, using_infer_string):
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=date_range("2000-01-01", periods=10, freq="B"),
|
||||
)
|
||||
msg = r"[\S]* does not exist"
|
||||
path = tmp_path / setup_path
|
||||
|
||||
# constructor
|
||||
if mode in ["r", "r+"]:
|
||||
with pytest.raises(OSError, match=msg):
|
||||
HDFStore(path, mode=mode)
|
||||
|
||||
else:
|
||||
with HDFStore(path, mode=mode) as store:
|
||||
assert store._handle.mode == mode
|
||||
|
||||
path = tmp_path / setup_path
|
||||
|
||||
# context
|
||||
if mode in ["r", "r+"]:
|
||||
with pytest.raises(OSError, match=msg):
|
||||
with HDFStore(path, mode=mode) as store:
|
||||
pass
|
||||
else:
|
||||
with HDFStore(path, mode=mode) as store:
|
||||
assert store._handle.mode == mode
|
||||
|
||||
path = tmp_path / setup_path
|
||||
|
||||
# conv write
|
||||
if mode in ["r", "r+"]:
|
||||
with pytest.raises(OSError, match=msg):
|
||||
df.to_hdf(path, key="df", mode=mode)
|
||||
df.to_hdf(path, key="df", mode="w")
|
||||
else:
|
||||
df.to_hdf(path, key="df", mode=mode)
|
||||
|
||||
# conv read
|
||||
if mode in ["w"]:
|
||||
msg = (
|
||||
"mode w is not allowed while performing a read. "
|
||||
r"Allowed modes are r, r\+ and a."
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
read_hdf(path, "df", mode=mode)
|
||||
else:
|
||||
result = read_hdf(path, "df", mode=mode)
|
||||
if using_infer_string:
|
||||
df.columns = df.columns.astype("str")
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
def test_default_mode(tmp_path, setup_path, using_infer_string):
|
||||
# read_hdf uses default mode
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=date_range("2000-01-01", periods=10, freq="B"),
|
||||
)
|
||||
path = tmp_path / setup_path
|
||||
df.to_hdf(path, key="df", mode="w")
|
||||
result = read_hdf(path, "df")
|
||||
expected = df.copy()
|
||||
if using_infer_string:
|
||||
expected.columns = expected.columns.astype("str")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_reopen_handle(tmp_path, setup_path):
|
||||
path = tmp_path / setup_path
|
||||
|
||||
store = HDFStore(path, mode="a")
|
||||
store["a"] = Series(
|
||||
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
||||
)
|
||||
|
||||
msg = (
|
||||
r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the "
|
||||
"current file!"
|
||||
)
|
||||
# invalid mode change
|
||||
with pytest.raises(PossibleDataLossError, match=msg):
|
||||
store.open("w")
|
||||
|
||||
store.close()
|
||||
assert not store.is_open
|
||||
|
||||
# truncation ok here
|
||||
store.open("w")
|
||||
assert store.is_open
|
||||
assert len(store) == 0
|
||||
store.close()
|
||||
assert not store.is_open
|
||||
|
||||
store = HDFStore(path, mode="a")
|
||||
store["a"] = Series(
|
||||
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
||||
)
|
||||
|
||||
# reopen as read
|
||||
store.open("r")
|
||||
assert store.is_open
|
||||
assert len(store) == 1
|
||||
assert store._mode == "r"
|
||||
store.close()
|
||||
assert not store.is_open
|
||||
|
||||
# reopen as append
|
||||
store.open("a")
|
||||
assert store.is_open
|
||||
assert len(store) == 1
|
||||
assert store._mode == "a"
|
||||
store.close()
|
||||
assert not store.is_open
|
||||
|
||||
# reopen as append (again)
|
||||
store.open("a")
|
||||
assert store.is_open
|
||||
assert len(store) == 1
|
||||
assert store._mode == "a"
|
||||
store.close()
|
||||
assert not store.is_open
|
||||
|
||||
|
||||
def test_open_args(setup_path, using_infer_string):
|
||||
with tm.ensure_clean(setup_path) as path:
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
|
||||
# create an in memory store
|
||||
store = HDFStore(
|
||||
path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0
|
||||
)
|
||||
store["df"] = df
|
||||
store.append("df2", df)
|
||||
|
||||
expected = df.copy()
|
||||
if using_infer_string:
|
||||
expected.index = expected.index.astype("str")
|
||||
expected.columns = expected.columns.astype("str")
|
||||
|
||||
tm.assert_frame_equal(store["df"], expected)
|
||||
tm.assert_frame_equal(store["df2"], expected)
|
||||
|
||||
store.close()
|
||||
|
||||
# the file should not have actually been written
|
||||
assert not os.path.exists(path)
|
||||
|
||||
|
||||
def test_flush(setup_path):
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
store["a"] = Series(range(5))
|
||||
store.flush()
|
||||
store.flush(fsync=True)
|
||||
|
||||
|
||||
def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
|
||||
# GH15943
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
|
||||
# Set complevel and check if complib is automatically set to
|
||||
# default value
|
||||
tmpfile = tmp_path / setup_path
|
||||
df.to_hdf(tmpfile, key="df", complevel=9)
|
||||
result = read_hdf(tmpfile, "df")
|
||||
expected = df.copy()
|
||||
if using_infer_string:
|
||||
expected.index = expected.index.astype("str")
|
||||
expected.columns = expected.columns.astype("str")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with tables.open_file(tmpfile, mode="r") as h5file:
|
||||
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
|
||||
assert node.filters.complevel == 9
|
||||
assert node.filters.complib == "zlib"
|
||||
|
||||
# Set complib and check to see if compression is disabled
|
||||
tmpfile = tmp_path / setup_path
|
||||
df.to_hdf(tmpfile, key="df", complib="zlib")
|
||||
result = read_hdf(tmpfile, "df")
|
||||
expected = df.copy()
|
||||
if using_infer_string:
|
||||
expected.index = expected.index.astype("str")
|
||||
expected.columns = expected.columns.astype("str")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with tables.open_file(tmpfile, mode="r") as h5file:
|
||||
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
|
||||
assert node.filters.complevel == 0
|
||||
assert node.filters.complib is None
|
||||
|
||||
# Check if not setting complib or complevel results in no compression
|
||||
tmpfile = tmp_path / setup_path
|
||||
df.to_hdf(tmpfile, key="df")
|
||||
result = read_hdf(tmpfile, "df")
|
||||
expected = df.copy()
|
||||
if using_infer_string:
|
||||
expected.index = expected.index.astype("str")
|
||||
expected.columns = expected.columns.astype("str")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with tables.open_file(tmpfile, mode="r") as h5file:
|
||||
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
|
||||
assert node.filters.complevel == 0
|
||||
assert node.filters.complib is None
|
||||
|
||||
|
||||
def test_complibs_default_settings_override(tmp_path, setup_path):
|
||||
# Check if file-defaults can be overridden on a per table basis
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
tmpfile = tmp_path / setup_path
|
||||
store = HDFStore(tmpfile)
|
||||
store.append("dfc", df, complevel=9, complib="blosc")
|
||||
store.append("df", df)
|
||||
store.close()
|
||||
|
||||
with tables.open_file(tmpfile, mode="r") as h5file:
|
||||
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
|
||||
assert node.filters.complevel == 0
|
||||
assert node.filters.complib is None
|
||||
for node in h5file.walk_nodes(where="/dfc", classname="Leaf"):
|
||||
assert node.filters.complevel == 9
|
||||
assert node.filters.complib == "blosc"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lvl", range(10))
|
||||
@pytest.mark.parametrize("lib", tables.filters.all_complibs)
|
||||
@pytest.mark.filterwarnings("ignore:object name is not a valid")
|
||||
@pytest.mark.skipif(
|
||||
not PY311 and is_ci_environment() and is_platform_linux(),
|
||||
reason="Segfaulting in a CI environment"
|
||||
# with xfail, would sometimes raise UnicodeDecodeError
|
||||
# invalid state byte
|
||||
)
|
||||
def test_complibs(tmp_path, lvl, lib, request):
|
||||
# GH14478
|
||||
if PY311 and is_platform_linux() and lib == "blosc2" and lvl != 0:
|
||||
request.applymarker(
|
||||
pytest.mark.xfail(reason=f"Fails for {lib} on Linux and PY > 3.11")
|
||||
)
|
||||
df = DataFrame(
|
||||
np.ones((30, 4)), columns=list("ABCD"), index=np.arange(30).astype(np.str_)
|
||||
)
|
||||
|
||||
# Remove lzo if its not available on this platform
|
||||
if not tables.which_lib_version("lzo"):
|
||||
pytest.skip("lzo not available")
|
||||
# Remove bzip2 if its not available on this platform
|
||||
if not tables.which_lib_version("bzip2"):
|
||||
pytest.skip("bzip2 not available")
|
||||
|
||||
tmpfile = tmp_path / f"{lvl}_{lib}.h5"
|
||||
gname = f"{lvl}_{lib}"
|
||||
|
||||
# Write and read file to see if data is consistent
|
||||
df.to_hdf(tmpfile, key=gname, complib=lib, complevel=lvl)
|
||||
result = read_hdf(tmpfile, gname)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
# Open file and check metadata for correct amount of compression
|
||||
with tables.open_file(tmpfile, mode="r") as h5table:
|
||||
for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"):
|
||||
assert node.filters.complevel == lvl
|
||||
if lvl == 0:
|
||||
assert node.filters.complib is None
|
||||
else:
|
||||
assert node.filters.complib == lib
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_platform_little_endian(), reason="reason platform is not little endian"
|
||||
)
|
||||
def test_encoding(setup_path):
|
||||
with ensure_clean_store(setup_path) as store:
|
||||
df = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
|
||||
df.loc[2, "A"] = np.nan
|
||||
df.loc[3, "B"] = np.nan
|
||||
_maybe_remove(store, "df")
|
||||
store.append("df", df, encoding="ascii")
|
||||
tm.assert_frame_equal(store["df"], df)
|
||||
|
||||
expected = df.reindex(columns=["A"])
|
||||
result = store.select("df", Term("columns=A", encoding="ascii"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"val",
|
||||
[
|
||||
[b"E\xc9, 17", b"", b"a", b"b", b"c"],
|
||||
[b"E\xc9, 17", b"a", b"b", b"c"],
|
||||
[b"EE, 17", b"", b"a", b"b", b"c"],
|
||||
[b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
|
||||
[b"", b"a", b"b", b"c"],
|
||||
[b"\xf8\xfc", b"a", b"b", b"c"],
|
||||
[b"A\xf8\xfc", b"", b"a", b"b", b"c"],
|
||||
[np.nan, b"", b"b", b"c"],
|
||||
[b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["category", None])
|
||||
def test_latin_encoding(tmp_path, setup_path, dtype, val):
|
||||
enc = "latin-1"
|
||||
nan_rep = ""
|
||||
key = "data"
|
||||
|
||||
val = [x.decode(enc) if isinstance(x, bytes) else x for x in val]
|
||||
ser = Series(val, dtype=dtype)
|
||||
|
||||
store = tmp_path / setup_path
|
||||
ser.to_hdf(store, key=key, format="table", encoding=enc, nan_rep=nan_rep)
|
||||
retr = read_hdf(store, key)
|
||||
|
||||
# TODO:(3.0): once Categorical replace deprecation is enforced,
|
||||
# we may be able to re-simplify the construction of s_nan
|
||||
if dtype == "category":
|
||||
if nan_rep in ser.cat.categories:
|
||||
s_nan = ser.cat.remove_categories([nan_rep])
|
||||
else:
|
||||
s_nan = ser
|
||||
else:
|
||||
s_nan = ser.replace(nan_rep, np.nan)
|
||||
|
||||
tm.assert_series_equal(s_nan, retr)
|
||||
|
||||
|
||||
def test_multiple_open_close(tmp_path, setup_path):
|
||||
# gh-4409: open & close multiple times
|
||||
|
||||
path = tmp_path / setup_path
|
||||
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
df.to_hdf(path, key="df", mode="w", format="table")
|
||||
|
||||
# single
|
||||
store = HDFStore(path)
|
||||
assert "CLOSED" not in store.info()
|
||||
assert store.is_open
|
||||
|
||||
store.close()
|
||||
assert "CLOSED" in store.info()
|
||||
assert not store.is_open
|
||||
|
||||
path = tmp_path / setup_path
|
||||
|
||||
if pytables._table_file_open_policy_is_strict:
|
||||
# multiples
|
||||
store1 = HDFStore(path)
|
||||
msg = (
|
||||
r"The file [\S]* is already opened\. Please close it before "
|
||||
r"reopening in write mode\."
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
HDFStore(path)
|
||||
|
||||
store1.close()
|
||||
else:
|
||||
# multiples
|
||||
store1 = HDFStore(path)
|
||||
store2 = HDFStore(path)
|
||||
|
||||
assert "CLOSED" not in store1.info()
|
||||
assert "CLOSED" not in store2.info()
|
||||
assert store1.is_open
|
||||
assert store2.is_open
|
||||
|
||||
store1.close()
|
||||
assert "CLOSED" in store1.info()
|
||||
assert not store1.is_open
|
||||
assert "CLOSED" not in store2.info()
|
||||
assert store2.is_open
|
||||
|
||||
store2.close()
|
||||
assert "CLOSED" in store1.info()
|
||||
assert "CLOSED" in store2.info()
|
||||
assert not store1.is_open
|
||||
assert not store2.is_open
|
||||
|
||||
# nested close
|
||||
store = HDFStore(path, mode="w")
|
||||
store.append("df", df)
|
||||
|
||||
store2 = HDFStore(path)
|
||||
store2.append("df2", df)
|
||||
store2.close()
|
||||
assert "CLOSED" in store2.info()
|
||||
assert not store2.is_open
|
||||
|
||||
store.close()
|
||||
assert "CLOSED" in store.info()
|
||||
assert not store.is_open
|
||||
|
||||
# double closing
|
||||
store = HDFStore(path, mode="w")
|
||||
store.append("df", df)
|
||||
|
||||
store2 = HDFStore(path)
|
||||
store.close()
|
||||
assert "CLOSED" in store.info()
|
||||
assert not store.is_open
|
||||
|
||||
store2.close()
|
||||
assert "CLOSED" in store2.info()
|
||||
assert not store2.is_open
|
||||
|
||||
# ops on a closed store
|
||||
path = tmp_path / setup_path
|
||||
|
||||
df = DataFrame(
|
||||
1.1 * np.arange(120).reshape((30, 4)),
|
||||
columns=Index(list("ABCD"), dtype=object),
|
||||
index=Index([f"i-{i}" for i in range(30)], dtype=object),
|
||||
)
|
||||
df.to_hdf(path, key="df", mode="w", format="table")
|
||||
|
||||
store = HDFStore(path)
|
||||
store.close()
|
||||
|
||||
msg = r"[\S]* file is not open!"
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.keys()
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
"df" in store
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
len(store)
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store["df"]
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.select("df")
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.get("df")
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.append("df2", df)
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.put("df3", df)
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.get_storer("df2")
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.remove("df2")
|
||||
|
||||
with pytest.raises(ClosedFileError, match=msg):
|
||||
store.select("df")
|
||||
|
||||
msg = "'HDFStore' object has no attribute 'df'"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
store.df
|
||||
|
||||
|
||||
def test_fspath():
|
||||
with tm.ensure_clean("foo.h5") as path:
|
||||
with HDFStore(path) as store:
|
||||
assert os.fspath(store) == str(path)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user