@@ -0,0 +1,25 @@
def get_groupby_method_args(name, obj):
    """
    Get required arguments for a groupby method.

    When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"),
    it is often the case that arguments are required for certain methods.

    Parameters
    ----------
    name: str
        Name of the method.
    obj: Series or DataFrame
        pandas object that is being grouped.

    Returns
    -------
    A tuple of required arguments for the method.
    """
    if name in ("nth", "fillna", "take"):
        return (0,)
    if name == "quantile":
        return (0.5,)
    if name == "corrwith":
        return (obj,)
    return ()
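# Example: get_groupby_method_args("quantile", df) returns (0.5,) and
# get_groupby_method_args("corrwith", df) returns (df,), so a test
# parametrized over method names can call getattr(grouped, name)(*args)
# uniformly regardless of which method is under test.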
File diff suppressed because it is too large
@@ -0,0 +1,437 @@
"""
test cython .agg behavior
"""

import numpy as np
import pytest

from pandas.core.dtypes.common import (
    is_float_dtype,
    is_integer_dtype,
)

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    NaT,
    Series,
    Timedelta,
    Timestamp,
    bdate_range,
)
import pandas._testing as tm
import pandas.core.common as com


@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    data = {
        "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
        "B": ["A", "B"] * 6,
        "C": np.random.default_rng(2).standard_normal(12),
    }
    df = DataFrame(data)
    df.loc[2:10:2, "C"] = np.nan

    op = lambda x: getattr(x, op_name)()

    # single column
    grouped = df.drop(["B"], axis=1).groupby("A")
    exp = {cat: op(group["C"]) for cat, group in grouped}
    exp = DataFrame({"C": exp})
    exp.index.name = "A"
    result = op(grouped)
    tm.assert_frame_equal(result, exp)

    # multiple columns
    grouped = df.groupby(["A", "B"])
    expd = {}
    for (cat1, cat2), group in grouped:
        expd.setdefault(cat1, {})[cat2] = op(group["C"])
    exp = DataFrame(expd).T.stack(future_stack=True)
    exp.index.names = ["A", "B"]
    exp.name = "C"

    result = op(grouped)["C"]
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, exp)


def test_cython_agg_boolean():
    frame = DataFrame(
        {
            "a": np.random.default_rng(2).integers(0, 5, 50),
            "b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
        }
    )
    result = frame.groupby("a")["b"].mean()
    msg = "using SeriesGroupBy.mean"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        expected = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(result, expected)


def test_cython_agg_nothing_to_agg():
    frame = DataFrame(
        {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
    )

    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("a")["b"].mean(numeric_only=True)

    frame = DataFrame(
        {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
    )

    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
    expected = DataFrame(
        [],
        index=frame["a"].sort_values().drop_duplicates(),
        columns=Index([], dtype="str"),
    )
    tm.assert_frame_equal(result, expected)


def test_cython_agg_nothing_to_agg_with_dates():
    frame = DataFrame(
        {
            "a": np.random.default_rng(2).integers(0, 5, 50),
            "b": ["foo", "bar"] * 25,
            "dates": pd.date_range("now", periods=50, freq="min"),
        }
    )
    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("b").dates.mean(numeric_only=True)


def test_cython_agg_frame_columns():
    # #2113
    df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(level=0, axis="columns").mean()


def test_cython_agg_return_dict():
    # GH 16741
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )

    ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())
    expected = Series(
        [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(ts, expected)


def test_cython_fail_agg():
    dr = bdate_range("1/1/2000", periods=50)
    ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)

    grouped = ts.groupby(lambda x: x.month)
    summed = grouped.sum()
    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        expected = grouped.agg(np.sum).astype(object)
    tm.assert_series_equal(summed, expected)

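# _cython_agg_general is the private GroupBy entry point behind the cython
# fast path; the next two tests call it directly (bypassing the public API)
# and pin its output against the equivalent python-level .agg().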
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    df = DataFrame(np.random.default_rng(2).standard_normal(1000))
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
    warn = FutureWarning if targop in com._cython_table else None
    msg = f"using DataFrameGroupBy.{op}"
    with tm.assert_produces_warning(warn, match=msg):
        # GH#53425
        expected = df.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    df = DataFrame([11, 12, 13])
    grps = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    result = g._cython_agg_general(op, alt=None, numeric_only=True)

    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    expected = g.agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)

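# In the sum/prod test below only the (10, 15] bucket is populated, so the
# hardcoded expectations are 11 + 12 + 13 == 36 (empty buckets sum to 0) and
# 11 * 12 * 13 == 1716 (empty buckets have product 1).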
def test_cython_agg_empty_buckets_nanops(observed):
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = DataFrame([11, 12, 13], columns=["a"])
    grps = np.arange(0, 25, 5, dtype=int)
    # add / sum
    result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
        "sum", alt=None, numeric_only=True
    )
    intervals = pd.interval_range(0, 20, freq=5)
    expected = DataFrame(
        {"a": [0, 0, 36, 0]},
        index=pd.CategoricalIndex(intervals, name="a", ordered=True),
    )
    if observed:
        expected = expected[expected.a != 0]

    tm.assert_frame_equal(result, expected)

    # prod
    result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
        "prod", alt=None, numeric_only=True
    )
    expected = DataFrame(
        {"a": [1, 1, 1716, 1]},
        index=pd.CategoricalIndex(intervals, name="a", ordered=True),
    )
    if observed:
        expected = expected[expected.a != 1]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    # https://github.com/pandas-dev/pandas/issues/19526
    df = DataFrame({"a": [0, 1], "b": [data, NaT]})
    index = Index([0, 1], name="a")

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": [data, NaT]}, index=index)

    result = df.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    # https://github.com/pandas-dev/pandas/issues/36014
    df = DataFrame(
        {
            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
            "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
        }
    )
    df._mgr.arrays[0].flags.writeable = False

    result = df.groupby(["species"]).agg({"sepal_length": agg})
    expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})

    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    # ensure that the cython-based aggregations don't fail for nullable dtype
    # (eg https://github.com/pandas-dev/pandas/issues/37415)
    df = DataFrame(
        {
            "A": ["A", "B"] * 5,
            "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
        }
    )
    result = getattr(df.groupby("A")["B"], op_name)()
    df2 = df.assign(B=df["B"].astype("float64"))
    expected = getattr(df2.groupby("A")["B"], op_name)()
    if op_name in ("mean", "median"):
        convert_integer = False
    else:
        convert_integer = True
    expected = expected.convert_dtypes(convert_integer=convert_integer)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
    df = DataFrame(
        {
            "A": [1, 1],
            "B": pd.array([1, pd.NA], dtype=dtype),
            "C": pd.array([1, 1], dtype=dtype),
        }
    )
    result = df.groupby("A").count()
    expected = DataFrame(
        [[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64"
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    if with_na:
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
@@ -0,0 +1,402 @@
import numpy as np
import pytest

from pandas.compat import is_platform_arm
from pandas.errors import NumbaUtilError

from pandas import (
    DataFrame,
    Index,
    NamedAgg,
    Series,
    option_context,
)
import pandas._testing as tm
from pandas.util.version import Version

pytestmark = [pytest.mark.single_cpu]

numba = pytest.importorskip("numba")
pytestmark.append(
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
)

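# UDFs run with engine="numba" must accept (values, index) as their first two
# positional arguments; the tests below exercise that contract, its error
# paths, and the caching of jitted functions.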
def test_correct_function_signature():
    pytest.importorskip("numba")

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key").agg(incorrect_function, engine="numba")

    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key")["data"].agg(incorrect_function, engine="numba")


def test_check_nopython_kwargs():
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return sum(values) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key").agg(incorrect_function, engine="numba", a=1)

    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    pytest.importorskip("numba")

    def func_numba(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        import numba

        func_numba = numba.jit(func_numba)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)

    # Add func_2 to the cache
    result = grouped.agg(func_2, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
    tm.assert_equal(result, expected)

    # Retest func_1 which should use the cache
    result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)


def test_use_global_config():
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)
    expected = grouped.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        result = grouped.agg(func_1, engine=None)
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "agg_kwargs",
    [
        {"func": ["min", "max"]},
        {"func": "min"},
        {"func": {1: ["min", "max"], 2: "sum"}},
        {"bmin": NamedAgg(column=1, aggfunc="min")},
    ],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
    pytest.importorskip("numba")
    data = DataFrame(
        {
            0: ["a", "a", "b", "b", "a"],
            1: [1.0, 2.0, 3.0, 4.0, 5.0],
            2: [1, 2, 3, 4, 5],
        },
        columns=[0, 1, 2],
    )
    grouped = data.groupby(0)
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "agg_kwargs,expected_func",
    [
        ({"func": lambda values, index: values.sum()}, "sum"),
        # FIXME
        pytest.param(
            {
                "func": [
                    lambda values, index: values.sum(),
                    lambda values, index: values.min(),
                ]
            },
            ["sum", "min"],
            marks=pytest.mark.xfail(
                reason="This doesn't work yet! Fails in nopython pipeline!"
            ),
        ),
    ],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
    pytest.importorskip("numba")
    data = DataFrame(
        {
            0: ["a", "a", "b", "b", "a"],
            1: [1.0, 2.0, 3.0, 4.0, 5.0],
            2: [1, 2, 3, 4, 5],
        },
        columns=[0, 1, 2],
    )
    grouped = data.groupby(0)
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(expected_func, engine="cython")
    # check_dtype can be removed if GH 44952 is addressed
    # Currently, UDFs still always return float64 while reductions can preserve dtype
    tm.assert_frame_equal(result, expected, check_dtype=False)


@pytest.mark.parametrize(
    "agg_kwargs",
    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
    grouped = data.groupby(labels)
    agg_kwargs["engine"] = "numba"
    result = grouped.agg(**agg_kwargs)
    agg_kwargs["engine"] = "cython"
    expected = grouped.agg(**agg_kwargs)
    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)


@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "data,agg_kwargs",
    [
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": ["min", "max"]},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": "min"},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": {1: ["min", "max"], 2: "sum"}},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"min_col": NamedAgg(column=1, aggfunc="min")},
        ),
    ],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    grouped = data.groupby(labels)
    result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
    expected = grouped.agg(**agg_kwargs, engine="numba")
    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)

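# GH 41647: extra positional arguments must not be baked into the cached
# jitted function; otherwise the second call below with n=2 would wrongly
# return the n=1 result.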
def test_args_not_cached():
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]
    result = grouped_x.agg(sum_last, 1, engine="numba")
    expected = Series([1.0] * 2, name="x", index=Index([0, 1], name="id"))
    tm.assert_series_equal(result, expected)

    result = grouped_x.agg(sum_last, 2, engine="numba")
    expected = Series([2.0] * 2, name="x", index=Index([0, 1], name="id"))
    tm.assert_series_equal(result, expected)

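# GH 43133: the UDF sees each group's original index values, here [-1, -2]
# for group "A" (mean -1.5) and [-3] for group "B" (mean -3.0).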
def test_index_data_correctly_passed():
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return np.mean(index)

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").aggregate(f, engine="numba")
    expected = DataFrame(
        [-1.5, -3.0], columns=["v"], index=Index(["A", "B"], name="group")
    )
    tm.assert_frame_equal(result, expected)

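# The closure below returns nogil + parallel + nopython (True + False + True
# == 2), so a stale jitted function reused across different engine_kwargs
# would fail to drop the result from 2.0 to 1.0 when nogil flips to False.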
def test_engine_kwargs_not_cached():
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        return nogil + parallel + nopython

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    df = DataFrame({"value": [0, 0, 0]})
    result = df.groupby(level=0).aggregate(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)

    nogil = False
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby(level=0).aggregate(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby("A").agg(
        numba_func, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)


def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        df.groupby(["A", "B"]).agg(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )


def test_multilabel_numba_vs_cython(numba_supported_reductions):
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])
    res_agg = gb.agg(reduction, engine="numba", **kwargs)
    expected_agg = gb.agg(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(res_agg, expected_agg)
    # Test that calling the aggregation directly also works
    direct_res = getattr(gb, reduction)(engine="numba", **kwargs)
    direct_expected = getattr(gb, reduction)(engine="cython", **kwargs)
    tm.assert_frame_equal(direct_res, direct_expected)


def test_multilabel_udf_numba_vs_cython():
    pytest.importorskip("numba")
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])
    result = gb.agg(lambda values, index: values.min(), engine="numba")
    expected = gb.agg(lambda x: x.min(), engine="cython")
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,676 @@
"""
test all other .agg behavior
"""

import datetime as dt
from functools import partial

import numpy as np
import pytest

from pandas.errors import SpecificationError

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    PeriodIndex,
    Series,
    date_range,
    period_range,
)
import pandas._testing as tm

from pandas.io.formats.printing import pprint_thing


def test_agg_partial_failure_raises():
    # GH#43741

    df = DataFrame(
        {
            "data1": np.random.default_rng(2).standard_normal(5),
            "data2": np.random.default_rng(2).standard_normal(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    with pytest.raises(TypeError, match="unsupported operand type"):
        grouped.agg([peak_to_peak])

    with pytest.raises(TypeError, match="unsupported operand type"):
        grouped.agg(peak_to_peak)

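# df1 keeps the dates as strings while df2 converts them to datetime.date
# objects (with a None); grouping on either representation must produce the
# same number of groups.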
def test_agg_datetimes_mixed():
    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    df1 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    data = [
        [
            row[0],
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
            row[2],
        ]
        for row in data
    ]

    df2 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate("sum")

    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate("sum")

    assert len(gb1) == len(gb2)


def test_agg_period_index():
    prng = period_range("2012-1-1", freq="M", periods=3)
    df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    s1 = Series(np.random.default_rng(2).random(len(index)), index=index)
    s2 = Series(np.random.default_rng(2).random(len(index)), index=index)
    df = DataFrame.from_dict({"s1": s1, "s2": s2})
    grouped = df.groupby(df.index.month)
    list(grouped)


def test_agg_dict_parameter_cast_result_dtypes():
    # GH 12821

    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="h"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # test for `first` function
    exp = df.loc[[0, 3, 4, 6]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.first(), exp)
    tm.assert_frame_equal(grouped.agg("first"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "first"}), exp)
    tm.assert_series_equal(grouped.time.first(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("first"), exp["time"])

    # test for `last` function
    exp = df.loc[[0, 3, 4, 7]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.last(), exp)
    tm.assert_frame_equal(grouped.agg("last"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "last"}), exp)
    tm.assert_series_equal(grouped.time.last(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("last"), exp["time"])

    # count
    exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.count(), exp)


def test_agg_cast_results_dtypes():
    # similar to GH12821
    # xref #11444
    u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
    v = list("aaabbbbbbccd")
    df = DataFrame({"X": v, "Y": u})

    result = df.groupby("X")["Y"].agg(len)
    expected = df.groupby("X")["Y"].count()
    tm.assert_series_equal(result, expected)


def test_aggregate_float64_no_int64():
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})

    expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a"]].mean()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a", "c"]].mean()
    tm.assert_frame_equal(result, expected)


def test_aggregate_api_consistency():
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg(["sum", "mean"])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped[["D", "C"]].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})


def test_agg_dict_renaming_deprecation():
    # 15931
    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").agg(
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
        )

    msg = r"Column\(s\) \['ma'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").B.agg({"foo": "count"})


def test_agg_compat():
    # GH 12334
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": ["sum", "std"]})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": "sum", "D": "std"})


def test_agg_nested_dicts():
    # API change for disallowing these types of nested dicts
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})

    with pytest.raises(SpecificationError, match=msg):
        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})

    # same name as the original column
    # GH9052
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"result1": np.sum, "result2": np.mean})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"D": np.sum, "result2": np.mean})


def test_agg_item_by_item_raise_typeerror():
    df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))

    def raiseException(df):
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        df.groupby(0).agg(raiseException)


def test_series_agg_multikey():
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    result = grouped.agg("sum")
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)


def test_series_agg_multi_pure_python():
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    def bad(x):
        if isinstance(x.values, np.ndarray):
            assert len(x.values.base) > 0
        return "foo"

    result = data.groupby(["A", "B"]).agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)


def test_agg_consistency():
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    df = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 11),
                dt.date(2013, 2, 11),
            ],
        }
    )

    g = df.groupby("date")

    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)


def test_agg_callables():
    # GH 7929
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class fn_class:
        def __call__(self, x):
            return sum(x)

    equiv_callables = [
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        fn_class(),
    ]

    expected = df.groupby("foo").agg("sum")
    for ecall in equiv_callables:
        warn = FutureWarning if ecall is sum or ecall is np.sum else None
        msg = "using DataFrameGroupBy.sum"
        with tm.assert_produces_warning(warn, match=msg):
            result = df.groupby("foo").agg(ecall)
        tm.assert_frame_equal(result, expected)


def test_agg_over_numpy_arrays():
    # GH 3788
    df = DataFrame(
        [
            [1, np.array([10, 20, 30])],
            [1, np.array([40, 50, 60])],
            [2, np.array([20, 30, 40])],
        ],
        columns=["category", "arraydata"],
    )
    gb = df.groupby("category")

    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
    expected_index = Index([1, 2], name="category")
    expected_column = ["arraydata"]
    expected = DataFrame(expected_data, index=expected_index, columns=expected_column)

    alt = gb.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = gb.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    # and that raises TypeError if `numeric_only=False` is passed


@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    # with function that is not dtype-preserving
    dti = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        dti = dti.tz_localize(None).to_period("D")

    df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    # Case that _does_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0])
    expected = Series(dti[::2], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # Cases that do _not_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0].year)
    expected = Series([2012, 2012], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
    expected.index.name = "a"
    if as_period:
        expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
        expected.index.name = "a"
    tm.assert_series_equal(result, expected)


def test_agg_timezone_round_trip():
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})

    result1 = df.groupby("a")["b"].agg("min").iloc[0]
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby("a")["b"].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [
        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
    ]
    df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    ts = df["B"].iloc[0]
    assert ts == grouped.nth(0)["B"].iloc[0]
    assert ts == grouped.head(1)["B"].iloc[0]
    assert ts == grouped.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]

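# Adding 9223372036854775807 (int64 max) pushes every value past the int64
# range, so the frame stays object dtype and the grouped sums must be computed
# on python ints without wrapping around.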
def test_sum_uint64_overflow():
    # see gh-14758
    # Convert to uint64 and don't overflow
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + 9223372036854775807

    index = Index(
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
        index=index,
        dtype=object,
    )

    expected.index.name = 0
    result = df.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    result2 = df.groupby(0).sum(numeric_only=True)
    expected2 = expected[[]]
    tm.assert_frame_equal(result2, expected2)


@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby(["A", "B"]).aggregate(structure)
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    # Issue #18079
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby("A")["C"].aggregate(structure)
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)

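# The unobserved category "c" gets a nansum of 0 when observed=False and is
# dropped from the result entirely when observed=True.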
def test_agg_category_nansum(observed):
    categories = ["a", "b", "c"]
    df = DataFrame(
        {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
    )
    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.groupby("A", observed=observed).B.agg(np.nansum)
    expected = Series(
        [3, 3, 0],
        index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
        name="B",
    )
    if observed:
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)


def test_agg_list_like_func():
    # GH 18473
    df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})
    expected = DataFrame(
        {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
    )
    tm.assert_frame_equal(result, expected)


def test_agg_lambda_with_timezone():
    # GH 23683
    df = DataFrame(
        {
            "tag": [1, 1],
            "date": [
                pd.Timestamp("2018-01-01", tz="UTC"),
                pd.Timestamp("2018-01-02", tz="UTC"),
            ],
        }
    )
    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    # make sure we suppress anything other than TypeError or AssertionError
    # in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    df = DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )

    expected = Series(to_decimal([data[0], data[3]]))

    def weird_func(x):
        # weird function that raise something other than TypeError or IndexError
        # in _python_agg_general
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
    tm.assert_series_equal(result, expected, check_names=False)
lib/python3.11/site-packages/pandas/tests/groupby/conftest.py (new file, 208 lines)
@@ -0,0 +1,208 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
    date_range,
)
from pandas.core.groupby.base import (
    reduction_kernels,
    transformation_kernels,
)


@pytest.fixture(params=[True, False])
def sort(request):
    return request.param


@pytest.fixture(params=[True, False])
def as_index(request):
    return request.param


@pytest.fixture(params=[True, False])
def dropna(request):
    return request.param


@pytest.fixture(params=[True, False])
def observed(request):
    return request.param


@pytest.fixture
def df():
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )


@pytest.fixture
def ts():
    return Series(
        np.random.default_rng(2).standard_normal(30),
        index=date_range("2000-01-01", periods=30, freq="B"),
    )


@pytest.fixture
def tsframe():
    return DataFrame(
        np.random.default_rng(2).standard_normal((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=date_range("2000-01-01", periods=30, freq="B"),
    )


@pytest.fixture
def three_group():
    return DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

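# The Value strings encode "<group><nth occurrence>_at_<positional index>",
# e.g. "a2_at_5" is the third "a" row, found at position 5.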
@pytest.fixture()
def slice_test_df():
    data = [
        [0, "a", "a0_at_0"],
        [1, "b", "b0_at_1"],
        [2, "a", "a1_at_2"],
        [3, "b", "b1_at_3"],
        [4, "c", "c0_at_4"],
        [5, "a", "a2_at_5"],
        [6, "a", "a3_at_6"],
        [7, "a", "a4_at_7"],
    ]
    df = DataFrame(data, columns=["Index", "Group", "Value"])
    return df.set_index("Index")


@pytest.fixture()
def slice_test_grouped(slice_test_df):
    return slice_test_df.groupby("Group", as_index=False)


@pytest.fixture(params=sorted(reduction_kernels))
def reduction_func(request):
    """
    yields the string names of all groupby reduction functions, one at a time.
    """
    return request.param


@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
    """yields the string names of all groupby transformation functions."""
    return request.param


@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels))
def groupby_func(request):
    """yields both aggregation and transformation functions."""
    return request.param


@pytest.fixture(params=[True, False])
def parallel(request):
    """parallel keyword argument for numba.jit"""
    return request.param


# Can parameterize nogil & nopython over True | False, but limiting per
# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472


@pytest.fixture(params=[False])
def nogil(request):
    """nogil keyword argument for numba.jit"""
    return request.param


@pytest.fixture(params=[True])
def nopython(request):
    """nopython keyword argument for numba.jit"""
    return request.param


@pytest.fixture(
    params=[
        ("mean", {}),
        ("var", {"ddof": 1}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("sum", {}),
        ("min", {}),
        ("max", {}),
        ("sum", {"min_count": 2}),
        ("min", {"min_count": 2}),
        ("max", {"min_count": 2}),
    ],
    ids=[
        "mean",
        "var_1",
        "var_0",
        "std_1",
        "std_0",
        "sum",
        "min",
        "max",
        "sum-min_count",
        "min-min_count",
        "max-min_count",
    ],
)
def numba_supported_reductions(request):
    """reductions supported with engine='numba'"""
    return request.param
@@ -0,0 +1,24 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_corrwith_with_1_axis():
|
||||
# GH 47723
|
||||
df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
|
||||
gb = df.groupby("a")
|
||||
|
||||
msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = gb.corrwith(df, axis=1)
|
||||
index = Index(
|
||||
data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
|
||||
name=("a", None),
|
||||
)
|
||||
expected = Series([np.nan] * 6, index=index)
|
||||
tm.assert_series_equal(result, expected)
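
# Note (added commentary, not from the original test): inside the groupby the
# grouping column "a" is excluded, so each row of a group contributes a single
# "b" observation; a row-wise Pearson correlation over one paired point is
# undefined, which appears to be why every entry of the expected result is NaN.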
@@ -0,0 +1,301 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm


def test_apply_describe_bug(multiindex_dataframe_random_data):
    grouped = multiindex_dataframe_random_data.groupby(level="first")
    grouped.describe()  # it works!


def test_series_describe_multikey():
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
    tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
    tm.assert_series_equal(result["min"], grouped.min(), check_names=False)


def test_series_describe_single():
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x.describe())
    expected = grouped.describe().stack(future_stack=True)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    # GH#49256
    df = DataFrame(
        {
            "key1": ["one", "two", "two", "three", "two"],
            "key2": ["one", "two", "two", "three", "two"],
            "foo2": [1, 2, 4, 4, 6],
        }
    )
    gb = df.groupby(keys, as_index=as_index)["foo2"]
    result = gb.describe()
    expected = DataFrame(
        {
            "key1": ["one", "three", "two"],
            "count": [1.0, 1.0, 3.0],
            "mean": [1.0, 4.0, 4.0],
            "std": [np.nan, np.nan, 2.0],
            "min": [1.0, 4.0, 2.0],
            "25%": [1.0, 4.0, 3.0],
            "50%": [1.0, 4.0, 4.0],
            "75%": [1.0, 4.0, 5.0],
            "max": [1.0, 4.0, 6.0],
        }
    )
    if len(keys) == 2:
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)
    tm.assert_frame_equal(result, expected)


def test_frame_describe_multikey(tsframe, using_infer_string):
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    desc_groups = []
    for col in tsframe:
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = MultiIndex(
            levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))],
        )
        group = DataFrame(group.values, columns=group_col, index=group.index)
        desc_groups.append(group)
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    # remainder of the tests fails with string dtype but is testing deprecated behaviour
    if using_infer_string:
        return

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    expected.index = MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))],
    )
    tm.assert_frame_equal(result, expected)


def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={"k": "key"})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby("k").describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby("key").describe()


def test_frame_describe_unstacked_format():
    # GH 4792
    prices = {
        Timestamp("2011-01-06 10:59:05", tz=None): 24990,
        Timestamp("2011-01-06 12:43:33", tz=None): 25499,
        Timestamp("2011-01-06 12:54:09", tz=None): 25499,
    }
    volumes = {
        Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
        Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
        Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
    }
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})
    result = df.groupby("PRICE").VOLUME.describe()
    data = [
        df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
        df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
    ]
    expected = DataFrame(
        data,
        index=Index([24990, 25499], name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    # GH 35314
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    expected = (
        DataFrame.from_records(
            [
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
            ],
        )
        .set_index([0, 1])
        .T
    )
    expected.columns.names = [None, None]
    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")

    if not as_index:
        expected = expected.reset_index()

    result = df.groupby(keys, as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)


def test_describe_duplicate_columns():
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    gb = df.groupby(df[1])
    result = gb.describe(percentiles=[])

    columns = ["count", "mean", "std", "min", "50%", "max"]
    frames = [
        DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
        for val in (0.0, 2.0, 3.0)
    ]
    expected = pd.concat(frames, axis=1)
    expected.columns = MultiIndex(
        levels=[[0, 2], columns],
        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)


class TestGroupByNonCythonPaths:
    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        df = DataFrame(
            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
            columns=["A", "B", "C"],
        )
        return df

    @pytest.fixture
    def gb(self, df):
        gb = df.groupby("A")
        return gb

    @pytest.fixture
    def gni(self, df):
        gni = df.groupby("A", as_index=False)
        return gni

    def test_describe(self, df, gb, gni):
        # describe
        expected_index = Index([1, 3], name="A")
        expected_col = MultiIndex(
            levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
            codes=[[0] * 8, list(range(8))],
        )
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=expected_index,
            columns=expected_col,
        )
        result = gb.describe()
        tm.assert_frame_equal(result, expected)

        expected = expected.reset_index()
        result = gni.describe()
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    # GH#41575
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)

    result = df.iloc[:0].groupby("A").describe(**kwargs)
    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
    expected.index = Index([], dtype=df.columns.dtype)
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,255 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    NaT,
    Series,
    Timedelta,
    Timestamp,
    date_range,
)
import pandas._testing as tm


def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g._grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    df = DataFrame(
        [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
        dtype=float,
        columns=["A", "B", "Z"],
        index=None,
    )
    g = df.groupby(["A", "B"])

    expected = DataFrame(
        [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
        dtype=float,
        columns=["Z"],
        index=None,
    )
    result = g.shift(-1)

    tm.assert_frame_equal(result, expected)
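
# Added illustration (hand-checked toy case, not part of the original test):
# rows whose group key is partially missing belong to no group, so shift()
# leaves NaN in place for them:
#
#     df = DataFrame({"A": [1, np.nan, 1], "Z": [10, 20, 30]})
#     df.groupby("A")["Z"].shift(-1)  # -> [30.0, NaN, NaN]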


def test_group_shift_with_fill_value():
    # GH #24128
    n_rows = 24
    df = DataFrame(
        [(i % 12, i % 3, i) for i in range(n_rows)],
        dtype=float,
        columns=["A", "B", "Z"],
        index=None,
    )
    g = df.groupby(["A", "B"])

    expected = DataFrame(
        [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
        dtype=float,
        columns=["Z"],
        index=None,
    )
    result = g.shift(-1, fill_value=0)

    tm.assert_frame_equal(result, expected)


def test_group_shift_lose_timezone():
    # GH 30134
    now_dt = Timestamp.utcnow().as_unit("ns")
    df = DataFrame({"a": [1, 1], "date": now_dt})
    result = df.groupby("a").shift(0).iloc[0]
    expected = Series({"date": now_dt}, name=result.name)
    tm.assert_series_equal(result, expected)


def test_group_diff_real_series(any_real_numpy_dtype):
    df = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
        dtype=any_real_numpy_dtype,
    )
    result = df.groupby("a")["b"].diff()
    exp_dtype = "float"
    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
    tm.assert_series_equal(result, expected)


def test_group_diff_real_frame(any_real_numpy_dtype):
    df = DataFrame(
        {
            "a": [1, 2, 3, 3, 2],
            "b": [1, 2, 3, 4, 5],
            "c": [1, 2, 3, 4, 6],
        },
        dtype=any_real_numpy_dtype,
    )
    result = df.groupby("a").diff()
    exp_dtype = "float"
    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    expected = DataFrame(
        {
            "b": [np.nan, np.nan, np.nan, 1.0, 3.0],
            "c": [np.nan, np.nan, np.nan, 1.0, 4.0],
        },
        dtype=exp_dtype,
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        [
            Timestamp("2013-01-01"),
            Timestamp("2013-01-02"),
            Timestamp("2013-01-03"),
        ],
        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
    ],
)
def test_group_diff_datetimelike(data, unit):
    df = DataFrame({"a": [1, 2, 2], "b": data})
    df["b"] = df["b"].dt.as_unit(unit)
    result = df.groupby("a")["b"].diff()
    expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
    tm.assert_series_equal(result, expected)


def test_group_diff_bool():
    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
    result = df.groupby("a")["b"].diff()
    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
    tm.assert_series_equal(result, expected)


def test_group_diff_object_raises(object_dtype):
    df = DataFrame(
        {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
    )
    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
        df.groupby("a")["b"].diff()


def test_empty_shift_with_fill():
    # GH 41264, single-index check
    df = DataFrame(columns=["a", "b", "c"])
    shifted = df.groupby(["a"]).shift(1)
    shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
    tm.assert_frame_equal(shifted, shifted_with_fill)
    tm.assert_index_equal(shifted.index, shifted_with_fill.index)


def test_multindex_empty_shift_with_fill():
    # GH 41264, multi-index check
    df = DataFrame(columns=["a", "b", "c"])
    shifted = df.groupby(["a", "b"]).shift(1)
    shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
    tm.assert_frame_equal(shifted, shifted_with_fill)
    tm.assert_index_equal(shifted.index, shifted_with_fill.index)


def test_shift_periods_freq():
    # GH 54093
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    df = DataFrame(data, index=date_range(start="20100101", periods=6))
    result = df.groupby(df.index).shift(periods=-2, freq="D")
    expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6))
    tm.assert_frame_equal(result, expected)


def test_shift_deprecate_freq_and_fill_value():
    # GH 53832
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    df = DataFrame(data, index=date_range(start="20100101", periods=6))
    msg = (
        "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")


def test_shift_disallow_suffix_if_periods_is_int():
    # GH#44424
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    df = DataFrame(data)
    msg = "Cannot specify `suffix` if `periods` is an int."
    with pytest.raises(ValueError, match=msg):
        df.groupby("b").shift(1, suffix="fails")


def test_group_shift_with_multiple_periods():
    # GH#44424
    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})

    shifted_df = df.groupby("b")[["a"]].shift([0, 1])
    expected_df = DataFrame(
        {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
    )
    tm.assert_frame_equal(shifted_df, expected_df)

    # series
    shifted_series = df.groupby("b")["a"].shift([0, 1])
    tm.assert_frame_equal(shifted_series, expected_df)
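
# Added note: when `periods` is a list, shift() returns one column per period,
# suffixed with the period value ("a_0", "a_1" above); a custom suffix can be
# passed via the `suffix` keyword in that case (the int case rejects it, see
# test_shift_disallow_suffix_if_periods_is_int above).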


def test_group_shift_with_multiple_periods_and_freq():
    # GH#44424
    df = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
        index=date_range("1/1/2000", periods=5, freq="h"),
    )
    shifted_df = df.groupby("b")[["a"]].shift(
        [0, 1],
        freq="h",
    )
    expected_df = DataFrame(
        {
            "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
            "a_1": [
                np.nan,
                1.0,
                2.0,
                3.0,
                4.0,
                5.0,
            ],
        },
        index=date_range("1/1/2000", periods=6, freq="h"),
    )
    tm.assert_frame_equal(shifted_df, expected_df)


def test_group_shift_with_multiple_periods_and_fill_value():
    # GH#44424
    df = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
    )
    shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
    expected_df = DataFrame(
        {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
    )
    tm.assert_frame_equal(shifted_df, expected_df)


def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
    # GH#44424
    df = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
        index=date_range("1/1/2000", periods=5, freq="h"),
    )
    msg = (
        "Passing a 'freq' together with a 'fill_value' silently ignores the "
        "fill_value"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
@@ -0,0 +1,78 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }
    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_increasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly increasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)
@@ -0,0 +1,115 @@
import numpy as np
import pytest

from pandas import (
    MultiIndex,
    Series,
    date_range,
)
import pandas._testing as tm


def test_nlargest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nlargest(3)
    e = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(gb.nlargest(3, keep="last"), e)


def test_nlargest_mi_grouper():
    # see gh-21411
    npr = np.random.default_rng(2)

    dts = date_range("20180101", periods=10)
    iterables = [dts, ["one", "two"]]

    idx = MultiIndex.from_product(iterables, names=["first", "second"])
    s = Series(npr.standard_normal(20), index=idx)

    result = s.groupby("first").nlargest(1)

    exp_idx = MultiIndex.from_tuples(
        [
            (dts[0], dts[0], "one"),
            (dts[1], dts[1], "one"),
            (dts[2], dts[2], "one"),
            (dts[3], dts[3], "two"),
            (dts[4], dts[4], "one"),
            (dts[5], dts[5], "one"),
            (dts[6], dts[6], "one"),
            (dts[7], dts[7], "one"),
            (dts[8], dts[8], "one"),
            (dts[9], dts[9], "one"),
        ],
        names=["first", "first", "second"],
    )

    exp_values = [
        0.18905338179353307,
        -0.41306354339189344,
        1.799707382720902,
        0.7738065867276614,
        0.28121066979764925,
        0.9775674511260357,
        -0.3288239040579627,
        0.45495807124085547,
        0.5452887139646817,
        0.12682784711186987,
    ]

    expected = Series(exp_values, index=exp_idx)
    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)


def test_nsmallest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
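
# Added note: group-wise nlargest/nsmallest return a Series indexed by a
# (group label, original position) MultiIndex, and `keep="last"` resolves ties
# by preferring the later occurrence, mirroring Series.nlargest/Series.nsmallest.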


@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, dtype, method):
    # GH 15272, GH 16345, GH 29129
    # Test nlargest/smallest when it results in a noop,
    # i.e. input is sorted and group size <= n
    if dtype is not None:
        data = np.array(data, dtype=dtype)
    if method == "nlargest":
        data = list(reversed(data))
    ser = Series(data, name="a")
    result = getattr(ser.groupby(groups), method)(n=2)
    expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups
    expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
    tm.assert_series_equal(result, expected)
@@ -0,0 +1,922 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    isna,
)
import pandas._testing as tm


def test_first_last_nth(df):
    # tests for first / last / nth
    grouped = df.groupby("A")
    first = grouped.first()
    expected = df.loc[[1, 0], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(first, expected)

    nth = grouped.nth(0)
    expected = df.loc[[0, 1]]
    tm.assert_frame_equal(nth, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    tm.assert_frame_equal(last, expected)

    nth = grouped.nth(-1)
    expected = df.iloc[[5, 7]]
    tm.assert_frame_equal(nth, expected)

    nth = grouped.nth(1)
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(nth, expected)

    # it works!
    grouped["B"].first()
    grouped["B"].last()
    grouped["B"].nth(0)

    df = df.copy()
    df.loc[df["A"] == "foo", "B"] = np.nan
    grouped = df.groupby("A")
    assert isna(grouped["B"].first()["foo"])
    assert isna(grouped["B"].last()["foo"])
    assert isna(grouped["B"].nth(0).iloc[0])

    # v0.14.0 whatsnew
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    result = g.first()
    expected = df.iloc[[1, 2]].set_index("A")
    tm.assert_frame_equal(result, expected)

    expected = df.iloc[[1, 2]]
    result = g.nth(0, dropna="any")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
    # https://github.com/pandas-dev/pandas/issues/32123
    groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
    result = getattr(groups, method)()

    if method == "first":
        values = [1, 3]
    else:
        values = [2, 3]

    values = np.array(values, dtype=result["b"].dtype)
    idx = Index([1, 2], name="a")
    expected = DataFrame({"b": values}, index=idx)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
    # https://github.com/pandas-dev/pandas/issues/32123
    df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    groups = df.groupby("a")
    result = groups.nth(index)
    expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
    # https://github.com/pandas-dev/pandas/issues/32800
    # None should be preserved as object dtype
    df = DataFrame.from_dict({"id": ["a"], "value": [None]})
    groups = df.groupby("id", as_index=False)
    result = getattr(groups, method)()

    tm.assert_frame_equal(result, df)


@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
    "df, expected",
    [
        (
            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
        ),
        (
            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
        ),
    ],
)
def test_first_last_with_None_expanded(method, df, expected):
    # GH 32800, 38286
    result = getattr(df.groupby("id"), method)()
    tm.assert_frame_equal(result, expected)


def test_first_last_nth_dtypes():
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
        }
    )
    df["E"] = True
    df["F"] = 1

    # tests for first / last / nth
    grouped = df.groupby("A")
    first = grouped.first()
    expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(nth, expected)


def test_first_last_nth_dtypes2():
    # GH 2763, first/last shifting dtypes
    idx = list(range(10))
    idx.append(9)
    ser = Series(data=range(11), index=idx, name="IntCol")
    assert ser.dtype == "int64"
    f = ser.groupby(level=0).first()
    assert f.dtype == "int64"


def test_first_last_nth_nan_dtype():
    # GH 33591
    df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
    grouped = df.groupby("data")

    expected = df.set_index("data").nans
    tm.assert_series_equal(grouped.nans.first(), expected)
    tm.assert_series_equal(grouped.nans.last(), expected)

    expected = df.nans
    tm.assert_series_equal(grouped.nans.nth(-1), expected)
    tm.assert_series_equal(grouped.nans.nth(0), expected)


def test_first_strings_timestamps():
    # GH 11244
    test = DataFrame(
        {
            Timestamp("2012-01-01 00:00:00"): ["a", "b"],
            Timestamp("2012-01-02 00:00:00"): ["c", "d"],
            "name": ["e", "e"],
            "aaaa": ["f", "g"],
        }
    )
    result = test.groupby("name").first()
    expected = DataFrame(
        [["a", "c", "f"]],
        columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
        index=Index(["e"], name="name"),
    )
    tm.assert_frame_equal(result, expected)


def test_nth():
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")

    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
    tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
    tm.assert_frame_equal(gb.nth(2), df.loc[[]])
    tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
    tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
    tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
    tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
    tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
    tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])

    tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])
    tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])

    tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
    tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])


def test_nth2():
    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame(
        {
            "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
            "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
            "two": {
                0: 1.5456590000000001,
                1: -0.070345000000000005,
                2: -2.4004539999999999,
                3: 0.46206000000000003,
                4: 0.52350799999999997,
            },
            "one": {
                0: 0.56573799999999996,
                1: -0.9742360000000001,
                2: 1.033801,
                3: -0.78543499999999999,
                4: 0.70422799999999997,
            },
        }
    ).set_index(["color", "food"])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)


def test_nth3():
    # GH 7559
    # from the vbench
    df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
    ser = df[1]
    gb = df[0]
    expected = ser.groupby(gb).first()
    expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = ser[gb == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    with pytest.raises(ValueError, match="For a DataFrame"):
        ser.groupby(gb, sort=False).nth(0, dropna=True)


def test_nth4():
    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")
    result = gb.B.nth(0, dropna="all")
    expected = df.B.iloc[[1, 2]]
    tm.assert_series_equal(result, expected)


def test_nth5():
    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    gb = df.groupby("A")

    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
    tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
    tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
    tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])


def test_nth_bdays(unit):
    business_dates = pd.date_range(
        start="4/1/2014", end="6/30/2014", freq="B", unit=unit
    )
    df = DataFrame(1, index=business_dates, columns=["a", "b"])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        [
            "2014/4/1",
            "2014/4/4",
            "2014/4/29",
            "2014/4/30",
            "2014/5/1",
            "2014/5/6",
            "2014/5/29",
            "2014/5/30",
            "2014/6/2",
            "2014/6/5",
            "2014/6/27",
            "2014/6/30",
        ]
    ).as_unit(unit)
    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
    tm.assert_frame_equal(result, expected)


def test_nth_multi_grouper(three_group):
    # PR 9090, related to issue 8979
    # test nth on multiple groupers
    grouped = three_group.groupby(["A", "B"])
    result = grouped.nth(0)
    expected = three_group.iloc[[0, 3, 4, 7]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False

    df = DataFrame(data)

    result = df.groupby("id", as_index=False).first()
    expected = DataFrame(expected_first)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].first()
    tm.assert_frame_equal(result, expected[["id", "time"]])

    result = df.groupby("id", as_index=False).last()
    expected = DataFrame(expected_last)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].last()
    tm.assert_frame_equal(result, expected[["id", "time"]])


@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha, unit):
    # GH 21603
    category_string = Series(list("abc")).astype("category")
    dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
    df = DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": category_string,
            "datetimetz": dti,
        }
    )
    result = getattr(df.groupby("group"), method)()
    expected = DataFrame(
        {
            "category_string": pd.Categorical(
                [alpha, "c"], dtype=category_string.dtype
            ),
            "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
        },
        index=Index([1, 2], name="group"),
    )
    expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "values",
    [
        pd.array([True, False], dtype="boolean"),
        pd.array([1, 2], dtype="Int64"),
        pd.to_datetime(["2020-01-01", "2020-02-01"]),
        pd.to_timedelta([1, 2], unit="D"),
    ],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
    # https://github.com/pandas-dev/pandas/issues/33071
    # https://github.com/pandas-dev/pandas/issues/32194
    df = DataFrame({"a": [1, 2], "b": values})
    grouped = df.groupby("a")
    idx = Index([1, 2], name="a")
    expected_series = Series(values, name="b", index=idx)
    expected_frame = DataFrame({"b": values}, index=idx)

    result_series = getattr(grouped["b"], function)()
    tm.assert_series_equal(result_series, expected_series)

    result_frame = grouped.agg({"b": function})
    tm.assert_frame_equal(result_frame, expected_frame)


def test_nth_multi_index_as_expected():
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    three_group = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
        }
    )
    grouped = three_group.groupby(["A", "B"])
    result = grouped.nth(0)
    expected = three_group.iloc[[0, 3, 4, 7]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "op, n, expected_rows",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A", as_index=as_index)
    expected = df.iloc[expected_rows]
    if columns is not None:
        g = g[columns]
        expected = expected[columns]
    result = getattr(g, op)(n)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "op, n, expected_cols",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
def test_groupby_head_tail_axis_1(op, n, expected_cols):
    # GH 9772
    df = DataFrame(
        [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
    )
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        g = df.groupby([0, 0, 1], axis=1)
    expected = df.iloc[:, expected_cols]
    result = getattr(g, op)(n)
    tm.assert_frame_equal(result, expected)


def test_group_selection_cache():
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    expected = df.iloc[[0, 2]]

    g = df.groupby("A")
    result1 = g.head(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    g = df.groupby("A")
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.head(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)

    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)


def test_nth_empty():
    # GH 16064
    df = DataFrame(index=[0], columns=["a", "b", "c"])
    result = df.groupby("a").nth(10)
    expected = df.iloc[:0]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["a", "b"]).nth(10)
    expected = df.iloc[:0]
    tm.assert_frame_equal(result, expected)


def test_nth_column_order():
    # GH 20760
    # Check that nth preserves column order
    df = DataFrame(
        [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
        columns=["A", "C", "B"],
    )
    result = df.groupby("A").nth(0)
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby("A").nth(-1, dropna="any")
    expected = df.iloc[[1, 4]]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    # GH 26011
    df = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
            "c": [1, 3, 5, 7, 9],
        }
    )
    result = df.groupby("a").nth(0, dropna=dropna)
    expected = df.iloc[[1, 3]]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper_series(dropna):
    # GH 26454
    df = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
        }
    )
    result = df.groupby("a")["b"].nth(0, dropna=dropna)
    expected = df["b"].iloc[[1, 3]]

    tm.assert_series_equal(result, expected)


def test_first_categorical_and_datetime_data_nat():
    # GH 20520
    df = DataFrame(
        {
            "group": ["first", "first", "second", "third", "third"],
            "time": 5 * [np.datetime64("NaT")],
            "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
        }
    )
    result = df.groupby("group").first()
    expected = DataFrame(
        {
            "time": 3 * [np.datetime64("NaT")],
            "categories": Series(["a", "c", "a"]).astype(
                pd.CategoricalDtype(["a", "b", "c"])
            ),
        }
    )
    expected.index = Index(["first", "second", "third"], name="group")
    tm.assert_frame_equal(result, expected)


def test_first_multi_key_groupby_categorical():
    # GH 22512
    df = DataFrame(
        {
            "A": [1, 1, 1, 2, 2],
            "B": [100, 100, 200, 100, 100],
            "C": ["apple", "orange", "mango", "mango", "orange"],
            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
        }
    )
    df = df.astype({"D": "category"})
    result = df.groupby(by=["A", "B"]).first()
    expected = DataFrame(
        {
            "C": ["apple", "mango", "mango"],
            "D": Series(["jupiter", "mars", "venus"]).astype(
                pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
            ),
        }
    )
    expected.index = MultiIndex.from_tuples(
        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
    # GH29645
    expected = Series(["y"], dtype=object)
    data = Series(
        [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
        index=[0, 0, 0, 0, 0],
        dtype=object,
    ).groupby(level=0)

    if method == "nth":
        result = getattr(data, method)(3)
    else:
        result = getattr(data, method)()

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [slice(None, 3, 2), [0, 1, 4, 5]],
        [slice(None, -2), [0, 2, 5]],
        [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
        [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test slices GH #42947

    result = slice_test_grouped.nth[arg]
    equivalent = slice_test_grouped.nth(arg)
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(equivalent, expected)
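
# Added note: `gb.nth[...]` is the indexer form of `gb.nth(...)`; the two are
# interchangeable, e.g. (using the fixtures above):
#
#     slice_test_grouped.nth[1:]               # all but each group's first row
#     slice_test_grouped.nth(slice(1, None))   # equivalent call form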


def test_nth_indexed(slice_test_df, slice_test_grouped):
    # Test index notation GH #44688

    result = slice_test_grouped.nth[0, 1, -2:]
    equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(equivalent, expected)


def test_invalid_argument(slice_test_grouped):
    # Test for error on invalid argument

    with pytest.raises(TypeError, match="Invalid index"):
        slice_test_grouped.nth(3.14)


def test_negative_step(slice_test_grouped):
    # Test for error on negative slice step

    with pytest.raises(ValueError, match="Invalid step"):
        slice_test_grouped.nth(slice(None, None, -1))


def test_np_ints(slice_test_df, slice_test_grouped):
    # Test np ints work

    result = slice_test_grouped.nth(np.array([0, 1]))
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
    tm.assert_frame_equal(result, expected)


def test_groupby_nth_with_column_axis():
    # GH43926
    df = DataFrame(
        [
            [4, 5, 6],
            [8, 8, 7],
        ],
        index=["z", "y"],
        columns=["C", "B", "A"],
    )
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(df.iloc[1], axis=1)
    result = gb.nth(0)
    expected = df.iloc[:, [0, 2]]
    tm.assert_frame_equal(result, expected)


def test_groupby_nth_interval():
    # GH#24205
    idx_result = MultiIndex(
        [
            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
        ],
        [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
    )
    df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
    result = df_result.groupby(level=[0, 1], observed=False).nth(0)
    val_expected = [0, 1, 3]
    idx_expected = MultiIndex(
        [
            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
        ],
        [[0, 0, 1], [0, 1, 0]],
    )
    expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "start, stop, expected_values, expected_columns",
    [
        (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
        (None, 1, [0, 3], list("AD")),
        (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
        (None, -1, [0, 1, 3], list("ABD")),
        (1, None, [1, 2, 4], list("BCE")),
        (1, -1, [1], list("B")),
        (-1, None, [2, 4], list("CE")),
        (-1, 2, [4], list("E")),
    ],
)
@pytest.mark.parametrize("method", ["call", "index"])
def test_nth_slices_with_column_axis(
    start, stop, expected_values, expected_columns, method
):
    df = DataFrame([range(5)], columns=[list("ABCDE")])
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby([5, 5, 5, 6, 6], axis=1)
    result = {
        "call": lambda start, stop: gb.nth(slice(start, stop)),
        "index": lambda start, stop: gb.nth[start:stop],
    }[method](start, stop)
    expected = DataFrame([expected_values], columns=[expected_columns])
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
def test_head_tail_dropna_true():
    # GH#45089
    df = DataFrame(
        [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
    )
    expected = DataFrame([["a", "z"]], columns=["X", "Y"])

    result = df.groupby(["X", "Y"]).head(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"]).tail(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"]).nth(n=0)
    tm.assert_frame_equal(result, expected)


def test_head_tail_dropna_false():
    # GH#45089
    df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
    expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])

    result = df.groupby(["X", "Y"], dropna=False).head(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
@pytest.mark.parametrize("dropna", ["any", "all", None])
def test_nth_after_selection(selection, dropna):
    # GH#11038, GH#53518
    df = DataFrame(
        {
            "a": [1, 1, 2],
            "b": [np.nan, 3, 4],
            "c": [5, 6, 7],
        }
    )
    gb = df.groupby("a")[selection]
    result = gb.nth(0, dropna=dropna)
    if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
        locs = [1, 2]
    else:
        locs = [0, 2]
    expected = df.loc[locs, selection]
    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        (
            Timestamp("2011-01-15 12:50:28.502376"),
            Timestamp("2011-01-20 12:50:28.593448"),
        ),
        (24650000000000001, 24650000000000002),
    ],
)
def test_groupby_nth_int_like_precision(data):
    # GH#6620, GH#9311
    df = DataFrame({"a": [1, 1], "b": data})

    grouped = df.groupby("a")
    result = grouped.nth(0)
    expected = DataFrame({"a": 1, "b": [data[0]]})

    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,496 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm


@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
    "a_vals,b_vals",
    [
        # Ints
        ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
        ([1, 2, 3, 4], [4, 3, 2, 1]),
        ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
        # Floats
        ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
        # Missing data
        ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
        ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
        # Timestamps
        (
            pd.date_range("1/1/18", freq="D", periods=5),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1],
        ),
        (
            pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
        ),
        # All NA
        ([np.nan] * 5, [np.nan] * 5),
    ],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q, request):
    if (
        interpolation == "nearest"
        and q == 0.5
        and isinstance(b_vals, list)
        and b_vals == [4, 3, 2, 1]
    ):
        request.applymarker(
            pytest.mark.xfail(
                reason="Unclear numpy expectation for nearest "
                "result with equidistant data"
            )
        )
    all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])

    a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
    b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)

    df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})

    expected = DataFrame(
        [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
    )
    if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
        # TODO(non-nano): this should be unnecessary once array_to_datetime
        # correctly infers non-nano from Timestamp.unit
        expected = expected.astype(all_vals.dtype)
    result = df.groupby("key").quantile(q, interpolation=interpolation)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array():
|
||||
# https://github.com/pandas-dev/pandas/issues/27526
|
||||
df = DataFrame({"A": [0, 1, 2, 3, 4]})
|
||||
key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
|
||||
result = df.groupby(key).quantile([0.25])
|
||||
|
||||
index = pd.MultiIndex.from_product([[0, 1], [0.25]])
|
||||
expected = DataFrame({"A": [0.25, 2.50]}, index=index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
|
||||
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
|
||||
|
||||
key = np.array([0, 0, 1, 1], dtype=np.int64)
|
||||
result = df.groupby(key).quantile([0.25, 0.75])
|
||||
expected = DataFrame(
|
||||
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array2():
|
||||
# https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
|
||||
arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
|
||||
df = DataFrame(arr, columns=list("ABC"))
|
||||
result = df.groupby("A").quantile([0.3, 0.7])
|
||||
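    # Hard-coded values are the per-group linear-interpolation results for the
    # seeded data; 3.0999999999999996 reflects float64 rounding in the
    # interpolation arithmetic, one ulp below 3.1.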
    expected = DataFrame(
        {
            "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
            "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
        },
        index=pd.MultiIndex.from_product(
            [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
        ),
    )
    tm.assert_frame_equal(result, expected)


def test_quantile_array_no_sort():
    df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
    key = np.array([1, 0, 1], dtype=np.int64)
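    # With sort=False the groups keep first-seen order, so key 1 precedes key 0
    # in the expected index below.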
    result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
    expected = DataFrame(
        {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
    )
    tm.assert_frame_equal(result, expected)

    result = df.groupby(key, sort=False).quantile([0.75, 0.25])
    expected = DataFrame(
        {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
    )
    tm.assert_frame_equal(result, expected)


def test_quantile_array_multiple_levels():
    df = DataFrame(
        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
    )
    result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
    index = pd.MultiIndex.from_tuples(
        [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
        names=["c", "d", None],
    )
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
    # GH30289
    nrow, ncol = frame_size
    df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))

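    # Build the expected MultiIndex by hand: each group key is repeated once
    # per quantile in q, with the quantiles as the innermost (unnamed) level,
    # the layout groupby.quantile produces for list-like q.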
    idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
    idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
        list(range(len(q))) * min(nrow, 4)
    ]
    expected_index = pd.MultiIndex(
        levels=idx_levels, codes=idx_codes, names=groupby + [None]
    )
    expected_values = [
        [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
    ]
    expected_columns = [x for x in range(ncol) if x not in groupby]
    expected = DataFrame(
        expected_values, index=expected_index, columns=expected_columns
    )
    result = df.groupby(groupby).quantile(q)

    tm.assert_frame_equal(result, expected)


def test_quantile_raises():
    df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

    msg = "dtype '(object|str)' does not support operation 'quantile'"
    with pytest.raises(TypeError, match=msg):
        df.groupby("key").quantile()


def test_quantile_out_of_bounds_q_raises():
    # https://github.com/pandas-dev/pandas/issues/27470
    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    g = df.groupby([0, 0, 0, 1, 1, 1])
    with pytest.raises(ValueError, match="Got '50.0' instead"):
        g.quantile(50)

    with pytest.raises(ValueError, match="Got '-1.0' instead"):
        g.quantile(-1)


def test_quantile_missing_group_values_no_segfaults():
    # GH 28662
    data = np.array([1.0, np.nan, 1.0])
    df = DataFrame({"key": data, "val": range(3)})

    # Random segfaults; would have been guaranteed in loop
    grp = df.groupby("key")
    for _ in range(100):
        grp.quantile()


@pytest.mark.parametrize(
    "key, val, expected_key, expected_val",
    [
        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
        ([0], [42], [0], [42.0]),
        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
    ],
)
def test_quantile_missing_group_values_correct_results(
    key, val, expected_key, expected_val
):
    # GH 28662, GH 33200, GH 33569
    df = DataFrame({"key": key, "val": val})

    expected = DataFrame(
        expected_val, index=Index(expected_key, name="key"), columns=["val"]
    )

    grp = df.groupby("key")

    result = grp.quantile(0.5)
    tm.assert_frame_equal(result, expected)

    result = grp.quantile()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "values",
    [
        pd.array([1, 0, None] * 2, dtype="Int64"),
        pd.array([True, False, None] * 2, dtype="boolean"),
    ],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
    # https://github.com/pandas-dev/pandas/issues/33136
    df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
    result = df.groupby("a")["b"].quantile(q)

    if isinstance(q, list):
        idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
        true_quantiles = [0.0, 0.5, 1.0]
    else:
        idx = Index(["x", "y"], name="a")
        true_quantiles = [0.5]

    expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
    df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
    if numeric_only:
        result = df.groupby("a").quantile(q, numeric_only=numeric_only)
        expected = df.groupby("a")[["b"]].quantile(q)
        tm.assert_frame_equal(result, expected)
    else:
        msg = "dtype '.*' does not support operation 'quantile'"
        with pytest.raises(TypeError, match=msg):
            df.groupby("a").quantile(q, numeric_only=numeric_only)


def test_groupby_quantile_NA_float(any_float_dtype):
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    exp_index = Index([1.0], dtype=any_float_dtype, name="x")

    if any_float_dtype in ["Float32", "Float64"]:
        expected_dtype = any_float_dtype
    else:
        expected_dtype = None

    expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
    tm.assert_series_equal(result, expected)

    result = df.groupby("x")["y"].quantile([0.5, 0.75])
    expected = pd.Series(
        [0.2] * 2,
        index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
        name="y",
        dtype=expected_dtype,
    )
    tm.assert_series_equal(result, expected)


def test_groupby_quantile_NA_int(any_int_ea_dtype):
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
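    # Quantiles generally require interpolation, so integer input is expected
    # to come back as the nullable Float64 dtype.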
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [3.5],
        dtype="Float64",
        index=Index([1], name="x", dtype=any_int_ea_dtype),
        name="y",
    )
    tm.assert_series_equal(expected, result)

    result = df.groupby("x").quantile(0.5)
    expected = DataFrame(
        {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
)
def test_groupby_quantile_all_na_group_masked(
    interpolation, val1, val2, any_numeric_ea_dtype
):
    # GH#37493
    df = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
    expected = DataFrame(
        {"b": [val1, val2, pd.NA, pd.NA]},
        dtype=any_numeric_ea_dtype,
        index=pd.MultiIndex.from_arrays(
            [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
            names=["a", None],
        ),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
def test_groupby_quantile_all_na_group_masked_interp(
    interpolation, any_numeric_ea_dtype
):
    # GH#37493
    df = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)

    if any_numeric_ea_dtype == "Float32":
        expected_dtype = any_numeric_ea_dtype
    else:
        expected_dtype = "Float64"

    expected = DataFrame(
        {"b": [2.0, 2.5, pd.NA, pd.NA]},
        dtype=expected_dtype,
        index=pd.MultiIndex.from_arrays(
            [
                pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
                [0.5, 0.75, 0.5, 0.75],
            ],
            names=["a", None],
        ),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
    )
    expected.index.name = "x"
    tm.assert_series_equal(expected, result)


def test_groupby_timedelta_quantile():
    # GH: 29485
    df = DataFrame(
        {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]}
    )
    result = df.groupby("group").quantile(0.99)
    expected = DataFrame(
        {
            "value": [
                pd.Timedelta("0 days 00:00:00.990000"),
                pd.Timedelta("0 days 00:00:02.990000"),
            ]
        },
        index=Index([1, 2], name="group"),
    )
    tm.assert_frame_equal(result, expected)


def test_columns_groupby_quantile():
    # GH 33795
    df = DataFrame(
        np.arange(12).reshape(3, -1),
        index=list("XYZ"),
        columns=pd.Series(list("ABAB"), name="col"),
    )
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby("col", axis=1)
    result = gb.quantile(q=[0.8, 0.2])
    expected = DataFrame(
        [
            [1.6, 0.4, 2.6, 1.4],
            [5.6, 4.4, 6.6, 5.4],
            [9.6, 8.4, 10.6, 9.4],
        ],
        index=list("XYZ"),
        columns=pd.MultiIndex.from_tuples(
            [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
        ),
    )

    tm.assert_frame_equal(result, expected)


def test_timestamp_groupby_quantile(unit):
    # GH 33168
    dti = pd.date_range(
        start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
    ).floor("1h")
    df = DataFrame(
        {
            "timestamp": dti,
            "category": list(range(1, 101)),
            "value": list(range(101, 201)),
        }
    )

    result = df.groupby("timestamp").quantile([0.2, 0.8])

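    # floor("1h") leaves only two distinct timestamps among the 100 rows, and
    # dti[::99] (positions 0 and 99) picks one representative of each.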
    mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
    expected = DataFrame(
        [
            {"category": 12.8, "value": 112.8},
            {"category": 48.2, "value": 148.2},
            {"category": 68.8, "value": 168.8},
            {"category": 92.2, "value": 192.2},
        ],
        index=mi,
    )

    tm.assert_frame_equal(result, expected)


def test_groupby_quantile_dt64tz_period():
    # GH#51373
    dti = pd.date_range("2016-01-01", periods=1000)
    df = pd.Series(dti).to_frame().copy()
    df[1] = dti.tz_localize("US/Pacific")
    df[2] = dti.to_period("D")
    df[3] = dti - dti[0]
    df.iloc[-1] = pd.NaT

    by = np.tile(np.arange(5), 200)
    gb = df.groupby(by)

    result = gb.quantile(0.5)

    # Check that we match the group-by-group result
    exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
    expected = DataFrame(exp).T.infer_objects()
    expected.index = expected.index.astype(int)

    tm.assert_frame_equal(result, expected)


def test_groupby_quantile_nonmulti_levels_order():
    # Non-regression test for GH #53009
    ind = pd.MultiIndex.from_tuples(
        [
            (0, "a", "B"),
            (0, "a", "A"),
            (0, "b", "B"),
            (0, "b", "A"),
            (1, "a", "B"),
            (1, "a", "A"),
            (1, "b", "B"),
            (1, "b", "A"),
        ],
        names=["sample", "cat0", "cat1"],
    )
    ser = pd.Series(range(8), index=ind)
    result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])

    qind = pd.MultiIndex.from_tuples(
        [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
    )
    expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)

    tm.assert_series_equal(result, expected)

    # We need to check that index levels are not sorted
    expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
    tm.assert_equal(result.index.levels, expected_levels)
@ -0,0 +1,721 @@
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    NaT,
    Series,
    concat,
)
import pandas._testing as tm


def test_rank_unordered_categorical_typeerror():
    # GH#51034 should be TypeError, not NotImplementedError
    cat = pd.Categorical([], ordered=False)
    ser = Series(cat)
    df = ser.to_frame()

    msg = "Cannot perform rank with non-ordered Categorical"

    gb = ser.groupby(cat, observed=False)
    with pytest.raises(TypeError, match=msg):
        gb.rank()

    gb2 = df.groupby(cat, observed=False)
    with pytest.raises(TypeError, match=msg):
        gb2.rank()


def test_rank_apply():
    lev1 = np.array(["a" * 10] * 100, dtype=object)
    lev2 = np.array(["b" * 10] * 130, dtype=object)
    lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
    lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)

    df = DataFrame(
        {
            "value": np.random.default_rng(2).standard_normal(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )

    result = df.groupby(["key1", "key2"]).value.rank()

    expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)

    result = df.groupby(["key1", "key2"]).value.rank(pct=True)

    expected = [
        piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"])
    ]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, 8, 2, 6], dtype=dtype)
        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    key = np.repeat(grps, len(vals))

    orig_vals = vals
    vals = list(vals) * len(grps)
    if isinstance(orig_vals, np.ndarray):
        vals = np.array(vals, dtype=orig_vals.dtype)

    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    # GH 20561
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )
    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
        for dtype in ["f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            np.nan,
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            np.nan,
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            np.nan,
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            np.nan,
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
            np.nan,
            np.nan,
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        (
            "average",
            True,
            "keep",
            False,
            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
        ),
        (
            "average",
            True,
            "keep",
            True,
            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            False,
            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            True,
            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
        ),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        (
            "min",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        (
            "max",
            False,
            "keep",
            False,
            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        (
            "first",
            True,
            "keep",
            False,
            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
        ),
        (
            "first",
            True,
            "keep",
            True,
            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            False,
            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            True,
            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            False,
            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            True,
            [
                1.0 / 3.0,
                1.0 / 3.0,
                np.nan,
                3.0 / 3.0,
                1.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        (
            "dense",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            False,
            "keep",
            True,
            [
                3.0 / 3.0,
                3.0 / 3.0,
                np.nan,
                1.0 / 3.0,
                3.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        (
            "average",
            True,
            "bottom",
            True,
            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
        ),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        (
            "average",
            False,
            "bottom",
            True,
            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
        ),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        (
            "min",
            True,
            "bottom",
            True,
            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
        ),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        (
            "min",
            False,
            "bottom",
            True,
            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
        ),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        (
            "max",
            False,
            "bottom",
            True,
            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
        ),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        (
            "first",
            True,
            "bottom",
            True,
            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
        ),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        (
            "first",
            False,
            "bottom",
            True,
            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
        ),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    key = np.repeat(grps, len(vals))

    orig_vals = vals
    vals = list(vals) * len(grps)
    if isinstance(orig_vals, np.ndarray):
        vals = np.array(vals, dtype=orig_vals.dtype)

    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    df = DataFrame(
        {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
    )
    result = df.groupby("key").rank(pct=pct)
    exp_df = DataFrame(exp * 2, columns=["val"])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize(
    "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
    if upper:
        # use IntegerDtype/FloatingDtype
        dtype = dtype[0].upper() + dtype[1:]
        dtype = dtype.replace("Ui", "UI")
    df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
    df["val"] = df["val"].astype(dtype)
    assert df["val"].dtype == dtype

    result = df.groupby("key").rank()
    exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
    if upper:
        exp_df = exp_df.astype("Float64")
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    mask = df["val"].isna()

    gb = df.groupby("key")
    res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct)

    # construct our expected by using numeric values with the same ordering
    if mask.any():
        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
    else:
        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})

    gb2 = df2.groupby("key")
    alt = gb2.rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    tm.assert_frame_equal(res, alt)


@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        df.groupby("key").rank(
            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
        )


def test_rank_empty_group():
    # see gh-22519
    column = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})

    result = df.groupby(column).B.rank(pct=True)
    expected = Series([0.5, np.nan, 1.0], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby(column).rank(pct=True)
    expected = DataFrame({"B": [0.5, np.nan, 1.0]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    # GH 23666
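    # With method="dense" and pct=True the denominator is the number of
    # distinct non-NA values in each group; the NaN-only group in the last
    # case guards against a division by zero there.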
    df = DataFrame({"A": input_key, "B": input_value})

    result = df.groupby("A").rank(method="dense", pct=True)
    expected = DataFrame({"B": output_value})
    tm.assert_frame_equal(result, expected)


def test_rank_min_int():
    # GH-32859
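    # np.iinfo(np.int64).min doubles as pandas' internal NaT sentinel, so rank
    # must treat it as a real value in int_col while the NaT entries in
    # datetimelike stay missing.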
    df = DataFrame(
        {
            "grp": [1, 1, 2],
            "int_col": [
                np.iinfo(np.int64).min,
                np.iinfo(np.int64).max,
                np.iinfo(np.int64).min,
            ],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )

    result = df.groupby("grp").rank()
    expected = DataFrame(
        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    # GH#40518
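    # The pct denominator must reset at each group boundary even when adjacent
    # groups hold equal values; fill_value exercises both the NaN and the
    # tied-value variants of that transition.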
    fill_value = np.nan if use_nan else 3
    df = DataFrame(
        [
            [-1, 1],
            [-1, 2],
            [1, fill_value],
            [-1, fill_value],
        ],
        columns=["group", "val"],
    )
    result = df.groupby(["group"])["val"].rank(
        method="dense",
        pct=True,
    )
    if use_nan:
        expected = Series([0.5, 1, np.nan, np.nan], name="val")
    else:
        expected = Series([1 / 3, 2 / 3, 1, 1], name="val")

    tm.assert_series_equal(result, expected)


def test_rank_multiindex():
    # GH27721
    df = concat(
        {
            "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
            "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
        },
        axis=1,
    )

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=0, axis=1)
    msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = gb.rank(axis=1)

    expected = concat(
        [
            df["a"].rank(axis=1),
            df["b"].rank(axis=1),
        ],
        axis=1,
        keys=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_axis0_rank_axis1():
    # GH#41320
    df = DataFrame(
        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
        index=["a", "a", "b", "b"],
    )
    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=0, axis=0)

    msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        res = gb.rank(axis=1)

    # This should match what we get when "manually" operating group-by-group
    expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
    tm.assert_frame_equal(res, expected)

    # check that we haven't accidentally written a case that coincidentally
    # matches rank(axis=0)
    msg = "The 'axis' keyword in DataFrameGroupBy.rank"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        alt = gb.rank(axis=0)
    assert not alt.equals(expected)


def test_groupby_axis0_cummax_axis1():
    # case where groupby axis is 0 and axis keyword in transform is 1

    # df has mixed dtype -> multiple blocks
    df = DataFrame(
        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
        index=["a", "a", "b", "b"],
    )
    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=0, axis=0)

    msg = "DataFrameGroupBy.cummax with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        cmax = gb.cummax(axis=1)
    expected = df[[0, 1]].astype(np.float64)
    expected[2] = expected[1]
    tm.assert_frame_equal(cmax, expected)


def test_non_unique_index():
    # GH 16577
    df = DataFrame(
        {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
    )
    result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
    expected = Series(
        [1.0, 1.0, 1.0, np.nan],
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
        name="value",
    )
    tm.assert_series_equal(result, expected)


def test_rank_categorical():
    cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
    cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)

    df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})

    gb = df.groupby("col1")

    res = gb.rank()

    expected = df.astype(object).groupby("col1").rank()
    tm.assert_frame_equal(res, expected)


@pytest.mark.parametrize("na_option", ["top", "bottom"])
def test_groupby_op_with_nullables(na_option):
    # GH 54206
    df = DataFrame({"x": [None]}, dtype="Float64")
    result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
    expected = Series([1.0], dtype="Float64", name=result.name)
    tm.assert_series_equal(result, expected)
@ -0,0 +1,154 @@
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

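    # n=2 and frac=0.2 are interchangeable here: each group has 10 rows, so
    # both request exactly two rows per group.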
    result = df.groupby("a").sample(n=n, frac=frac)
    values = [1] * 2 + [2] * 2
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=n, frac=frac)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_unbalanced_groups_shape():
    values = [1] * 10 + [2] * 20
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=5)
    values = [1] * 5 + [2] * 5
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=5)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_index_value_spans_groups():
    values = [1] * 3 + [2] * 3
    df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])

    result = df.groupby("a").sample(n=2)
    values = [1] * 2 + [2] * 2
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=2)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_n_and_frac_raises():
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Please enter a value for `frac` OR `n`, not both"

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(n=1, frac=1.0)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(n=1, frac=1.0)


def test_groupby_sample_frac_gt_one_without_replacement_raises():
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(frac=1.5, replace=False)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(frac=1.5, replace=False)


@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
    df = DataFrame({"a": [1, 2], "b": [1, 2]})

    if n < 0:
        msg = "A negative number of rows requested. Please provide `n` >= 0."
    else:
        msg = "Only integers accepted as `n` values"

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(n=n)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(n=n)


def test_groupby_sample_oversample():
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(frac=2.0, replace=True)
    values = [1] * 20 + [2] * 20
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)


def test_groupby_sample_without_n_or_frac():
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=None, frac=None)
    expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=None, frac=None)
    expected = Series([1, 2], name="b", index=result.index)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "index, expected_index",
    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
    # GH 39927 - tests for integer index needed
    values = [1] * 2 + [2] * 2
    df = DataFrame({"a": values, "b": values}, index=Index(index))

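    # Rows with weight 0 can never be drawn, so despite replace=True the
    # sampled index is fully determined by the weight-1 rows.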
    result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
    expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
    expected = Series(values, name="b", index=Index(expected_index))
    tm.assert_series_equal(result, expected)


def test_groupby_sample_with_selections():
    # GH 39928
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values, "c": values})

    result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
    tm.assert_frame_equal(result, expected)


def test_groupby_sample_with_empty_inputs():
    # GH48459
    df = DataFrame({"a": [], "b": []})
    groupby_df = df.groupby("a")

    result = groupby_df.sample()
    expected = df
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,122 @@
import numpy as np
import pytest

from pandas.core.dtypes.common import is_integer_dtype

from pandas import (
    DataFrame,
    Index,
    PeriodIndex,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
    grouped = df.groupby(by=by)
    result = grouped.size()
    for key, group in grouped:
        assert result[key] == len(group)


@pytest.mark.parametrize(
    "by",
    [
        [0, 0, 0, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [0, None, None, None],
        pytest.param([None, None, None, None], marks=pytest.mark.xfail),
    ],
)
def test_size_axis_1(df, axis_1, by, sort, dropna):
    # GH#45715
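    # Recompute the expected sizes independently; dict.fromkeys keeps the
    # first-seen key order, mirroring the unsorted groupby result until the
    # explicit sort_index below.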
    counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)}
    if dropna:
        counts = {key: value for key, value in counts.items() if key is not None}
    expected = Series(counts, dtype="int64")
    if sort:
        expected = expected.sort_index()
    if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by):
        expected.index = expected.index.astype(int)

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)
    result = grouped.size()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(sort, by):
    df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
    left = df.groupby(by=by, sort=sort).size()
    right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
    tm.assert_series_equal(left, right, check_names=False)


def test_size_series_dataframe():
    # https://github.com/pandas-dev/pandas/issues/11699
    df = DataFrame(columns=["A", "B"])
    out = Series(dtype="int64", index=Index([], name="A"))
    tm.assert_series_equal(df.groupby("A").size(), out)


def test_size_groupby_all_null():
    # https://github.com/pandas-dev/pandas/issues/23050
    # Assert no 'Value Error : Length of passed values is 2, index implies 0'
    df = DataFrame({"A": [None, None]})  # all-null groups
    result = df.groupby("A").size()
    expected = Series(dtype="int64", index=Index([], name="A"))
    tm.assert_series_equal(result, expected)


def test_size_period_index():
    # https://github.com/pandas-dev/pandas/issues/34010
    ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
    grp = ser.groupby(level="A")
    result = grp.size()
    tm.assert_series_equal(result, ser)


@pytest.mark.parametrize("as_index", [True, False])
def test_size_on_categorical(as_index):
    df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
    df["A"] = df["A"].astype("category")
    result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()

    expected = DataFrame(
        [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
    )
    expected["A"] = expected["A"].astype("category")
    if as_index:
        expected = expected.set_index(["A", "B"])["size"].rename(None)

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_size_series_masked_type_returns_Int64(dtype):
    # GH 54132
    ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
    result = ser.groupby(level=0).size()
    expected = Series([2, 1], dtype="Int64", index=["a", "b"])
    tm.assert_series_equal(result, expected)


def test_size_strings(any_string_dtype, using_infer_string):
    # GH#55627
    dtype = any_string_dtype
    df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
    result = df.groupby("a")["b"].size()
    exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
    exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype
    expected = Series(
        [2, 1],
        index=Index(["a", "b"], name="a", dtype=exp_index_dtype),
        name="b",
        dtype=exp_dtype,
    )
    tm.assert_series_equal(result, expected)
@ -0,0 +1,27 @@
import numpy as np

import pandas as pd
import pandas._testing as tm


def test_groupby_skew_equivalence():
    # Test that the groupby skew method (which uses libgroupby.group_skew)
    # matches the results of operating group-by-group (which uses nanops.nanskew)
    nrows = 1000
    ngroups = 3
    ncols = 2
    nan_frac = 0.05

    arr = np.random.default_rng(2).standard_normal((nrows, ncols))
    arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan

    df = pd.DataFrame(arr)
    grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
    gb = df.groupby(grps)

    result = gb.skew()

    grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
    expected = pd.concat(grpwise, axis=0)
    expected.index = expected.index.astype(result.index.dtype)  # 32bit builds
    tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large
@ -0,0 +1,83 @@
"""
Tests that apply to all groupby operation methods.

The only tests that should appear here are those that use the `groupby_func` fixture.
Even if it does use that fixture, prefer a more specific test file if one is available,
such as:

- test_categorical
- test_groupby_dropna
- test_groupby_subclass
- test_raises
"""

import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


def test_multiindex_group_all_columns_when_empty(groupby_func):
    # GH 32464
    df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
    gb = df.groupby(["a", "b", "c"], group_keys=False)
    method = getattr(gb, groupby_func)
    args = get_groupby_method_args(groupby_func, df)

    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        result = method(*args).index
    expected = df.index
    tm.assert_index_equal(result, expected)


def test_duplicate_columns(request, groupby_func, as_index):
    # GH#50806
    if groupby_func == "corrwith":
        msg = "GH#50845 - corrwith fails when there are duplicate columns"
        request.applymarker(pytest.mark.xfail(reason=msg))
    df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby("a", as_index=as_index)
    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        result = getattr(gb, groupby_func)(*args)

    expected_df = df.set_axis(["a", "b", "c"], axis=1)
    expected_args = get_groupby_method_args(groupby_func, expected_df)
    expected_gb = expected_df.groupby("a", as_index=as_index)
    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        expected = getattr(expected_gb, groupby_func)(*expected_args)
    if groupby_func not in ("size", "ngroup", "cumcount"):
        expected = expected.rename(columns={"c": "b"})
    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "idx",
    [
        pd.Index(["a", "a"], name="foo"),
        pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
    ],
)
def test_dup_labels_output_shape(groupby_func, idx):
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip(f"Not applicable for {groupby_func}")

    df = DataFrame([[1, 1]], columns=idx)
    grp_by = df.groupby([0])

    args = get_groupby_method_args(groupby_func, df)
    warn = FutureWarning if groupby_func == "fillna" else None
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=warn_msg):
        result = getattr(grp_by, groupby_func)(*args)

    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)
265
lib/python3.11/site-packages/pandas/tests/groupby/test_api.py
Normal file
@ -0,0 +1,265 @@
|
||||
"""
|
||||
Tests of the groupby API, including internal consistency and with other pandas objects.
|
||||
|
||||
Tests in this file should only check the existence, names, and arguments of groupby
|
||||
methods. It should not test the results of any groupby operation.
|
||||
"""
|
||||
|
||||
import inspect

import pytest

from pandas import (
    DataFrame,
    Series,
)
from pandas.core.groupby.base import (
    groupby_other_methods,
    reduction_kernels,
    transformation_kernels,
)
from pandas.core.groupby.generic import (
    DataFrameGroupBy,
    SeriesGroupBy,
)


def test_tab_completion(multiindex_dataframe_random_data):
    grp = multiindex_dataframe_random_data.groupby(level="second")
    results = {v for v in dir(grp) if not v.startswith("_")}
    expected = {
        "A",
        "B",
        "C",
        "agg",
        "aggregate",
        "apply",
        "boxplot",
        "filter",
        "first",
        "get_group",
        "groups",
        "hist",
        "indices",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "ngroups",
        "nth",
        "ohlc",
        "plot",
        "prod",
        "size",
        "std",
        "sum",
        "transform",
        "var",
        "sem",
        "count",
        "nunique",
        "head",
        "describe",
        "cummax",
        "quantile",
        "rank",
        "cumprod",
        "tail",
        "resample",
        "cummin",
        "fillna",
        "cumsum",
        "cumcount",
        "ngroup",
        "all",
        "shift",
        "skew",
        "take",
        "pct_change",
        "any",
        "corr",
        "corrwith",
        "cov",
        "dtypes",
        "ndim",
        "diff",
        "idxmax",
        "idxmin",
        "ffill",
        "bfill",
        "rolling",
        "expanding",
        "pipe",
        "sample",
        "ewm",
        "value_counts",
    }
    assert results == expected


def test_all_methods_categorized(multiindex_dataframe_random_data):
    grp = multiindex_dataframe_random_data.groupby(
        multiindex_dataframe_random_data.iloc[:, 0]
    )
    names = {_ for _ in dir(grp) if not _.startswith("_")} - set(
        multiindex_dataframe_random_data.columns
    )
    new_names = set(names)
    new_names -= reduction_kernels
    new_names -= transformation_kernels
    new_names -= groupby_other_methods

    assert not reduction_kernels & transformation_kernels
    assert not reduction_kernels & groupby_other_methods
    assert not transformation_kernels & groupby_other_methods

    # new public method?
    if new_names:
msg = f"""
|
||||
There are uncategorized methods defined on the Grouper class:
|
||||
{new_names}.
|
||||
|
||||
Was a new method recently added?
|
||||
|
||||
Every public method On Grouper must appear in exactly one the
|
||||
following three lists defined in pandas.core.groupby.base:
|
||||
- `reduction_kernels`
|
||||
- `transformation_kernels`
|
||||
- `groupby_other_methods`
|
||||
see the comments in pandas/core/groupby/base.py for guidance on
|
||||
how to fix this test.
|
||||
"""
|
||||
raise AssertionError(msg)

    # removed a public method?
    all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
    if names != all_categorized:
        msg = f"""
Some methods which are supposed to be on the Grouper class
are missing:
{all_categorized - names}.

They're still defined in one of the lists that live in pandas/core/groupby/base.py.
If you removed a method, you should also remove it from those lists.
"""
        raise AssertionError(msg)
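
# Aside (a sketch, not part of the original suite): the three kernel lists used
# above are importable set-like constants, so the partition they enforce can be
# poked at directly, assuming the pandas 2.x layout already imported above:
assert "sum" in reduction_kernels
assert "cumsum" in transformation_kernels
assert not reduction_kernels & transformation_kernels  # categories are disjoint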


def test_frame_consistency(groupby_func):
    # GH#48028
    if groupby_func in ("first", "last"):
        msg = "first and last are entirely different between frame and groupby"
        pytest.skip(reason=msg)

    if groupby_func in ("cumcount", "ngroup"):
        assert not hasattr(DataFrame, groupby_func)
        return

    frame_method = getattr(DataFrame, groupby_func)
    gb_method = getattr(DataFrameGroupBy, groupby_func)
    result = set(inspect.signature(gb_method).parameters)
    if groupby_func == "size":
        # "size" is a method on GroupBy but property on DataFrame:
        expected = {"self"}
    else:
        expected = set(inspect.signature(frame_method).parameters)

    # Exclude certain arguments from result and expected depending on the operation
    # Some of these may be purposeful inconsistencies between the APIs
    exclude_expected, exclude_result = set(), set()
    if groupby_func in ("any", "all"):
        exclude_expected = {"kwargs", "bool_only", "axis"}
    elif groupby_func in ("count",):
        exclude_expected = {"numeric_only", "axis"}
    elif groupby_func in ("nunique",):
        exclude_expected = {"axis"}
    elif groupby_func in ("max", "min"):
        exclude_expected = {"axis", "kwargs", "skipna"}
        exclude_result = {"min_count", "engine", "engine_kwargs"}
    elif groupby_func in ("mean", "std", "sum", "var"):
        exclude_expected = {"axis", "kwargs", "skipna"}
        exclude_result = {"engine", "engine_kwargs"}
    elif groupby_func in ("median", "prod", "sem"):
        exclude_expected = {"axis", "kwargs", "skipna"}
    elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
        exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
    elif groupby_func in ("cummax", "cummin"):
        exclude_expected = {"skipna", "args"}
        exclude_result = {"numeric_only"}
    elif groupby_func in ("cumprod", "cumsum"):
        exclude_expected = {"skipna"}
    elif groupby_func in ("pct_change",):
        exclude_expected = {"kwargs"}
        exclude_result = {"axis"}
    elif groupby_func in ("rank",):
        exclude_expected = {"numeric_only"}
    elif groupby_func in ("quantile",):
        exclude_expected = {"method", "axis"}

    # Ensure excluded arguments are actually in the signatures
    assert result & exclude_result == exclude_result
    assert expected & exclude_expected == exclude_expected

    result -= exclude_result
    expected -= exclude_expected
    assert result == expected
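
# Aside (a sketch, not part of the original file): the comparison above is plain
# set algebra over parameter names from inspect.signature, and can be reproduced
# by hand for a single method:
def _signature_gap(frame_method, gb_method):
    # Returns (groupby-only params, frame-only params); the names that the test
    # excludes are exactly these known, deliberate API gaps.
    frame_params = set(inspect.signature(frame_method).parameters)
    gb_params = set(inspect.signature(gb_method).parameters)
    return gb_params - frame_params, frame_params - gb_params


# e.g. _signature_gap(DataFrame.sum, DataFrameGroupBy.sum) is expected to show
# groupby-only parameters such as "engine"/"engine_kwargs".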


def test_series_consistency(request, groupby_func):
    # GH#48028
    if groupby_func in ("first", "last"):
        pytest.skip("first and last are entirely different between Series and groupby")

    if groupby_func in ("cumcount", "corrwith", "ngroup"):
        assert not hasattr(Series, groupby_func)
        return

    series_method = getattr(Series, groupby_func)
    gb_method = getattr(SeriesGroupBy, groupby_func)
    result = set(inspect.signature(gb_method).parameters)
    if groupby_func == "size":
        # "size" is a method on GroupBy but property on Series
        expected = {"self"}
    else:
        expected = set(inspect.signature(series_method).parameters)

    # Exclude certain arguments from result and expected depending on the operation
    # Some of these may be purposeful inconsistencies between the APIs
    exclude_expected, exclude_result = set(), set()
    if groupby_func in ("any", "all"):
        exclude_expected = {"kwargs", "bool_only", "axis"}
    elif groupby_func in ("diff",):
        exclude_result = {"axis"}
    elif groupby_func in ("max", "min"):
        exclude_expected = {"axis", "kwargs", "skipna"}
        exclude_result = {"min_count", "engine", "engine_kwargs"}
    elif groupby_func in ("mean", "std", "sum", "var"):
        exclude_expected = {"axis", "kwargs", "skipna"}
        exclude_result = {"engine", "engine_kwargs"}
    elif groupby_func in ("median", "prod", "sem"):
        exclude_expected = {"axis", "kwargs", "skipna"}
    elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
        exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
    elif groupby_func in ("cummax", "cummin"):
        exclude_expected = {"skipna", "args"}
        exclude_result = {"numeric_only"}
    elif groupby_func in ("cumprod", "cumsum"):
        exclude_expected = {"skipna"}
    elif groupby_func in ("pct_change",):
        exclude_expected = {"kwargs"}
        exclude_result = {"axis"}
    elif groupby_func in ("rank",):
        exclude_expected = {"numeric_only"}
    elif groupby_func in ("idxmin", "idxmax"):
        exclude_expected = {"args", "kwargs"}
    elif groupby_func in ("quantile",):
        exclude_result = {"numeric_only"}

    # Ensure excluded arguments are actually in the signatures
    assert result & exclude_result == exclude_result
    assert expected & exclude_expected == exclude_expected

    result -= exclude_result
    expected -= exclude_expected
    assert result == expected
1605
lib/python3.11/site-packages/pandas/tests/groupby/test_apply.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,163 @@
import numpy as np

import pandas as pd
import pandas._testing as tm


def test_group_by_copy():
    # GH#44803
    df = pd.DataFrame(
        {
            "name": ["Alice", "Bob", "Carl"],
            "age": [20, 21, 20],
        }
    ).set_index("name")

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grp_by_same_value = df.groupby(["age"], group_keys=False).apply(
            lambda group: group
        )
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grp_by_copy = df.groupby(["age"], group_keys=False).apply(
            lambda group: group.copy()
        )
    tm.assert_frame_equal(grp_by_same_value, grp_by_copy)


def test_mutate_groups():
    # GH3380

    df = pd.DataFrame(
        {
            "cat1": ["a"] * 8 + ["b"] * 6,
            "cat2": ["c"] * 2
            + ["d"] * 2
            + ["e"] * 2
            + ["f"] * 2
            + ["c"] * 2
            + ["d"] * 2
            + ["e"] * 2,
            "cat3": [f"g{x}" for x in range(1, 15)],
            "val": np.random.default_rng(2).integers(100, size=14),
        }
    )

    def f_copy(x):
        x = x.copy()
        x["rank"] = x.val.rank(method="min")
        return x.groupby("cat2")["rank"].min()

    def f_no_copy(x):
        x["rank"] = x.val.rank(method="min")
        return x.groupby("cat2")["rank"].min()

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grpby_copy = df.groupby("cat1").apply(f_copy)
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
    tm.assert_series_equal(grpby_copy, grpby_no_copy)


def test_no_mutate_but_looks_like():
    # GH 8467
    # the first apply shows a mutation indicator,
    # the second does not, but both should yield the same results
    df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
    tm.assert_series_equal(result1, result2)


def test_apply_function_with_indexing(warn_copy_on_write):
    # GH: 33058
    df = pd.DataFrame(
        {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
    )

    def fn(x):
        x.loc[x.index[-1], "col2"] = 0
        return x.col2

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(
        FutureWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write
    ):
        result = df.groupby(["col1"], as_index=False).apply(fn)
    expected = pd.Series(
        [1, 2, 0, 4, 5, 0],
        index=pd.MultiIndex.from_tuples(
            [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
        ),
        name="col2",
    )
    tm.assert_series_equal(result, expected)


def test_apply_mutate_columns_multiindex():
    # GH 12652
    df = pd.DataFrame(
        {
            ("C", "julian"): [1, 2, 3],
            ("B", "geoffrey"): [1, 2, 3],
            ("A", "julian"): [1, 2, 3],
            ("B", "julian"): [1, 2, 3],
            ("A", "geoffrey"): [1, 2, 3],
            ("C", "geoffrey"): [1, 2, 3],
        },
        columns=pd.MultiIndex.from_tuples(
            [
                ("A", "julian"),
                ("A", "geoffrey"),
                ("B", "julian"),
                ("B", "geoffrey"),
                ("C", "julian"),
                ("C", "geoffrey"),
            ]
        ),
    )

    def add_column(grouped):
        name = grouped.columns[0][1]
        grouped["sum", name] = grouped.sum(axis=1)
        return grouped

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=1, axis=1)
    result = gb.apply(add_column)
    expected = pd.DataFrame(
        [
            [1, 1, 1, 3, 1, 1, 1, 3],
            [2, 2, 2, 6, 2, 2, 2, 6],
            [3, 3, 3, 9, 3, 3, 3, 9],
        ],
        columns=pd.MultiIndex.from_tuples(
            [
                ("geoffrey", "A", "geoffrey"),
                ("geoffrey", "B", "geoffrey"),
                ("geoffrey", "C", "geoffrey"),
                ("geoffrey", "sum", "geoffrey"),
                ("julian", "A", "julian"),
                ("julian", "B", "julian"),
                ("julian", "C", "julian"),
                ("julian", "sum", "julian"),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,65 @@
import numpy as np
import pytest

from pandas._libs import lib
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm


def assert_block_lengths(x):
    assert len(x) == len(x._mgr.blocks[0].mgr_locs)
    return 0


def cumsum_max(x):
    x.cumsum().max()
    return 0


@pytest.mark.parametrize(
    "func",
    [
        cumsum_max,
        pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test),
    ],
)
def test_mgr_locs_updated(func):
    # https://github.com/pandas-dev/pandas/issues/31802
    # Some operations may require creating new blocks, which requires
    # valid mgr_locs
    df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
    result = df.groupby(["A", "B"]).agg(func)
    expected = pd.DataFrame(
        {"C": [0, 0]},
        index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "binner,closed,expected",
    [
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "left",
            np.array([2, 5, 6], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "right",
            np.array([3, 6, 6], dtype=np.int64),
        ),
        (np.array([0, 3, 6], dtype=np.int64), "left", np.array([2, 5], dtype=np.int64)),
        (
            np.array([0, 3, 6], dtype=np.int64),
            "right",
            np.array([3, 6], dtype=np.int64),
        ),
    ],
)
def test_generate_bins(binner, closed, expected):
    values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
    result = lib.generate_bins_dt64(values, binner, closed=closed)
    tm.assert_numpy_array_equal(result, expected)
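
# A hedged reading of the expectations above (not from the original file): for
# sorted int64 values, lib.generate_bins_dt64 appears to return, per bin, the
# position one past the last value falling inside it. With values [1, 2, 3, 4, 5, 6]
# and binner [0, 3, 6, 9]:
#   closed="left"  -> bins [0, 3), [3, 6), [6, 9) -> edge positions [2, 5, 6]
#   closed="right" -> bins (0, 3], (3, 6], (6, 9] -> edge positions [3, 6, 6]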
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,394 @@
from itertools import product
from string import ascii_lowercase

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Period,
    Series,
    Timedelta,
    Timestamp,
    date_range,
)
import pandas._testing as tm


class TestCounting:
    def test_cumcount(self):
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3])

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.cumcount())
        tm.assert_series_equal(e, se.cumcount())

    def test_cumcount_dupe_index(self):
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=mi)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_groupby_not_col(self):
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_ngroup(self):
        df = DataFrame({"A": list("aaaba")})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_distinct(self):
        df = DataFrame({"A": list("abcde")})
        g = df.groupby("A")
        sg = g.A

        expected = Series(range(5), dtype="int64")

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_one_group(self):
        df = DataFrame({"A": [0] * 5})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.ngroup())
        tm.assert_series_equal(e, se.ngroup())

    def test_ngroup_series_matches_frame(self):
        df = DataFrame({"A": list("aaaba")})
        s = Series(list("aaaba"))

        tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())

    def test_ngroup_dupe_index(self):
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame({"A": list("aaaba")}, index=mi)
        g = df.groupby("A")
        sg = g.A
        expected = Series([0, 0, 0, 1, 0], index=mi)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_groupby_not_col(self):
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_descending(self):
        df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
        g = df.groupby(["A"])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
        tm.assert_series_equal(ascending, g.ngroup(ascending=True))
        tm.assert_series_equal(descending, g.ngroup(ascending=False))

    def test_ngroup_matches_cumcount(self):
        # verify one manually-worked out case works
        df = DataFrame(
            [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
            columns=["A", "X"],
        )
        g = df.groupby(["A", "X"])
        g_ngroup = g.ngroup()
        g_cumcount = g.cumcount()
        expected_ngroup = Series([0, 1, 2, 0, 3])
        expected_cumcount = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(g_ngroup, expected_ngroup)
        tm.assert_series_equal(g_cumcount, expected_cumcount)
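
    # A mnemonic for the two methods exercised above (an aside, not part of the
    # original suite): ngroup() labels *which* group a row belongs to, while
    # cumcount() labels the row's position *within* its group. For
    # DataFrame({"A": list("aab")}).groupby("A"):
    #   ngroup()   -> 0, 0, 1
    #   cumcount() -> 0, 1, 0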

    def test_ngroup_cumcount_pair(self):
        # brute force comparison for all small series
        for p in product(range(3), repeat=4):
            df = DataFrame({"a": p})
            g = df.groupby(["a"])

            order = sorted(set(p))
            ngroupd = [order.index(val) for val in p]
            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]

            tm.assert_series_equal(g.ngroup(), Series(ngroupd))
            tm.assert_series_equal(g.cumcount(), Series(cumcounted))

    def test_ngroup_respects_groupby_order(self, sort):
        df = DataFrame({"a": np.random.default_rng(2).choice(list("abcdef"), 100)})
        g = df.groupby("a", sort=sort)
        df["group_id"] = -1
        df["group_index"] = -1

        for i, (_, group) in enumerate(g):
            df.loc[group.index, "group_id"] = i
            for j, ind in enumerate(group.index):
                df.loc[ind, "group_index"] = j

        tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
        tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())

    @pytest.mark.parametrize(
        "datetimelike",
        [
            [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
            [Timedelta(x, unit="h") for x in range(1, 4)],
            [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
        ],
    )
    def test_count_with_datetimelike(self, datetimelike):
        # test for GH#13393, where DataFrameGroupBy.count() fails
        # when counting a datetimelike column.

        df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
        res = df.groupby("x").count()
        expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
        expected.index.name = "x"
        tm.assert_frame_equal(expected, res)

    def test_count_with_only_nans_in_first_group(self):
        # GH21956
        df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
        result = df.groupby(["A", "B"]).C.count()
        mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
        expected = Series([], index=mi, dtype=np.int64, name="C")
        tm.assert_series_equal(result, expected, check_index_type=False)

    def test_count_groupby_column_with_nan_in_groupby_column(self):
        # https://github.com/pandas-dev/pandas/issues/32841
        df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
        res = df.groupby(["B"]).count()
        expected = DataFrame(
            index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
        )
        tm.assert_frame_equal(expected, res)

    def test_groupby_count_dateparseerror(self):
        dr = date_range(start="1/1/2012", freq="5min", periods=10)

        # BAD Example, datetimes first
        ser = Series(np.arange(10), index=[dr, np.arange(10)])
        grouped = ser.groupby(lambda x: x[1] % 2 == 0)
        result = grouped.count()

        ser = Series(np.arange(10), index=[np.arange(10), dr])
        grouped = ser.groupby(lambda x: x[0] % 2 == 0)
        expected = grouped.count()

        tm.assert_series_equal(result, expected)


def test_groupby_timedelta_cython_count():
    df = DataFrame(
        {"g": list("ab" * 2), "delta": np.arange(4).astype("timedelta64[ns]")}
    )
    expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delta")
    result = df.groupby("g").delta.count()
    tm.assert_series_equal(expected, result)


def test_count():
    n = 1 << 15
    dr = date_range("2015-08-30", periods=n // 10, freq="min")

    df = DataFrame(
        {
            "1st": np.random.default_rng(2).choice(list(ascii_lowercase), n),
            "2nd": np.random.default_rng(2).integers(0, 5, n),
            "3rd": np.random.default_rng(2).standard_normal(n).round(3),
            "4th": np.random.default_rng(2).integers(-10, 10, n),
            "5th": np.random.default_rng(2).choice(dr, n),
            "6th": np.random.default_rng(2).standard_normal(n).round(3),
            "7th": np.random.default_rng(2).standard_normal(n).round(3),
            "8th": np.random.default_rng(2).choice(dr, n)
            - np.random.default_rng(2).choice(dr, 1),
            "9th": np.random.default_rng(2).choice(list(ascii_lowercase), n),
        }
    )

    for col in df.columns.drop(["1st", "2nd", "4th"]):
        df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan

    df["9th"] = df["9th"].astype("category")

    for key in ["1st", "2nd", ["1st", "2nd"]]:
        left = df.groupby(key).count()
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
        tm.assert_frame_equal(left, right)


def test_count_non_nulls():
    # GH#5610
    # count counts non-nulls
    df = DataFrame(
        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
        columns=["A", "B", "C"],
    )

    count_as = df.groupby("A").count()
    count_not_as = df.groupby("A", as_index=False).count()

    expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
    expected.index.name = "A"
    tm.assert_frame_equal(count_not_as, expected.reset_index())
    tm.assert_frame_equal(count_as, expected)

    count_B = df.groupby("A")["B"].count()
    tm.assert_series_equal(count_B, expected["B"])


def test_count_object():
    df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
    result = df.groupby("c").a.count()
    expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
    tm.assert_series_equal(result, expected)

    df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
    result = df.groupby("c").a.count()
    expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
    tm.assert_series_equal(result, expected)


def test_count_cross_type():
    # GH8169
    # Set float64 dtype to avoid upcast when setting nan below
    vals = np.hstack(
        (
            np.random.default_rng(2).integers(0, 5, (100, 2)),
            np.random.default_rng(2).integers(0, 2, (100, 2)),
        )
    ).astype("float64")

    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()

    for t in ["float32", "object"]:
        df["a"] = df["a"].astype(t)
        df["b"] = df["b"].astype(t)
        result = df.groupby(["c", "d"]).count()
        tm.assert_frame_equal(result, expected)


def test_lower_int_prec_count():
    df = DataFrame(
        {
            "a": np.array([0, 1, 2, 100], np.int8),
            "b": np.array([1, 2, 3, 6], np.uint32),
            "c": np.array([4, 5, 6, 8], np.int16),
            "grp": list("ab" * 2),
        }
    )
    result = df.groupby("grp").count()
    expected = DataFrame(
        {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
    )
    tm.assert_frame_equal(result, expected)


def test_count_uses_size_on_exception():
    class RaisingObjectException(Exception):
        pass

    class RaisingObject:
        def __init__(self, msg="I will raise inside Cython") -> None:
            super().__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
    result = df.groupby("grp").count()
    expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
    tm.assert_frame_equal(result, expected)


def test_count_arrow_string_array(any_string_dtype):
    # GH#54751
    pytest.importorskip("pyarrow")
    df = DataFrame(
        {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)}
    )
    result = df.groupby("a").count()
    expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a"))
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,319 @@
import numpy as np
import pytest

from pandas.errors import UnsupportedFunctionCall
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm


@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
    """
    Fixture of dtypes with min and max values used for testing
    cummin and cummax
    """
    dtype = request.param

    np_type = dtype
    if dtype == "Int64":
        np_type = np.int64
    elif dtype == "Float64":
        np_type = np.float64

    min_val = (
        np.iinfo(np_type).min
        if np.dtype(np_type).kind == "i"
        else np.finfo(np_type).min
    )
    max_val = (
        np.iinfo(np_type).max
        if np.dtype(np_type).kind == "i"
        else np.finfo(np_type).max
    )

    return (dtype, min_val, max_val)


def test_groupby_cumprod():
    # GH 4095
    df = DataFrame({"key": ["b"] * 10, "value": 2})

    actual = df.groupby("key")["value"].cumprod()
    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)

    df = DataFrame({"key": ["b"] * 100, "value": 2})
    df["value"] = df["value"].astype(float)
    actual = df.groupby("key")["value"].cumprod()
    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)


@pytest.mark.skip_ubsan
def test_groupby_cumprod_overflow():
    # GH#37493 if we overflow we return garbage consistent with numpy
    df = DataFrame({"key": ["b"] * 4, "value": 100_000})
    actual = df.groupby("key")["value"].cumprod()
    expected = Series(
        [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
        name="value",
    )
    tm.assert_series_equal(actual, expected)

    numpy_result = df.groupby("key", group_keys=False)["value"].apply(
        lambda x: x.cumprod()
    )
    numpy_result.name = "value"
    tm.assert_series_equal(actual, numpy_result)
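
# A hedged sanity check (an aside, not part of the original suite): the wrapped
# value above is exactly numpy's int64 cumulative product, since
# 100_000 ** 4 == 10**20 reduces to 7766279631452241920 modulo 2**64.
assert np.cumprod(np.array([100_000] * 4, dtype=np.int64))[-1] == 7766279631452241920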


def test_groupby_cumprod_nan_influences_other_columns():
    # GH#48064
    df = DataFrame(
        {
            "a": 1,
            "b": [1, np.nan, 2],
            "c": [1, 2, 3.0],
        }
    )
    result = df.groupby("a").cumprod(numeric_only=True, skipna=False)
    expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
    tm.assert_frame_equal(result, expected)


def test_cummin(dtypes_for_minmax):
    dtype = dtypes_for_minmax[0]
    min_val = dtypes_for_minmax[1]

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_mins}).astype(dtype)
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ min value for dtype
    df.loc[[2, 6], "B"] = min_val
    df.loc[[1, 5], "B"] = min_val + 1
    expected.loc[[2, 3, 6, 7], "B"] = min_val
    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected, check_exact=True)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected, check_exact=True)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
    result = base_df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummin()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
    result = df.groupby("a").b.cummin()
    expected = Series([1, 2, 1], name="b")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
    base_df["B"] = base_df["B"].astype(dtype)
    grouped = base_df.groupby("A")

    expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)
    result = getattr(grouped, method)()
    tm.assert_frame_equal(expected, result)

    result = getattr(grouped["B"], method)().to_frame()
    tm.assert_frame_equal(expected, result)


def test_cummax(dtypes_for_minmax):
    dtype = dtypes_for_minmax[0]
    max_val = dtypes_for_minmax[2]

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_maxs}).astype(dtype)
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ max value for dtype
    df.loc[[2, 6], "B"] = max_val
    expected.loc[[2, 3, 6, 7], "B"] = max_val
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
    result = base_df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummax()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
    result = df.groupby("a").b.cummax()
    expected = Series([2, 1, 2], name="b")
    tm.assert_series_equal(result, expected)


def test_cummax_i8_at_implementation_bound():
    # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
    # for int64 dtype GH#46382
    ser = Series([pd.NaT._value + n for n in range(5)])
    df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")})
    gb = df.groupby("A")

    res = gb.cummax()
    exp = df[["B", "C"]]
    tm.assert_frame_equal(res, exp)
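
# Background note (an assumption worth flagging, not from the original file):
# datetime64/timedelta64 data reuse the most negative int64 value as the NaT
# sentinel (NPY_NAT == np.iinfo("int64").min == pd.NaT._value), which is why the
# test above probes the cummax kernel with values right at that implementation
# bound.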


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
    "groups,expected_data",
    [
        ([1, 1, 1], [1, None, None]),
        ([1, 2, 3], [1, None, 2]),
        ([1, 3, 3], [1, None, None]),
    ],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
    # GH-34047
    df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
    orig = df.copy()
    gb = df.groupby(groups)["a"]

    result = getattr(gb, method)(skipna=False)
    expected = Series(expected_data, dtype=dtype, name="a")

    # check we didn't accidentally alter df
    tm.assert_frame_equal(df, orig)

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
    # Ensure missing value in "a" doesn't cause "b" to be nan-filled
    df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
    gb = df.groupby([1, 1, 1])[["a", "b"]]

    result = getattr(gb, method)(skipna=False)
    expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
    # see gh-12811
    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    g = df.groupby("A")

    msg = "numpy operations are not valid with groupby"

    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(1, 2, 3)
    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(foo=1)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
    data = [val, pd.NA]
    df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
    grouped = df.groupby("grp")

    result = grouped.transform(method)
    expected = DataFrame({"b": data}, dtype=dtype)

    tm.assert_frame_equal(result, expected)
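
# Aside on the parametrization above (not part of the original file): 2**53 + 1
# is the smallest positive integer that float64 cannot represent exactly, so an
# accidental cast to float would silently round it away. A quick check:
assert float(2**53) == 2**53
assert float(2**53 + 1) == 2**53  # rounds back down -- 2**53 + 1 would be lost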


def test_cython_api2():
    # this takes the fast apply path

    # cumsum (GH5614)
    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    result = df.groupby("A").cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    result = df.groupby("A", as_index=False).cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 13994
    msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.groupby("A").cumsum(axis=1)
    expected = df.cumsum(axis=1)
    tm.assert_frame_equal(result, expected)

    msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.groupby("A").cumprod(axis=1)
    expected = df.cumprod(axis=1)
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,636 @@
from string import ascii_lowercase

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Series,
    Timestamp,
)
import pandas._testing as tm


def test_filter_series():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = Series([20, 22, 24], index=[2, 4, 5])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
    # Test dropna=False.
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(s.index),
    )
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(s.index),
    )
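
# Aside on the dropna=False cases above (a sketch, not part of the original
# file): rows of filtered-out groups are kept but masked to NaN, so the result
# preserves the original index. For example:
#   s = Series([1, 30, 2])
#   s.groupby(s % 2).filter(lambda x: x.sum() > 10, dropna=False)
#   # index 0 -> NaN (its group sums to 1), index 1 -> 30.0, index 2 -> 2.0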


def test_filter_single_column_df():
    df = DataFrame([1, 3, 20, 5, 22, 24, 7])
    expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = DataFrame([20, 22, 24], index=[2, 4, 5])
    grouper = df[0].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
    # Test dropna=False.
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(df.index),
    )
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(df.index),
    )


def test_filter_multi_column_df():
    df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(
        grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected
    )


def test_filter_mixed_df():
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected)


def test_filter_out_all_groups():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]])


def test_filter_out_no_groups():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    filtered = grouped.filter(lambda x: x.mean() > 0)
    tm.assert_series_equal(filtered, s)
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouper = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    filtered = grouped.filter(lambda x: x["A"].mean() > 0)
    tm.assert_frame_equal(filtered, df)


def test_filter_out_all_groups_in_df():
    # GH12768
    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a")
    res = res.filter(lambda x: x["b"].sum() > 5, dropna=False)
    expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
    tm.assert_frame_equal(expected, res)

    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a")
    res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
    expected = DataFrame({"a": [], "b": []}, dtype="int64")
    tm.assert_frame_equal(expected, res)


def test_filter_condition_raises():
    def raise_if_sum_is_zero(x):
        if x.sum() == 0:
            raise ValueError
        return x.sum() > 0

    s = Series([-1, 0, 1, 2])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        grouped.filter(raise_if_sum_is_zero)


def test_filter_with_axis_in_groupby():
    # issue 11041
    index = pd.MultiIndex.from_product([range(10), [0, 1]])
    data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64")

    msg = "DataFrame.groupby with axis=1"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = data.groupby(level=0, axis=1)
    result = gb.filter(lambda x: x.iloc[0, 0] > 10)
    expected = data.iloc[:, 12:20]
    tm.assert_frame_equal(result, expected)


def test_filter_bad_shapes():
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby("B")
    g_s = s.groupby(s)

    f = lambda x: x
    msg = "filter function returned a DataFrame, but expected a scalar bool"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)

    f = lambda x: x == 1
    msg = "filter function returned a DataFrame, but expected a scalar bool"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)

    f = lambda x: np.outer(x, x)
    msg = "can't multiply sequence by non-int of type 'str'"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)


def test_filter_nan_is_false():
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby(df["B"])
    g_s = s.groupby(s)

    f = lambda x: np.nan
    tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
    tm.assert_series_equal(g_s.filter(f), s[[]])


def test_filter_pdna_is_false():
    # in particular, don't raise in filter when trying to call bool(pd.NA)
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    ser = df["B"]
    g_df = df.groupby(df["B"])
    g_s = ser.groupby(ser)

    func = lambda x: pd.NA
    res = g_df.filter(func)
    tm.assert_frame_equal(res, df.loc[[]])
    res = g_s.filter(func)
    tm.assert_series_equal(res, ser[[]])


def test_filter_against_workaround_ints():
    # Series of ints
    s = Series(np.random.default_rng(2).integers(0, 100, 100))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10

    old_way = s[grouped.transform(f).astype("bool")]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())


def test_filter_against_workaround_floats():
    # Series of floats
    s = 100 * Series(np.random.default_rng(2).random(100))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype("bool")]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())


def test_filter_against_workaround_dataframe():
    # Set up DataFrame of ints, floats, strings.
    letters = np.array(list(ascii_lowercase))
    N = 100
    random_letters = letters.take(
        np.random.default_rng(2).integers(0, 26, N, dtype=int)
    )
    df = DataFrame(
        {
            "ints": Series(np.random.default_rng(2).integers(0, 100, N)),
            "floats": N / 10 * Series(np.random.default_rng(2).random(N)),
            "letters": Series(random_letters),
        }
    )

    # Group by ints; filter on floats.
    grouped = df.groupby("ints")
    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby("letters")
    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)


def test_filter_using_len():
    # BUG GH4447
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    grouped = df.groupby("B")
    actual = grouped.filter(lambda x: len(x) > 2)
    expected = DataFrame(
        {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
        index=np.arange(2, 6, dtype=np.int64),
    )
    tm.assert_frame_equal(actual, expected)

    actual = grouped.filter(lambda x: len(x) > 4)
    expected = df.loc[[]]
    tm.assert_frame_equal(actual, expected)

    # Series have always worked properly, but we'll test anyway.
    s = df["B"]
    grouped = s.groupby(s)
    actual = grouped.filter(lambda x: len(x) > 2)
    expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
    tm.assert_series_equal(actual, expected)

    actual = grouped.filter(lambda x: len(x) > 4)
    expected = s[[]]
    tm.assert_series_equal(actual, expected)


def test_filter_maintains_ordering():
    # Simple case: index is sequential. #4621
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
    )
    s = df["pid"]
    grouped = df.groupby("tag")
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = df.iloc[[1, 2, 4, 7]]
    tm.assert_frame_equal(actual, expected)

    grouped = s.groupby(df["tag"])
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = s.iloc[[1, 2, 4, 7]]
    tm.assert_series_equal(actual, expected)

    # Now index is sequentially decreasing.
    df.index = np.arange(len(df) - 1, -1, -1)
    s = df["pid"]
    grouped = df.groupby("tag")
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = df.iloc[[1, 2, 4, 7]]
    tm.assert_frame_equal(actual, expected)

    grouped = s.groupby(df["tag"])
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = s.iloc[[1, 2, 4, 7]]
    tm.assert_series_equal(actual, expected)

    # Index is shuffled.
    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
    df.index = df.index[SHUFFLED]
    s = df["pid"]
    grouped = df.groupby("tag")
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = df.iloc[[1, 2, 4, 7]]
    tm.assert_frame_equal(actual, expected)

    grouped = s.groupby(df["tag"])
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = s.iloc[[1, 2, 4, 7]]
    tm.assert_series_equal(actual, expected)


def test_filter_multiple_timestamp():
    # GH 10114
    df = DataFrame(
        {
            "A": np.arange(5, dtype="int64"),
            "B": ["foo", "bar", "foo", "bar", "bar"],
            "C": Timestamp("20130101"),
        }
    )

    grouped = df.groupby(["B", "C"])

    result = grouped["A"].filter(lambda x: True)
    tm.assert_series_equal(df["A"], result)

    result = grouped["A"].transform(len)
    expected = Series([2, 3, 2, 3, 3], name="A")
    tm.assert_series_equal(result, expected)

    result = grouped.filter(lambda x: True)
    tm.assert_frame_equal(df, result)

    result = grouped.transform("sum")
    expected = DataFrame({"A": [2, 8, 2, 8, 8]})
    tm.assert_frame_equal(result, expected)

    result = grouped.transform(len)
    expected = DataFrame({"A": [2, 3, 2, 3, 3]})
    tm.assert_frame_equal(result, expected)


def test_filter_and_transform_with_non_unique_int_index():
    # GH4620
    index = [1, 1, 1, 2, 1, 1, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_multiple_non_unique_int_index():
    # GH4620
    index = [1, 1, 1, 2, 0, 0, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_float_index():
    # GH4620
    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_timestamp_index():
    # GH4620
    t0 = Timestamp("2013-09-30 00:05:00")
    t1 = Timestamp("2013-10-30 00:05:00")
    t2 = Timestamp("2013-11-30 00:05:00")
    index = [t1, t1, t1, t2, t1, t1, t0, t1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to avoid upcast when setting nan below
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_string_index():
|
||||
# GH4620
|
||||
index = list("bbbcbbab")
|
||||
df = DataFrame(
|
||||
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
|
||||
index=index,
|
||||
)
|
||||
grouped_df = df.groupby("tag")
|
||||
ser = df["pid"]
|
||||
grouped_ser = ser.groupby(df["tag"])
|
||||
expected_indexes = [1, 2, 4, 7]
|
||||
|
||||
# Filter DataFrame
|
||||
actual = grouped_df.filter(lambda x: len(x) > 1)
|
||||
expected = df.iloc[expected_indexes]
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
|
||||
# Cast to avoid upcast when setting nan below
|
||||
expected = df.copy().astype("float64")
|
||||
expected.iloc[[0, 3, 5, 6]] = np.nan
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
# Filter Series
|
||||
actual = grouped_ser.filter(lambda x: len(x) > 1)
|
||||
expected = ser.take(expected_indexes)
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
|
||||
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
|
||||
# ^ made manually because this can get confusing!
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
# Transform Series
|
||||
actual = grouped_ser.transform(len)
|
||||
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
# Transform (a column from) DataFrameGroupBy
|
||||
actual = grouped_df.pid.transform(len)
|
||||
tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_has_access_to_grouped_cols():
|
||||
df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
|
||||
g = df.groupby("A")
|
||||
# previously didn't have access to col A #????
|
||||
filt = g.filter(lambda x: x["A"].sum() == 2)
|
||||
tm.assert_frame_equal(filt, df.iloc[[0, 1]])
|
||||
|
||||
|
||||
def test_filter_enforces_scalarness():
|
||||
df = DataFrame(
|
||||
[
|
||||
["best", "a", "x"],
|
||||
["worst", "b", "y"],
|
||||
["best", "c", "x"],
|
||||
["best", "d", "y"],
|
||||
["worst", "d", "y"],
|
||||
["worst", "d", "y"],
|
||||
["best", "d", "z"],
|
||||
],
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
with pytest.raises(TypeError, match="filter function returned a.*"):
|
||||
df.groupby("c").filter(lambda g: g["a"] == "best")
|
||||
|
||||
|
||||
def test_filter_non_bool_raises():
|
||||
df = DataFrame(
|
||||
[
|
||||
["best", "a", 1],
|
||||
["worst", "b", 1],
|
||||
["best", "c", 1],
|
||||
["best", "d", 1],
|
||||
["worst", "d", 1],
|
||||
["worst", "d", 1],
|
||||
["best", "d", 1],
|
||||
],
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
with pytest.raises(TypeError, match="filter function returned a.*"):
|
||||
df.groupby("a").filter(lambda g: g.c.mean())
|
||||
|
||||
|
||||
def test_filter_dropna_with_empty_groups():
|
||||
# GH 10780
|
||||
data = Series(np.random.default_rng(2).random(9), index=np.repeat([1, 2, 3], 3))
|
||||
grouped = data.groupby(level=0)
|
||||
result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
|
||||
expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
|
||||
tm.assert_series_equal(result_false, expected_false)
|
||||
|
||||
result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
|
||||
expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64)
|
||||
tm.assert_series_equal(result_true, expected_true)
|
||||
|
||||
|
||||
def test_filter_consistent_result_before_after_agg_func():
|
||||
# GH 17091
|
||||
df = DataFrame({"data": range(6), "key": list("ABCABC")})
|
||||
grouper = df.groupby("key")
|
||||
result = grouper.filter(lambda x: True)
|
||||
expected = DataFrame({"data": range(6), "key": list("ABCABC")})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
grouper.sum()
|
||||
result = grouper.filter(lambda x: True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
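A recurring pattern in the filter tests above: `filter(..., dropna=False)` keeps the original shape and masks rows of rejected groups with NaN (which forces a float upcast), while the default `dropna=True` drops those rows entirely. A minimal standalone sketch of that semantics (not part of the test file):

import pandas as pd

df = pd.DataFrame({"pid": [1, 1, 2], "tag": [23, 45, 45]})
g = df.groupby("tag")

# Default: rows of groups failing the predicate are dropped
print(g.filter(lambda x: len(x) > 1))                # only the tag-45 rows remain

# dropna=False: same shape, failing rows masked with NaN
print(g.filter(lambda x: len(x) > 1, dropna=False))  # the tag-23 row becomes NaN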
3363  lib/python3.11/site-packages/pandas/tests/groupby/test_groupby.py  Normal file
File diff suppressed because it is too large
@ -0,0 +1,696 @@
import numpy as np
import pytest

from pandas.compat.pyarrow import pa_version_under10p1

from pandas.core.dtypes.missing import na_value_for_dtype

import pandas as pd
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [13.0, 233.0, 123.0],
                "e": [13.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
    dropna, tuples, outputs, nulls_fixture
):
    # GH 3729: check that the NA values end up in a single group
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # Right now, by default, MultiIndex drops NA from levels when created
    # via `from_*` constructors, so we need to add NA to the level manually.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)
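The `set_levels` workaround above is worth seeing in isolation: `MultiIndex.from_tuples` silently drops missing values from the levels, so expected indexes that should carry an NA level have to patch it back in before comparing against a `groupby(dropna=False)` result. A standalone sketch of the behavior the comment describes:

import numpy as np
import pandas as pd

tuples = [["A", "B"], ["A", np.nan], ["B", "A"]]
mi = pd.MultiIndex.from_tuples(tuples, names=["a", "b"])

# from_tuples stores the missing entry as code -1; the level itself
# only holds the observed non-null labels.
print(list(mi.levels[1]))  # ['A', 'B'] -- no NaN

# groupby(..., dropna=False) builds an index whose level does contain
# NaN, so the expected index is patched to match before comparing.
mi = mi.set_levels(["A", "B", np.nan], level="b")
print(list(mi.levels[1]))  # ['A', 'B', nan]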
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
            {
                "c": [12.0, 13.3, 123.23, 1.0],
                "d": [12.0, 234.0, 123.0, 1.0],
                "e": [12.0, 13.0, 1.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
    dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
    # GH 3729: check NA in different groups with different representations
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        [nulls_fixture2, "B", 1, 1, 1.0],
        ["A", nulls_fixture2, 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # Right now, by default, MultiIndex drops NA from levels when created
    # via `from_*` constructors, so we need to add NA to the levels manually.
    if not dropna:
        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, idx, outputs",
    [
        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
        (
            False,
            ["A", "B", np.nan],
            {
                "b": [123.23, 13.0, 12.3],
                "c": [123.0, 13.0, 233.0],
                "d": [1.0, 13.0, 12.0],
            },
        ),
    ],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
    # GH 3729
    df_list = [
        ["B", 12, 12, 12],
        [None, 12.3, 233.0, 12],
        ["A", 123.23, 123, 1],
        ["B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
    grouped = df.groupby("a", dropna=dropna).sum()

    expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a"))

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, idx, expected",
    [
        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
        (
            False,
            ["a", "a", "b", np.nan],
            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
        ),
    ],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
    ser = pd.Series([1, 2, 3, 3], index=idx)

    result = ser.groupby(level=0, dropna=dropna).sum()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "dropna, expected",
    [
        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
        (
            False,
            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
        ),
    ],
)
def test_groupby_dropna_series_by(dropna, expected):
    ser = pd.Series(
        [390.0, 350.0, 30.0, 20.0],
        index=["Falcon", "Falcon", "Parrot", "Parrot"],
        name="Max Speed",
    )

    result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dropna", (False, True))
def test_grouper_dropna_propagation(dropna):
    # GH 36604
    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
    gb = df.groupby("A", dropna=dropna)
    assert gb._grouper.dropna == dropna


@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(0, 4),
        list("abcd"),
        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
    ],
)
def test_groupby_dataframe_slice_then_transform(dropna, index):
    # GH35014 & GH35612
    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}

    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
    gb = df.groupby("A", dropna=dropna)

    result = gb.transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb[["B"]].transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb["B"].transform(len)
    expected = pd.Series(expected_data["B"], index=index, name="B")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [12.0, 233.0, 123.0],
                "e": [1.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
    # GH 3729
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", None, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    agg_dict = {"c": "sum", "d": "max", "e": "min"}
    grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict)

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # Right now, by default, MultiIndex drops NA from levels when created
    # via `from_*` constructors, so we need to add NA to the level manually.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.arm_slow
@pytest.mark.parametrize(
    "datetime1, datetime2",
    [
        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
    ],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
    # GH 3729
    df = pd.DataFrame(
        {
            "values": [1, 2, 3, 4, 5, 6],
            "dt": [
                datetime1,
                unique_nulls_fixture,
                datetime2,
                unique_nulls_fixture2,
                datetime1,
                datetime1,
            ],
        }
    )

    if dropna:
        indexes = [datetime1, datetime2]
    else:
        indexes = [datetime1, datetime2, np.nan]

    grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"})
    expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))

    tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    # GH 35889

    df = pd.DataFrame(data)
    gb = df.groupby("groups", dropna=dropna)
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))

    mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
    mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
    # Right now, by default, MultiIndex drops NA from levels when created
    # via `from_*` constructors, so we need to add NA to the level manually.
    if not dropna and levels:
        mi = mi.set_levels(levels, level="groups")

    expected = pd.DataFrame(selected_data, index=mi)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
@pytest.mark.parametrize("series", [True, False])
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
    # GH#46783
    obj = pd.DataFrame(
        {
            "a": [1, np.nan],
            "b": [1, 1],
            "c": [2, 3],
        }
    )

    expected = obj.set_index(keys)
    if series:
        expected = expected["c"]
    elif input_index == ["a", "b"] and keys == ["a"]:
        # Column b should not be aggregated
        expected = expected[["c"]]

    if input_index is not None:
        obj = obj.set_index(input_index)
    gb = obj.groupby(keys, dropna=False)
    if series:
        gb = gb["c"]
    result = gb.sum()

    tm.assert_equal(result, expected)


def test_groupby_nan_included():
    # GH 35646
    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    df = pd.DataFrame(data)
    grouped = df.groupby("group", dropna=False)
    result = grouped.indices
    dtype = np.intp
    expected = {
        "g1": np.array([0, 2], dtype=dtype),
        "g2": np.array([3], dtype=dtype),
        np.nan: np.array([1, 4], dtype=dtype),
    }
    for result_values, expected_values in zip(result.values(), expected.values()):
        tm.assert_numpy_array_equal(result_values, expected_values)
    assert np.isnan(list(result.keys())[2])
    assert list(result.keys())[0:2] == ["g1", "g2"]


def test_groupby_drop_nan_with_multi_index():
    # GH 39895
    df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
    df = df.set_index(["a", "b"])
    result = df.groupby(["a", "b"], dropna=False).first()
    expected = df
    tm.assert_frame_equal(result, expected)


# sequence_index enumerates all strings made up of x, y, z of length 4
@pytest.mark.parametrize("sequence_index", range(3**4))
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "UInt8",
        "Int8",
        "UInt16",
        "Int16",
        "UInt32",
        "Int32",
        "UInt64",
        "Int64",
        "Float32",
        "Float64",
        "category",
        "string",
        pytest.param(
            "string[pyarrow]",
            marks=pytest.mark.skipif(
                pa_version_under10p1, reason="pyarrow is not installed"
            ),
        ),
        "datetime64[ns]",
        "period[d]",
        "Sparse[float]",
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index):
    # GH#46584, GH#48794

    # Convert sequence_index into a string sequence, e.g. 5 becomes "zyxx"
    # This sequence is used for the grouper.
    sequence = "".join(
        [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
    )

    # Unique values to use for grouper, depends on dtype
    if dtype in ("string", "string[pyarrow]"):
        uniques = {"x": "x", "y": "y", "z": pd.NA}
    elif dtype in ("datetime64[ns]", "period[d]"):
        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
    else:
        uniques = {"x": 1, "y": 2, "z": np.nan}

    df = pd.DataFrame(
        {
            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
            "a": [0, 1, 2, 3],
        }
    )
    gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False)
    if test_series:
        gb = gb["a"]
    result = gb.sum()

    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
    # issues with hashing np.nan
    summed = {}
    for idx, label in enumerate(sequence):
        summed[label] = summed.get(label, 0) + idx
    if dtype == "category":
        index = pd.CategoricalIndex(
            [uniques[e] for e in summed],
            df["key"].cat.categories,
            name="key",
        )
    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
        index = pd.Index(
            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
        )
    else:
        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
    if not test_series:
        expected = expected.to_frame()
    if not as_index:
        expected = expected.reset_index()
        if dtype is not None and dtype.startswith("Sparse"):
            expected["key"] = expected["key"].astype(dtype)

    tm.assert_equal(result, expected)
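The base-3 decoding in `test_no_sort_keep_na` is easy to misread, so here is the same formula evaluated by hand for one parametrized value (a standalone check, not part of the test file):

sequence_index = 5
# digits of 5 in base 3, least-significant first: 5 = 2*1 + 1*3 -> [2, 1, 0, 0]
digits = [sequence_index // (3**k) % 3 for k in range(4)]
sequence = "".join({0: "x", 1: "y", 2: "z"}[d] for d in digits)
print(sequence)  # "zyxx"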
@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("dtype", [object, None])
def test_null_is_null_for_dtype(
    sort, dtype, nulls_fixture, nulls_fixture2, test_series
):
    # GH#48506 - groups should always result in using the null for the dtype
    df = pd.DataFrame({"a": [1, 2]})
    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
    obj = df["a"] if test_series else df
    gb = obj.groupby(groups, dropna=False, sort=sort)
    result = gb.sum()
    index = pd.Index([na_value_for_dtype(groups.dtype)])
    expected = pd.DataFrame({"a": [3]}, index=index)
    if test_series:
        tm.assert_series_equal(result, expected["a"])
    else:
        tm.assert_frame_equal(result, expected)
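The expected index above comes from `na_value_for_dtype`, a pandas-internal helper (its import path can move between versions) that returns the canonical missing-value marker for a given dtype. A quick sketch of what it yields:

import numpy as np
from pandas.core.dtypes.missing import na_value_for_dtype

print(na_value_for_dtype(np.dtype("float64")))  # nan
print(na_value_for_dtype(np.dtype("M8[ns]")))   # NaT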
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind):
    # Ensure there is at least one null value by appending to the end
    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )

    # Strategy: Compare to dropna=True by filling null values with a new code
    df_filled = df.copy()
    df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)

    if index_kind == "range":
        keys = ["x"]
    elif index_kind == "single":
        keys = ["x"]
        df = df.set_index("x")
        df_filled = df_filled.set_index("x")
    else:
        keys = ["x", "x2"]
        df["x2"] = df["x"]
        df = df.set_index(["x", "x2"])
        df_filled["x2"] = df_filled["x"]
        df_filled = df_filled.set_index(["x", "x2"])
    args = get_groupby_method_args(reduction_func, df)
    args_filled = get_groupby_method_args(reduction_func, df_filled)
    if reduction_func == "corrwith" and index_kind == "range":
        # Don't include the grouping columns so we can call reset_index
        args = (args[0].drop(columns=keys),)
        args_filled = (args_filled[0].drop(columns=keys),)

    gb_keepna = df.groupby(
        keys, dropna=False, observed=observed, sort=sort, as_index=as_index
    )

    if not observed and reduction_func in ["idxmin", "idxmax"]:
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            getattr(gb_keepna, reduction_func)(*args)
        return

    gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
    expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
    expected["x"] = expected["x"].cat.remove_categories([4])
    if index_kind == "multi":
        expected["x2"] = expected["x2"].cat.remove_categories([4])
    if as_index:
        if index_kind == "multi":
            expected = expected.set_index(["x", "x2"])
        else:
            expected = expected.set_index("x")
    elif index_kind != "range" and reduction_func != "size":
        # size, unlike other methods, has the desired behavior in GH#49519
        expected = expected.drop(columns="x")
        if index_kind == "multi":
            expected = expected.drop(columns="x2")
    if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
        # expected was computed with a RangeIndex; need to translate to index values
        values = expected["y"].values.tolist()
        if index_kind == "single":
            values = [np.nan if e == 4 else e for e in values]
            expected["y"] = pd.Categorical(values, categories=[1, 2, 3])
        else:
            values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
            expected["y"] = values
    if reduction_func == "size":
        # size, unlike other methods, has the desired behavior in GH#49519
        expected = expected.rename(columns={0: "size"})
        if as_index:
            expected = expected["size"].rename(None)

    if as_index or index_kind == "range" or reduction_func == "size":
        warn = None
    else:
        warn = FutureWarning
    msg = "A grouping .* was excluded from the result"
    with tm.assert_produces_warning(warn, match=msg):
        result = getattr(gb_keepna, reduction_func)(*args)

    # size will return a Series, others are DataFrame
    tm.assert_equal(result, expected)
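The strategy comment in `test_categorical_reducers` deserves a concrete illustration: mapping the null group onto a brand-new category turns a dropna=False groupby into an ordinary one that is easy to compute. A minimal sketch of the idea (standalone, simplified to a single reducer):

import pandas as pd

values = [1, 2, None, 1]
df = pd.DataFrame({"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(4)})

# Encode the nulls as a new category 4 ...
filled = df.copy()
filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)

# ... so that a plain groupby (which would otherwise drop NA keys) still
# carries the null group, now labeled 4.
result = filled.groupby("x", observed=True)["y"].sum()
print(result)  # category 4 holds the aggregate of the rows where x was null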
def test_categorical_transformers(
    request, transformation_func, observed, sort, as_index
):
    # GH#36327
    if transformation_func == "fillna":
        msg = "GH#49651 fillna may incorrectly reorder results when dropna=False"
        request.applymarker(pytest.mark.xfail(reason=msg, strict=False))

    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )
    args = get_groupby_method_args(transformation_func, df)

    # Compute result for null group
    null_group_values = df[df["x"].isnull()]["y"]
    if transformation_func == "cumcount":
        null_group_data = list(range(len(null_group_values)))
    elif transformation_func == "ngroup":
        if sort:
            if observed:
                na_group = df["x"].nunique(dropna=False) - 1
            else:
                # TODO: Should this be 3?
                na_group = df["x"].nunique(dropna=False) - 1
        else:
            na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
        null_group_data = len(null_group_values) * [na_group]
    else:
        null_group_data = getattr(null_group_values, transformation_func)(*args)
    null_group_result = pd.DataFrame({"y": null_group_data})

    gb_keepna = df.groupby(
        "x", dropna=False, observed=observed, sort=sort, as_index=as_index
    )
    gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)

    msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated"
    if transformation_func == "pct_change":
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = getattr(gb_keepna, "pct_change")(*args)
    else:
        result = getattr(gb_keepna, transformation_func)(*args)
    expected = getattr(gb_dropna, transformation_func)(*args)

    for iloc, value in zip(
        df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel()
    ):
        if expected.ndim == 1:
            expected.iloc[iloc] = value
        else:
            expected.iloc[iloc, 0] = value
    if transformation_func == "ngroup":
        expected[df["x"].notnull() & expected.ge(na_group)] += 1
    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
        expected = expected.astype("int64")

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("method", ["head", "tail"])
def test_categorical_head_tail(method, observed, sort, as_index):
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
    result = getattr(gb, method)()

    if method == "tail":
        values = values[::-1]
    # Take the top 5 values from each group
    mask = (
        ((values == 1) & ((values == 1).cumsum() <= 5))
        | ((values == 2) & ((values == 2).cumsum() <= 5))
        # flake8 doesn't like the vectorized check for None, thinks we should use `is`
        | ((values == None) & ((values == None).cumsum() <= 5))  # noqa: E711
    )
    if method == "tail":
        mask = mask[::-1]
    expected = df[mask]

    tm.assert_frame_equal(result, expected)


def test_categorical_agg():
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=False)
    result = gb.agg(lambda x: x.sum())
    expected = gb.sum()
    tm.assert_frame_equal(result, expected)


def test_categorical_transform():
    # GH#36327
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=False)
    result = gb.transform(lambda x: x.sum())
    expected = gb.transform("sum")
    tm.assert_frame_equal(result, expected)
@ -0,0 +1,135 @@
from datetime import datetime

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)


@pytest.mark.parametrize(
    "obj",
    [
        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
        tm.SubclassedSeries(np.arange(0, 10), name="A"),
    ],
)
def test_groupby_preserves_subclass(obj, groupby_func):
    # GH28330 -- preserve subclass through groupby operations

    if isinstance(obj, Series) and groupby_func in {"corrwith"}:
        pytest.skip(f"Not applicable for Series and {groupby_func}")

    grouped = obj.groupby(np.arange(0, 10))

    # Groups should preserve subclass type
    assert isinstance(grouped.get_group(0), type(obj))

    args = get_groupby_method_args(groupby_func, obj)

    warn = FutureWarning if groupby_func == "fillna" else None
    msg = f"{type(grouped).__name__}.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False):
        result1 = getattr(grouped, groupby_func)(*args)
    with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False):
        result2 = grouped.agg(groupby_func, *args)

    # Reduction or transformation kernels should preserve type
    slices = {"ngroup", "cumcount", "size"}
    if isinstance(obj, DataFrame) and groupby_func in slices:
        assert isinstance(result1, tm.SubclassedSeries)
    else:
        assert isinstance(result1, type(obj))

    # Confirm .agg() groupby operations return same results
    if isinstance(result1, DataFrame):
        tm.assert_frame_equal(result1, result2)
    else:
        tm.assert_series_equal(result1, result2)


def test_groupby_preserves_metadata():
    # GH-37343
    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
    assert "testattr" in custom_df._metadata
    custom_df.testattr = "hello"
    for _, group_df in custom_df.groupby("c"):
        assert group_df.testattr == "hello"

    # GH-45314
    def func(group):
        assert isinstance(group, tm.SubclassedDataFrame)
        assert hasattr(group, "testattr")
        assert group.testattr == "hello"
        return group.testattr

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(
        FutureWarning,
        match=msg,
        raise_on_extra_warnings=False,
        check_stacklevel=False,
    ):
        result = custom_df.groupby("c").apply(func)
    expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))
    tm.assert_series_equal(result, expected)

    result = custom_df.groupby("c").apply(func, include_groups=False)
    tm.assert_series_equal(result, expected)

    # https://github.com/pandas-dev/pandas/pull/56761
    result = custom_df.groupby("c")[["a", "b"]].apply(func)
    tm.assert_series_equal(result, expected)

    def func2(group):
        assert isinstance(group, tm.SubclassedSeries)
        assert hasattr(group, "testattr")
        return group.testattr

    custom_series = tm.SubclassedSeries([1, 2, 3])
    custom_series.testattr = "hello"
    result = custom_series.groupby(custom_df["c"]).apply(func2)
    tm.assert_series_equal(result, expected)
    result = custom_series.groupby(custom_df["c"]).agg(func2)
    tm.assert_series_equal(result, expected)
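`tm.SubclassedDataFrame` is pandas' in-repo example of a subclass that registers custom attributes in `_metadata`; the test above leans on groupby propagating them. A rough standalone equivalent, using the standard subclassing pattern from the pandas docs (the `TaggedFrame` name is hypothetical):

import pandas as pd


class TaggedFrame(pd.DataFrame):
    # declare the attribute so pandas carries it through operations
    _metadata = ["testattr"]

    @property
    def _constructor(self):
        return TaggedFrame


df = TaggedFrame({"a": [1, 2, 3], "c": [7, 7, 9]})
df.testattr = "hello"
for _, group in df.groupby("c"):
    assert group.testattr == "hello"  # metadata survives the group split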
@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
    # GH28330 -- preserve subclass through groupby.resample()

    df = obj(
        {
            "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ],
        }
    )
    df = df.set_index("Date")

    # Confirm groupby.resample() preserves dataframe type
    msg = "DataFrameGroupBy.resample operated on the grouping columns"
    with tm.assert_produces_warning(
        FutureWarning,
        match=msg,
        raise_on_extra_warnings=False,
        check_stacklevel=False,
    ):
        result = df.groupby("Buyer").resample("5D").sum()
    assert isinstance(result, obj)
1238  lib/python3.11/site-packages/pandas/tests/groupby/test_grouping.py  Normal file
File diff suppressed because it is too large
@ -0,0 +1,85 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.fixture(params=[["inner"], ["inner", "outer"]])
def frame(request):
    levels = request.param
    df = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    if levels:
        df = df.set_index(levels)

    return df


@pytest.fixture()
def series():
    df = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    s = df.set_index(["outer", "inner", "B"])["A"]

    return s


@pytest.mark.parametrize(
    "key_strs,groupers",
    [
        ("inner", pd.Grouper(level="inner")),  # Index name
        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
    ],
)
def test_grouper_index_level_as_string(frame, key_strs, groupers):
    if "B" not in key_strs or "outer" in frame.columns:
        result = frame.groupby(key_strs).mean(numeric_only=True)
        expected = frame.groupby(groupers).mean(numeric_only=True)
    else:
        result = frame.groupby(key_strs).mean()
        expected = frame.groupby(groupers).mean()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "levels",
    [
        "inner",
        "outer",
        "B",
        ["inner"],
        ["outer"],
        ["B"],
        ["inner", "outer"],
        ["outer", "inner"],
        ["inner", "outer", "B"],
        ["B", "outer", "inner"],
    ],
)
def test_grouper_index_level_as_string_series(series, levels):
    # Compute expected result
    if isinstance(levels, list):
        groupers = [pd.Grouper(level=lv) for lv in levels]
    else:
        groupers = pd.Grouper(level=levels)

    expected = series.groupby(groupers).mean()

    # Compute and check result
    result = series.groupby(levels).mean()
    tm.assert_series_equal(result, expected)
@ -0,0 +1,333 @@
# Test GroupBy._positional_selector positional grouped indexing GH#42864

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [2, [5]],
        [5, []],
        [-1, [3, 4, 7]],
        [-2, [1, 6]],
        [-6, []],
    ],
)
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test single integer
    result = slice_test_grouped._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)


def test_slice(slice_test_df, slice_test_grouped):
    # Test single slice
    result = slice_test_grouped._positional_selector[0:3:2]
    expected = slice_test_df.iloc[[0, 1, 4, 5]]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [[0, 2], [0, 1, 4, 5]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [range(0, 3, 2), [0, 1, 4, 5]],
        [{0, 2}, [0, 1, 4, 5]],
    ],
    ids=[
        "list",
        "negative",
        "range",
        "set",
    ],
)
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test lists of integers and integer valued iterables
    result = slice_test_grouped._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)


def test_ints(slice_test_df, slice_test_grouped):
    # Test tuple of ints
    result = slice_test_grouped._positional_selector[0, 2, -1]
    expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]

    tm.assert_frame_equal(result, expected)


def test_slices(slice_test_df, slice_test_grouped):
    # Test tuple of slices
    result = slice_test_grouped._positional_selector[:2, -2:]
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)


def test_mix(slice_test_df, slice_test_grouped):
    # Test mixed tuple of ints and slices
    result = slice_test_grouped._positional_selector[0, 1, -2:]
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_as_index(slice_test_df, arg, expected_rows):
    # Test the default as_index behaviour
    result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)


def test_doc_examples():
    # Test the examples in the documentation
    df = pd.DataFrame(
        [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
    )

    grouped = df.groupby("A", as_index=False)

    result = grouped._positional_selector[1:2]
    expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4])

    tm.assert_frame_equal(result, expected)

    result = grouped._positional_selector[1, -1]
    expected = pd.DataFrame(
        [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4]
    )

    tm.assert_frame_equal(result, expected)
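`_positional_selector` is private API; the public counterpart exercised later in this file is `nth` with a slice (plus `head`/`tail` for the common cases). Roughly, for the same toy frame as `test_doc_examples` (a sketch, not part of the test file):

import pandas as pd

df = pd.DataFrame(
    [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
)
grouped = df.groupby("A", as_index=False)

# Second row of each group via the public API ...
public = grouped.nth(slice(1, 2))
# ... should match the private positional selector used in these tests
private = grouped._positional_selector[1:2]
print(public.equals(private))  # expected: True for this frame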
@pytest.fixture()
def multiindex_data():
    rng = np.random.default_rng(2)
    ndates = 100
    nitems = 20
    dates = pd.date_range("20130101", periods=ndates, freq="D")
    items = [f"item {i}" for i in range(nitems)]

    data = {}
    for date in dates:
        nitems_for_date = nitems - rng.integers(0, 12)
        levels = [
            (item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100)
            for item in items[:nitems_for_date]
        ]
        levels.sort(key=lambda x: x[1])
        data[date] = levels

    return data


def _make_df_from_data(data):
    rows = {}
    for date in data:
        for level in data[date]:
            rows[(date, level[0])] = {"A": level[1], "B": level[2]}

    df = pd.DataFrame.from_dict(rows, orient="index")
    df.index.names = ("Date", "Item")
    return df


def test_multiindex(multiindex_data):
    # Test the multiindex mentioned as the use-case in the documentation
    df = _make_df_from_data(multiindex_data)
    result = df.groupby("Date", as_index=False).nth(slice(3, -3))

    sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data}
    expected = _make_df_from_data(sliced)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("simulated", [True, False])
def test_against_head_and_tail(arg, method, simulated):
    # Test gives the same results as grouped head and tail
    n_groups = 100
    n_rows_per_group = 30

    data = {
        "group": [
            f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)
        ],
        "value": [
            f"group {g} row {j}"
            for j in range(n_rows_per_group)
            for g in range(n_groups)
        ],
    }
    df = pd.DataFrame(data)
    grouped = df.groupby("group", as_index=False)
    size = arg if arg >= 0 else n_rows_per_group + arg

    if method == "head":
        result = grouped._positional_selector[:arg]

        if simulated:
            indices = [
                j * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if j * n_groups + i < n_groups * n_rows_per_group
            ]
            expected = df.iloc[indices]

        else:
            expected = grouped.head(arg)

    else:
        result = grouped._positional_selector[-arg:]

        if simulated:
            indices = [
                (n_rows_per_group + j - size) * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if (n_rows_per_group + j - size) * n_groups + i >= 0
            ]
            expected = df.iloc[indices]

        else:
            expected = grouped.tail(arg)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("step", [None, 1, 5])
def test_against_df_iloc(start, stop, step):
    # Test that a single group gives the same results as DataFrame.iloc
    n_rows = 30

    data = {
        "group": ["group 0"] * n_rows,
        "value": list(range(n_rows)),
    }
    df = pd.DataFrame(data)
    grouped = df.groupby("group", as_index=False)

    result = grouped._positional_selector[start:stop:step]
    expected = df.iloc[start:stop:step]

    tm.assert_frame_equal(result, expected)


def test_series():
    # Test grouped Series
    ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
    grouped = ser.groupby(level=0)
    result = grouped._positional_selector[1:2]
    expected = pd.Series([2, 5], index=["a", "b"])

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
def test_step(step):
    # Test slice with various step values
    data = [["x", f"x{i}"] for i in range(5)]
    data += [["y", f"y{i}"] for i in range(4)]
    data += [["z", f"z{i}"] for i in range(3)]
    df = pd.DataFrame(data, columns=["A", "B"])

    grouped = df.groupby("A", as_index=False)

    result = grouped._positional_selector[::step]

    data = [["x", f"x{i}"] for i in range(0, 5, step)]
    data += [["y", f"y{i}"] for i in range(0, 4, step)]
    data += [["z", f"z{i}"] for i in range(0, 3, step)]

    index = [0 + i for i in range(0, 5, step)]
    index += [5 + i for i in range(0, 4, step)]
    index += [9 + i for i in range(0, 3, step)]

    expected = pd.DataFrame(data, columns=["A", "B"], index=index)

    tm.assert_frame_equal(result, expected)


@pytest.fixture()
def column_group_df():
    return pd.DataFrame(
        [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]],
        columns=["A", "B", "C", "D", "E", "F", "G"],
    )


def test_column_axis(column_group_df):
    msg = "DataFrame.groupby with axis=1"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        g = column_group_df.groupby(column_group_df.iloc[1], axis=1)
    result = g._positional_selector[1:-1]
    expected = column_group_df.iloc[:, [1, 3]]

    tm.assert_frame_equal(result, expected)


def test_columns_on_iter():
    # GitHub issue #44821
    df = pd.DataFrame({k: range(10) for k in "ABC"})

    # Group-by and select columns
    cols = ["A", "B"]
    for _, dg in df.groupby(df.A < 4)[cols]:
        tm.assert_index_equal(dg.columns, pd.Index(cols))
        assert "C" not in dg.columns


@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
def test_groupby_duplicated_columns(func):
    # GH#44924
    df = pd.DataFrame(
        {
            "A": [1, 2],
            "B": [3, 3],
            "C": ["G", "G"],
        }
    )
    result = df.groupby("C")[func(["A", "B", "A"])].mean()
    expected = pd.DataFrame(
        [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_get_nonexisting_groups():
    # GH#32492
    df = pd.DataFrame(
        data={
            "A": ["a1", "a2", None],
            "B": ["b1", "b2", "b1"],
            "val": [1, 2, 3],
        }
    )
    grps = df.groupby(by=["A", "B"])

    msg = "('a2', 'b1')"
    with pytest.raises(KeyError, match=msg):
        grps.get_group(("a2", "b1"))
@ -0,0 +1,331 @@
import numpy as np
import pytest

from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
    group_cumprod,
    group_cumsum,
    group_mean,
    group_sum,
    group_var,
)

from pandas.core.dtypes.common import ensure_platform_int

from pandas import isna
import pandas._testing as tm


class GroupVarTestMixin:
    def test_group_var_generic_1d(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 1))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((15, 1)).astype(self.dtype)
        labels = np.tile(np.arange(5), (3,)).astype("intp")

        expected_out = (
            np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
        )[:, np.newaxis]
        expected_counts = counts + 3

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_1d_flat_labels(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((1, 1))).astype(self.dtype)
        counts = np.zeros(1, dtype="int64")
        values = 10 * prng.random((5, 1)).astype(self.dtype)
        labels = np.zeros(5, dtype="intp")

        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = counts + 5

        self.algo(out, counts, values, labels)

        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_all_finite(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((10, 2)).astype(self.dtype)
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_some_nan(self):
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((10, 2)).astype(self.dtype)
        values[:, 1] = np.nan
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.vstack(
            [
                values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
                np.nan * np.ones(5),
            ]
        ).T.astype(self.dtype)
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_constant(self):
        # Regression test from GH 10448.

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
        labels = np.zeros(3, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == 3
        assert out[0, 0] >= 0
        tm.assert_almost_equal(out[0, 0], 0.0)


class TestGroupVarFloat64(GroupVarTestMixin):
    __test__ = True

    algo = staticmethod(group_var)
    dtype = np.float64
    rtol = 1e-5

    def test_group_var_large_inputs(self):
        prng = np.random.default_rng(2)

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = (prng.random(10**6) + 10**12).astype(self.dtype)
        values.shape = (10**6, 1)
        labels = np.zeros(10**6, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == 10**6
        tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)


class TestGroupVarFloat32(GroupVarTestMixin):
    __test__ = True

    algo = staticmethod(group_var)
    dtype = np.float32
    rtol = 1e-2


@pytest.mark.parametrize("dtype", ["float32", "float64"])
def test_group_ohlc(dtype):
    obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)

    bins = np.array([6, 12, 20])
    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)
    labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))

    func = libgroupby.group_ohlc
    func(out, counts, obj[:, None], labels)

    def _ohlc(group):
        if isna(group).all():
            return np.repeat(np.nan, 4)
        return [group[0], group.max(), group.min(), group[-1]]

    expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])

    tm.assert_almost_equal(out, expected)
    tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))

    obj[:6] = np.nan
    func(out, counts, obj[:, None], labels)
    expected[0] = np.nan
    tm.assert_almost_equal(out, expected)
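The `labels` construction in `test_group_ohlc` is compact; expanding it for the bins used above makes the group layout obvious (a standalone check):

import numpy as np

bins = np.array([6, 12, 20])
sizes = np.diff(np.r_[0, bins])         # [6, 6, 8] rows per bin
labels = np.repeat(np.arange(3), sizes)
print(labels)  # [0 0 0 0 0 0 1 1 1 1 1 1 2 2 2 2 2 2 2 2]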
def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
|
||||
"""
|
||||
Check a group transform that executes a cumulative function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pd_op : callable
|
||||
The pandas cumulative function.
|
||||
np_op : callable
|
||||
The analogous one in NumPy.
|
||||
dtype : type
|
||||
The specified dtype of the data.
|
||||
"""
|
||||
is_datetimelike = False
|
||||
|
||||
data = np.array([[1], [2], [3], [4]], dtype=dtype)
|
||||
answer = np.zeros_like(data)
|
||||
|
||||
labels = np.array([0, 0, 0, 0], dtype=np.intp)
|
||||
ngroups = 1
|
||||
pd_op(answer, data, labels, ngroups, is_datetimelike)
|
||||
|
||||
tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("np_dtype", ["int64", "uint64", "float32", "float64"])
|
||||
def test_cython_group_transform_cumsum(np_dtype):
|
||||
# see gh-4095
|
||||
dtype = np.dtype(np_dtype).type
|
||||
pd_op, np_op = group_cumsum, np.cumsum
|
||||
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)
|
||||
|
||||
|
||||
def test_cython_group_transform_cumprod():
|
||||
# see gh-4095
|
||||
dtype = np.float64
|
||||
pd_op, np_op = group_cumprod, np.cumprod
|
||||
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)
|
||||
|
||||
|
||||
def test_cython_group_transform_algos():
|
||||
# see gh-4095
|
||||
is_datetimelike = False
|
||||
|
||||
# with nans
|
||||
labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
|
||||
ngroups = 1
|
||||
|
||||
data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
|
||||
actual = np.zeros_like(data)
|
||||
actual.fill(np.nan)
|
||||
group_cumprod(actual, data, labels, ngroups, is_datetimelike)
|
||||
expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
|
||||
tm.assert_numpy_array_equal(actual[:, 0], expected)
|
||||
|
||||
actual = np.zeros_like(data)
|
||||
actual.fill(np.nan)
|
||||
group_cumsum(actual, data, labels, ngroups, is_datetimelike)
|
||||
expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
|
||||
tm.assert_numpy_array_equal(actual[:, 0], expected)
|
||||
|
||||
# timedelta
|
||||
is_datetimelike = True
|
||||
data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
|
||||
actual = np.zeros_like(data, dtype="int64")
|
||||
group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
|
||||
expected = np.array(
|
||||
[
|
||||
np.timedelta64(1, "ns"),
|
||||
np.timedelta64(2, "ns"),
|
||||
np.timedelta64(3, "ns"),
|
||||
np.timedelta64(4, "ns"),
|
||||
np.timedelta64(5, "ns"),
|
||||
]
|
||||
)
|
||||
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)


def test_cython_group_mean_datetimelike():
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    data = (
        np.array(
            [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=True)

    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
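    # Note (illustrative, not part of the original test): datetime-like input
    # reaches the kernel as int64 nanoseconds cast to float64; with
    # is_datetimelike=True the NaT sentinel is treated as missing, so the mean
    # of 2ns and 4ns is 3 and the NaT entry is skipped rather than averaged in.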


def test_cython_group_mean_wrong_min_count():
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    data = np.zeros(1, dtype="float64")[:, None]
    labels = np.zeros(1, dtype=np.intp)

    with pytest.raises(AssertionError, match="min_count"):
        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)


def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    data = (
        np.array(
            [np.timedelta64("NaT"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    tm.assert_numpy_array_equal(
        actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
    )


def test_cython_group_mean_Inf_at_begining_and_end():
    # GH 50367
    actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
    counts = np.array([0, 0], dtype="int64")
    data = np.array(
        [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]],
        dtype="float64",
    )
    labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64")

    tm.assert_numpy_array_equal(
        actual,
        expected,
    )


@pytest.mark.parametrize(
    "values, out",
    [
        ([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]),
        ([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]),
        ([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]),
        ([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
    ],
)
def test_cython_group_sum_Inf_at_begining_and_end(values, out):
    # GH #53606
    actual = np.array([[np.nan], [np.nan]], dtype="float64")
    counts = np.array([0, 0], dtype="int64")
    data = np.array(values, dtype="float64")
    labels = np.array([0, 1, 1], dtype=np.intp)

    group_sum(actual, counts, data, labels, None, is_datetimelike=False)

    expected = np.array(out, dtype="float64")

    tm.assert_numpy_array_equal(
        actual,
        expected,
    )
@ -0,0 +1,163 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    date_range,
)
import pandas._testing as tm


@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
    # GH: 29764 groupby loses index sometimes
    df = DataFrame(
        [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
        columns=Index(["type", "a", "b"], name="idx"),
    )
    df_grouped = df.groupby(["type"])[["a", "b"]]
    result = getattr(df_grouped, func)().columns
    expected = Index(["a", "b"], name="idx")
    tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
    # GH: 25610 ValueError with duplicate column names
    df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
    df2 = DataFrame({"field1": [1, np.nan, 4]})
    df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
    expected = DataFrame(
        [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
    )
    result = getattr(df_grouped, func)()
    tm.assert_frame_equal(result, expected)


def test_ffill_missing_arguments():
    # GH 14955
    df = DataFrame({"a": [1, 2], "b": [1, 1]})
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        with pytest.raises(ValueError, match="Must specify a fill"):
            df.groupby("b").fillna()


@pytest.mark.parametrize(
    "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
)
def test_fillna_with_string_dtype(method, expected):
    # GH 40250
    df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]})
    grp = df.groupby("b")
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = grp.fillna(method=method)
    expected = DataFrame({"a": pd.array(expected, dtype="string")})
    tm.assert_frame_equal(result, expected)


def test_fill_consistency():
    # GH9221
    # pass-through keyword arguments to the generated wrapper
    # are only set if the passed kw is None
    df = DataFrame(
        index=pd.MultiIndex.from_product(
            [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
        ),
        columns=Index(["1", "2"], name="id"),
    )
    df["1"] = [
        np.nan,
        1,
        np.nan,
        np.nan,
        11,
        np.nan,
        np.nan,
        2,
        np.nan,
        np.nan,
        22,
        np.nan,
    ]
    df["2"] = [
        np.nan,
        3,
        np.nan,
        np.nan,
        33,
        np.nan,
        np.nan,
        4,
        np.nan,
        np.nan,
        44,
        np.nan,
    ]

    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        expected = df.groupby(level=0, axis=0).fillna(method="ffill")

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
    # GH 34725

    df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])

    ridx = [-1, 0, -1, -1, 1, -1]
    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)

    group_b = np.nan if has_nan_group else "b"
    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)

    grouped = df.groupby(by="group_col", dropna=dropna)
    result = getattr(grouped, method)(limit=None)

    expected_rows = {
        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
    }

    ridx = expected_rows.get((method, dropna, has_nan_group))
    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
    # columns are a 'take' on df.columns, which are object dtype
    expected.columns = expected.columns.astype(object)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
    # GH#37821
    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
    result = getattr(df.groupby("a"), func)(min_count=min_count)
    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)


def test_indices_with_missing():
    # GH 9304
    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
    g = df.groupby(["a", "b"])
    result = g.indices
    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
    assert result == expected
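

# Note (illustrative, not part of the original test): GroupBy.indices maps each
# observed group key to the integer positions of its rows. With the default
# dropna=True, keys containing NaN (here the row with a=NaN) are omitted
# entirely, which is why only the (1.0, 2) and (1.0, 3) keys appear above.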
@ -0,0 +1,89 @@
import pytest

from pandas.compat import is_platform_arm

from pandas import (
    DataFrame,
    Series,
    option_context,
)
import pandas._testing as tm
from pandas.util.version import Version

pytestmark = [pytest.mark.single_cpu]

numba = pytest.importorskip("numba")
pytestmark.append(
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
class TestEngine:
    def test_cython_vs_numba_frame(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = df.groupby("a", sort=sort)
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_frame_equal(result, expected)

    def test_cython_vs_numba_getitem(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = df.groupby("a", sort=sort)["c"]
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_series_equal(result, expected)

    def test_cython_vs_numba_series(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        func, kwargs = numba_supported_reductions
        ser = Series(range(3), index=[1, 2, 1], name="foo")
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = ser.groupby(level=0, sort=sort)
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_series_equal(result, expected)

    def test_as_index_false_unsupported(self, numba_supported_reductions):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a", as_index=False)
        with pytest.raises(NotImplementedError, match="as_index=False"):
            getattr(gb, func)(engine="numba", **kwargs)

    def test_axis_1_unsupported(self, numba_supported_reductions):
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a", axis=1)
        with pytest.raises(NotImplementedError, match="axis=1"):
            getattr(gb, func)(engine="numba", **kwargs)

    def test_no_engine_doesnt_raise(self):
        # GH55520
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a")
        # Make sure functions without an engine argument don't raise
        # when the global use_numba option is set
        with option_context("compute.use_numba", True):
            res = gb.agg({"b": "first"})
        expected = gb.agg({"b": "first"})
        tm.assert_frame_equal(res, expected)
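        # Note (illustrative, not part of the original test): with the
        # "compute.use_numba" option set, groupby reductions that support the
        # numba engine use it by default; the comparison above checks that an
        # operation without numba support, like a dict-based agg, still takes
        # the normal path instead of raising.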
@ -0,0 +1,532 @@
import re

import numpy as np
import pytest

from pandas._libs import lib

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


class TestNumericOnly:
    # make sure that we are passing through kwargs to our agg functions

    @pytest.fixture
    def df(self):
        # GH3668
        # GH5724
        df = DataFrame(
            {
                "group": [1, 1, 2],
                "int": [1, 2, 3],
                "float": [4.0, 5.0, 6.0],
                "string": Series(["a", "b", "c"], dtype="str"),
                "object": Series(["a", "b", "c"], dtype=object),
                "category_string": Series(list("abc")).astype("category"),
                "category_int": [7, 8, 9],
                "datetime": date_range("20130101", periods=3),
                "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
                "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
            },
            columns=[
                "group",
                "int",
                "float",
                "string",
                "object",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ],
        )
        return df

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        # mean / median
        expected_columns_numeric = Index(["int", "float", "category_int"])

        gb = df.groupby("group")
        expected = DataFrame(
            {
                "category_int": [7.5, 9],
                "float": [4.5, 6.0],
                "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
                "int": [1.5, 3],
                "datetime": [
                    Timestamp("2013-01-01 12:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                "datetimetz": [
                    Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                    Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
                ],
            },
            index=Index([1, 2], name="group"),
            columns=[
                "int",
                "float",
                "category_int",
            ],
        )

        result = getattr(gb, method)(numeric_only=True)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        expected_columns = expected.columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        # TODO: min, max *should* handle
        # categorical (ordered) dtype

        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "object",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        expected_columns_numeric = Index(["int", "float", "category_int"])
        expected_columns = Index(
            ["int", "float", "string", "category_int", "timedelta"]
        )
        if method == "cumsum":
            # cumsum loses string
            expected_columns = Index(["int", "float", "category_int", "timedelta"])

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        expected_columns = Index(["int", "float", "category_int"])
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        # like min, max, but don't include strings
        expected_columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )

        # GH#15561: numeric_only=False set by default like min/max
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        gb = df.groupby("group")

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = (
            (NotImplementedError, TypeError) if method.startswith("cum") else TypeError
        )

        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
            # The methods default to numeric_only=False and raise TypeError
            msg = "|".join(
                [
                    "Categorical is not ordered",
                    f"Cannot perform {method} with non-ordered Categorical",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    # cumsum/cummin/cummax/cumprod
                    "function is not implemented for this dtype",
                    f"dtype 'str' does not support operation '{method}'",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        elif method in ("sum", "mean", "median", "prod"):
            msg = "|".join(
                [
                    "category type does not support sum operations",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                    f"dtype 'str' does not support operation '{method}'",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        else:
            result = getattr(gb, method)()
            tm.assert_index_equal(result.columns, expected_columns_numeric)

        if method not in ("first", "last"):
            msg = "|".join(
                [
                    "Categorical is not ordered",
                    "category type does not support",
                    "function is not implemented for this dtype",
                    f"Cannot perform {method} with non-ordered Categorical",
                    re.escape(f"agg function failed [how->{method},dtype->object]"),
                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                    f"dtype 'str' does not support operation '{method}'",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)(numeric_only=False)
        else:
            result = getattr(gb, method)(numeric_only=False)
            tm.assert_index_equal(result.columns, expected_columns)
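
    # For orientation (a minimal sketch, not part of the suite): the behavior
    # _check exercises is the user-facing numeric_only switch, e.g.
    #
    #     df.groupby("group").sum(numeric_only=True)
    #
    # keeps the numeric columns ("int", "float", "category_int") and drops the
    # rest, while numeric_only=False either aggregates or raises per dtype.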


@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string):
    if groupby_func in ("idxmax", "idxmin"):
        pytest.skip("idxmax and idxmin tested in test_idxmin_idxmax_axis1")
    if groupby_func in ("corrwith", "skew"):
        msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
        request.applymarker(pytest.mark.xfail(reason=msg))

    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"]
    )
    df["E"] = "x"
    groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
    gb = df.groupby(groups)
    method = getattr(gb, groupby_func)
    args = get_groupby_method_args(groupby_func, df)
    kwargs = {"axis": 1}
    if numeric_only is not None:
        # when numeric_only is None we don't pass any argument
        kwargs["numeric_only"] = numeric_only

    # Functions without numeric_only and axis args
    no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
    # Functions with axis args
    has_axis = (
        "cumprod",
        "cumsum",
        "diff",
        "pct_change",
        "rank",
        "shift",
        "cummax",
        "cummin",
        "idxmin",
        "idxmax",
        "fillna",
    )
    warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated"
    if numeric_only is not None and groupby_func in no_args:
        msg = "got an unexpected keyword argument 'numeric_only'"
        if groupby_func in ["cumprod", "cumsum"]:
            with pytest.raises(TypeError, match=msg):
                with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                    method(*args, **kwargs)
        else:
            with pytest.raises(TypeError, match=msg):
                method(*args, **kwargs)
    elif groupby_func not in has_axis:
        msg = "got an unexpected keyword argument 'axis'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
    # fillna and shift are successful even on object dtypes
    elif (numeric_only is None or not numeric_only) and groupby_func not in (
        "fillna",
        "shift",
    ):
        msgs = (
            # cummax, cummin, rank
            "not supported between instances of",
            # cumprod
            "can't multiply sequence by non-int of type 'float'",
            # cumsum, diff, pct_change
            "unsupported operand type",
            "has no kernel",
            "operation 'sub' not supported for dtype 'str' with dtype 'float64'",
        )
        if using_infer_string:
            pa = pytest.importorskip("pyarrow")

            errs = (TypeError, pa.lib.ArrowNotImplementedError)
        else:
            errs = TypeError
        with pytest.raises(errs, match=f"({'|'.join(msgs)})"):
            with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                method(*args, **kwargs)
    else:
        with tm.assert_produces_warning(FutureWarning, match=warn_msg):
            result = method(*args, **kwargs)

        df_expected = df.drop(columns="E").T if numeric_only else df.T
        expected = getattr(df_expected, groupby_func)(*args).T
        if groupby_func == "shift" and not numeric_only:
            # shift with axis=1 leaves the leftmost column as numeric
            # but transposing for expected gives us object dtype
            expected = expected.astype(float)

        tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("fillna", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    # GH#46072
    # has_arg: Whether the op has a numeric_only arg
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})

    args = get_groupby_method_args(kernel, df)
    kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}

    gb = df.groupby(keys)
    method = getattr(gb, kernel)
    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        result = method(*args, **kwargs)
        assert "b" not in result.columns
    elif (
        # kernels that work on any dtype and have numeric_only arg
        kernel in ("first", "last")
        or (
            # kernels that work on any dtype and don't have numeric_only arg
            kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
            and numeric_only is lib.no_default
        )
    ):
        warn = FutureWarning if kernel == "fillna" else None
        msg = "DataFrameGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=msg):
            result = method(*args, **kwargs)
        assert "b" in result.columns
    elif has_arg:
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if kernel.startswith("cum") else TypeError

        msg = "|".join(
            [
                "not allowed for this dtype",
                "cannot be performed against 'object' dtypes",
                # On PY39 message is "a number"; on PY310 and after is "a real number"
                "must be a string or a.* number",
                "unsupported operand type",
                "function is not implemented for this dtype",
                re.escape(f"agg function failed [how->{kernel},dtype->object]"),
            ]
        )
        if kernel == "quantile":
            msg = "dtype 'object' does not support operation 'quantile'"
        elif kernel == "idxmin":
            msg = "'<' not supported between instances of 'type' and 'type'"
        elif kernel == "idxmax":
            msg = "'>' not supported between instances of 'type' and 'type'"
        with pytest.raises(exception, match=msg):
            method(*args, **kwargs)
    elif not has_arg and numeric_only is not lib.no_default:
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, **kwargs)
    else:
        assert kernel in ("diff", "pct_change")
        assert numeric_only is lib.no_default
        # Doesn't have numeric_only argument and fails on nuisance columns
        with pytest.raises(TypeError, match=r"unsupported operand type"):
            method(*args, **kwargs)


@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    # GH#46560
    grouper = [0, 0, 1]

    ser = Series([1, 0, 0], dtype=dtype)
    gb = ser.groupby(grouper)

    if groupby_func == "corrwith":
        # corrwith is not implemented on SeriesGroupBy
        assert not hasattr(gb, groupby_func)
        return

    method = getattr(gb, groupby_func)

    expected_ser = Series([1, 0, 0])
    expected_gb = expected_ser.groupby(grouper)
    expected_method = getattr(expected_gb, groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
        "skew",
    )

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if groupby_func in fails_on_numeric_object and dtype is object:
        if groupby_func == "quantile":
            msg = "dtype 'object' does not support operation 'quantile'"
        else:
            msg = "is not supported for object dtype"
        warn = FutureWarning if groupby_func == "fillna" else None
        warn_msg = "DataFrameGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=warn_msg):
            with pytest.raises(TypeError, match=msg):
                method(*args)
    elif dtype is object:
        warn = FutureWarning if groupby_func == "fillna" else None
        warn_msg = "SeriesGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=warn_msg):
            result = method(*args)
        with tm.assert_produces_warning(warn, match=warn_msg):
            expected = expected_method(*args)
        if groupby_func in obj_result:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is object:
        msg = "|".join(
            [
                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
                "Series.skew does not allow numeric_only=True with non-numeric",
                "cum(sum|prod|min|max) is not supported for object dtype",
                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype == bool and groupby_func == "quantile":
        msg = "Allowing bool dtype in SeriesGroupBy.quantile"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#51424
            result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
    else:
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
@ -0,0 +1,80 @@
import numpy as np

import pandas as pd
from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm


def test_pipe():
    # Test the pipe method of DataFrameGroupBy.
    # Issue #17871

    random_state = np.random.default_rng(2)

    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": random_state.standard_normal(8),
            "C": random_state.standard_normal(8),
        }
    )

    def f(dfgb):
        return dfgb.B.max() - dfgb.C.min().min()

    def square(srs):
        return srs**2

    # Note that the transformations are
    # GroupBy -> Series
    # Series -> Series
    # This then chains the GroupBy.pipe and the
    # NDFrame.pipe methods
    result = df.groupby("A").pipe(f).pipe(square)

    index = Index(["bar", "foo"], name="A")
    expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)

    tm.assert_series_equal(expected, result)


def test_pipe_args():
    # Test passing args to the pipe method of DataFrameGroupBy.
    # Issue #17871

    df = DataFrame(
        {
            "group": ["A", "A", "B", "B", "C"],
            "x": [1.0, 2.0, 3.0, 2.0, 5.0],
            "y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
        }
    )

    def f(dfgb, arg1):
        filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
        return filtered.groupby("group")

    def g(dfgb, arg2):
        return dfgb.sum() / dfgb.sum().sum() + arg2

    def h(df, arg3):
        return df.x + df.y - arg3

    result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)

    # Assert the results here
    index = Index(["A", "B"], name="group")
    expected = pd.Series([-79.5160891089, -78.4839108911], index=index)

    tm.assert_series_equal(result, expected)

    # test SeriesGroupby.pipe
    ser = pd.Series([1, 1, 2, 2, 3, 3])
    result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())

    expected = pd.Series([4, 8, 12], index=Index([1, 2, 3], dtype=np.int64))

    tm.assert_series_equal(result, expected)
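

# Note (illustrative, not part of the original tests): GroupBy.pipe(f, *args)
# is simply f(gb, *args); its value is readability when chaining, so
#
#     df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)
#
# is equivalent to h(g(f(df.groupby("group"), 0), 10), 100).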
757  lib/python3.11/site-packages/pandas/tests/groupby/test_raises.py  Normal file
@ -0,0 +1,757 @@
# Only tests that raise an error and have no better location should go here.
# Tests for specific groupby methods should go in their respective
# test file.

import datetime
import re

import numpy as np
import pytest

from pandas import (
    Categorical,
    DataFrame,
    Grouper,
    Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


@pytest.fixture(
    params=[
        "a",
        ["a"],
        ["a", "b"],
        Grouper(key="a"),
        lambda x: x % 2,
        [0, 0, 0, 1, 2, 2, 2, 3, 3],
        np.array([0, 0, 0, 1, 2, 2, 2, 3, 3]),
        dict(zip(range(9), [0, 0, 0, 1, 2, 2, 2, 3, 3])),
        Series([1, 1, 1, 1, 1, 2, 2, 2, 2]),
        [Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])],
    ]
)
def by(request):
    return request.param


@pytest.fixture(params=[True, False])
def groupby_series(request):
    return request.param


@pytest.fixture
def df_with_string_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": list("xyzwtyuio"),
        }
    )
    return df


@pytest.fixture
def df_with_datetime_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
        }
    )
    return df


@pytest.fixture
def df_with_timedelta_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": datetime.timedelta(days=1),
        }
    )
    return df


@pytest.fixture
def df_with_cat_col():
    df = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": Categorical(
                ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            ),
        }
    )
    return df


def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""):
    warn_klass = None if warn_msg == "" else FutureWarning
    with tm.assert_produces_warning(warn_klass, match=warn_msg):
        if klass is None:
            if how == "method":
                getattr(gb, groupby_func)(*args)
            elif how == "agg":
                gb.agg(groupby_func, *args)
            else:
                gb.transform(groupby_func, *args)
        else:
            with pytest.raises(klass, match=msg):
                if how == "method":
                    getattr(gb, groupby_func)(*args)
                elif how == "agg":
                    gb.agg(groupby_func, *args)
                else:
                    gb.transform(groupby_func, *args)
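

# _call_and_check centralizes the pattern used by the tests below: invoke a
# groupby op via its method, .agg, or .transform spelling, expecting success
# when klass is None and pytest.raises(klass, match=msg) otherwise, all under
# an optional FutureWarning assertion.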
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
|
||||
def test_groupby_raises_string(
|
||||
how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string
|
||||
):
|
||||
df = df_with_string_col
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
if groupby_func == "corrwith":
|
||||
assert not hasattr(gb, "corrwith")
|
||||
return
|
||||
|
||||
klass, msg = {
|
||||
"all": (None, ""),
|
||||
"any": (None, ""),
|
||||
"bfill": (None, ""),
|
||||
"corrwith": (TypeError, "Could not convert"),
|
||||
"count": (None, ""),
|
||||
"cumcount": (None, ""),
|
||||
"cummax": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cummax) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"cummin": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cummin) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"cumprod": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cumprod) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"cumsum": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(function|cumsum) is not (implemented|supported) for (this|object) dtype",
|
||||
),
|
||||
"diff": (TypeError, "unsupported operand type"),
|
||||
"ffill": (None, ""),
|
||||
"fillna": (None, ""),
|
||||
"first": (None, ""),
|
||||
"idxmax": (None, ""),
|
||||
"idxmin": (None, ""),
|
||||
"last": (None, ""),
|
||||
"max": (None, ""),
|
||||
"mean": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->mean,dtype->object]"),
|
||||
),
|
||||
"median": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->median,dtype->object]"),
|
||||
),
|
||||
"min": (None, ""),
|
||||
"ngroup": (None, ""),
|
||||
"nunique": (None, ""),
|
||||
"pct_change": (TypeError, "unsupported operand type"),
|
||||
"prod": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->prod,dtype->object]"),
|
||||
),
|
||||
"quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"),
|
||||
"rank": (None, ""),
|
||||
"sem": (ValueError, "could not convert string to float"),
|
||||
"shift": (None, ""),
|
||||
"size": (None, ""),
|
||||
"skew": (ValueError, "could not convert string to float"),
|
||||
"std": (ValueError, "could not convert string to float"),
|
||||
"sum": (None, ""),
|
||||
"var": (
|
||||
TypeError,
|
||||
re.escape("agg function failed [how->var,dtype->"),
|
||||
),
|
||||
}[groupby_func]
|
||||
|
||||
if using_infer_string:
|
||||
if groupby_func in [
|
||||
"prod",
|
||||
"mean",
|
||||
"median",
|
||||
"cumsum",
|
||||
"cumprod",
|
||||
"std",
|
||||
"sem",
|
||||
"var",
|
||||
"skew",
|
||||
"quantile",
|
||||
]:
|
||||
msg = f"dtype 'str' does not support operation '{groupby_func}'"
|
||||
if groupby_func in ["sem", "std", "skew"]:
|
||||
# The object-dtype raises ValueError when trying to convert to numeric.
|
||||
klass = TypeError
|
||||
elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
|
||||
# This doesn't go through EA._groupby_op so the message isn't controlled
|
||||
# there.
|
||||
msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'"
|
||||
elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow":
|
||||
# This doesn't go through EA._groupby_op so the message isn't controlled
|
||||
# there.
|
||||
msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'"
|
||||
|
||||
elif groupby_func in ["cummin", "cummax"]:
|
||||
msg = msg.replace("object", "str")
|
||||
elif groupby_func == "corrwith":
|
||||
msg = "Cannot perform reduction 'mean' with string dtype"
|
||||
|
||||
if groupby_func == "fillna":
|
||||
kind = "Series" if groupby_series else "DataFrame"
|
||||
warn_msg = f"{kind}GroupBy.fillna is deprecated"
|
||||
else:
|
||||
warn_msg = ""
|
||||
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
def test_groupby_raises_string_udf(how, by, groupby_series, df_with_string_col):
|
||||
df = df_with_string_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
def func(x):
|
||||
raise TypeError("Test error message")
|
||||
|
||||
with pytest.raises(TypeError, match="Test error message"):
|
||||
getattr(gb, how)(func)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
|
||||
def test_groupby_raises_string_np(
|
||||
how,
|
||||
by,
|
||||
groupby_series,
|
||||
groupby_func_np,
|
||||
df_with_string_col,
|
||||
using_infer_string,
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_string_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
klass, msg = {
|
||||
np.sum: (None, ""),
|
||||
np.mean: (
|
||||
TypeError,
|
||||
"agg function failed|Cannot perform reduction 'mean' with string dtype",
|
||||
),
|
||||
}[groupby_func_np]
|
||||
|
||||
if using_infer_string:
|
||||
if groupby_func_np is np.mean:
|
||||
klass = TypeError
|
||||
msg = "dtype 'str' does not support operation 'mean'"
|
||||
|
||||
if groupby_series:
|
||||
warn_msg = "using SeriesGroupBy.[sum|mean]"
|
||||
else:
|
||||
warn_msg = "using DataFrameGroupBy.[sum|mean]"
|
||||
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
|
||||
def test_groupby_raises_datetime(
|
||||
how, by, groupby_series, groupby_func, df_with_datetime_col
|
||||
):
|
||||
df = df_with_datetime_col
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
if groupby_func == "corrwith":
|
||||
assert not hasattr(gb, "corrwith")
|
||||
return
|
||||
|
||||
klass, msg = {
|
||||
"all": (None, ""),
|
||||
"any": (None, ""),
|
||||
"bfill": (None, ""),
|
||||
"corrwith": (TypeError, "cannot perform __mul__ with this index type"),
|
||||
"count": (None, ""),
|
||||
"cumcount": (None, ""),
|
||||
"cummax": (None, ""),
|
||||
"cummin": (None, ""),
|
||||
"cumprod": (TypeError, "datetime64 type does not support cumprod operations"),
|
||||
"cumsum": (TypeError, "datetime64 type does not support cumsum operations"),
|
||||
"diff": (None, ""),
|
||||
"ffill": (None, ""),
|
||||
"fillna": (None, ""),
|
||||
"first": (None, ""),
|
||||
"idxmax": (None, ""),
|
||||
"idxmin": (None, ""),
|
||||
"last": (None, ""),
|
||||
"max": (None, ""),
|
||||
"mean": (None, ""),
|
||||
"median": (None, ""),
|
||||
"min": (None, ""),
|
||||
"ngroup": (None, ""),
|
||||
"nunique": (None, ""),
|
||||
"pct_change": (TypeError, "cannot perform __truediv__ with this index type"),
|
||||
"prod": (TypeError, "datetime64 type does not support prod"),
|
||||
"quantile": (None, ""),
|
||||
"rank": (None, ""),
|
||||
"sem": (None, ""),
|
||||
"shift": (None, ""),
|
||||
"size": (None, ""),
|
||||
"skew": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
r"dtype datetime64\[ns\] does not support reduction",
|
||||
"datetime64 type does not support skew operations",
|
||||
]
|
||||
),
|
||||
),
|
||||
"std": (None, ""),
|
||||
"sum": (TypeError, "datetime64 type does not support sum operations"),
|
||||
"var": (TypeError, "datetime64 type does not support var operations"),
|
||||
}[groupby_func]
|
||||
|
||||
if groupby_func in ["any", "all"]:
|
||||
warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated"
|
||||
elif groupby_func == "fillna":
|
||||
kind = "Series" if groupby_series else "DataFrame"
|
||||
warn_msg = f"{kind}GroupBy.fillna is deprecated"
|
||||
else:
|
||||
warn_msg = ""
|
||||
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
def test_groupby_raises_datetime_udf(how, by, groupby_series, df_with_datetime_col):
|
||||
df = df_with_datetime_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
def func(x):
|
||||
raise TypeError("Test error message")
|
||||
|
||||
with pytest.raises(TypeError, match="Test error message"):
|
||||
getattr(gb, how)(func)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
|
||||
def test_groupby_raises_datetime_np(
|
||||
how, by, groupby_series, groupby_func_np, df_with_datetime_col
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_datetime_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
klass, msg = {
|
||||
np.sum: (TypeError, "datetime64 type does not support sum operations"),
|
||||
np.mean: (None, ""),
|
||||
}[groupby_func_np]
|
||||
|
||||
if groupby_series:
|
||||
warn_msg = "using SeriesGroupBy.[sum|mean]"
|
||||
else:
|
||||
warn_msg = "using DataFrameGroupBy.[sum|mean]"
|
||||
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"])
|
||||
def test_groupby_raises_timedelta(func, df_with_timedelta_col):
|
||||
df = df_with_timedelta_col
|
||||
gb = df.groupby(by="a")
|
||||
|
||||
_call_and_check(
|
||||
TypeError,
|
||||
"timedelta64 type does not support .* operations",
|
||||
"method",
|
||||
gb,
|
||||
func,
|
||||
[],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
|
||||
def test_groupby_raises_category(
|
||||
how, by, groupby_series, groupby_func, using_copy_on_write, df_with_cat_col
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_cat_col
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
if groupby_func == "corrwith":
|
||||
assert not hasattr(gb, "corrwith")
|
||||
return
|
||||
|
||||
klass, msg = {
|
||||
"all": (None, ""),
|
||||
"any": (None, ""),
|
||||
"bfill": (None, ""),
|
||||
"corrwith": (
|
||||
TypeError,
|
||||
r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
|
||||
),
|
||||
"count": (None, ""),
|
||||
"cumcount": (None, ""),
|
||||
"cummax": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cummax operations|"
|
||||
"category dtype not supported|"
|
||||
"cummax is not supported for category dtype)",
|
||||
),
|
||||
"cummin": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cummin operations|"
|
||||
"category dtype not supported|"
|
||||
"cummin is not supported for category dtype)",
|
||||
),
|
||||
"cumprod": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cumprod operations|"
|
||||
"category dtype not supported|"
|
||||
"cumprod is not supported for category dtype)",
|
||||
),
|
||||
"cumsum": (
|
||||
(NotImplementedError, TypeError),
|
||||
"(category type does not support cumsum operations|"
|
||||
"category dtype not supported|"
|
||||
"cumsum is not supported for category dtype)",
|
||||
),
|
||||
"diff": (
|
||||
TypeError,
|
||||
r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'",
|
||||
),
|
||||
"ffill": (None, ""),
|
||||
"fillna": (
|
||||
TypeError,
|
||||
r"Cannot setitem on a Categorical with a new category \(0\), "
|
||||
"set the categories first",
|
||||
)
|
||||
if not using_copy_on_write
|
||||
else (None, ""), # no-op with CoW
|
||||
"first": (None, ""),
|
||||
"idxmax": (None, ""),
|
||||
"idxmin": (None, ""),
|
||||
"last": (None, ""),
|
||||
"max": (None, ""),
|
||||
"mean": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'mean'",
|
||||
"category dtype does not support aggregation 'mean'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"median": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'median'",
|
||||
"category dtype does not support aggregation 'median'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"min": (None, ""),
|
||||
"ngroup": (None, ""),
|
||||
"nunique": (None, ""),
|
||||
"pct_change": (
|
||||
TypeError,
|
||||
r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'",
|
||||
),
|
||||
"prod": (TypeError, "category type does not support prod operations"),
|
||||
"quantile": (TypeError, "No matching signature found"),
|
||||
"rank": (None, ""),
|
||||
"sem": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'sem'",
|
||||
"category dtype does not support aggregation 'sem'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"shift": (None, ""),
|
||||
"size": (None, ""),
|
||||
"skew": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"dtype category does not support reduction 'skew'",
|
||||
"category type does not support skew operations",
|
||||
]
|
||||
),
|
||||
),
|
||||
"std": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'std'",
|
||||
"category dtype does not support aggregation 'std'",
|
||||
]
|
||||
),
|
||||
),
|
||||
"sum": (TypeError, "category type does not support sum operations"),
|
||||
"var": (
|
||||
TypeError,
|
||||
"|".join(
|
||||
[
|
||||
"'Categorical' .* does not support reduction 'var'",
|
||||
"category dtype does not support aggregation 'var'",
|
||||
]
|
||||
),
|
||||
),
|
||||
}[groupby_func]
|
||||
|
||||
if groupby_func == "fillna":
|
||||
kind = "Series" if groupby_series else "DataFrame"
|
||||
warn_msg = f"{kind}GroupBy.fillna is deprecated"
|
||||
else:
|
||||
warn_msg = ""
|
||||
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
def test_groupby_raises_category_udf(how, by, groupby_series, df_with_cat_col):
|
||||
# GH#50749
|
||||
df = df_with_cat_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
def func(x):
|
||||
raise TypeError("Test error message")
|
||||
|
||||
with pytest.raises(TypeError, match="Test error message"):
|
||||
getattr(gb, how)(func)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
|
||||
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
|
||||
def test_groupby_raises_category_np(
|
||||
how, by, groupby_series, groupby_func_np, df_with_cat_col
|
||||
):
|
||||
# GH#50749
|
||||
df = df_with_cat_col
|
||||
gb = df.groupby(by=by)
|
||||
|
||||
if groupby_series:
|
||||
gb = gb["d"]
|
||||
|
||||
klass, msg = {
|
||||
np.sum: (TypeError, "category type does not support sum operations"),
|
||||
np.mean: (
|
||||
TypeError,
|
||||
"category dtype does not support aggregation 'mean'",
|
||||
),
|
||||
}[groupby_func_np]
|
||||
|
||||
if groupby_series:
|
||||
warn_msg = "using SeriesGroupBy.[sum|mean]"
|
||||
else:
|
||||
warn_msg = "using DataFrameGroupBy.[sum|mean]"
|
||||
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category_on_category(
    how,
    by,
    groupby_series,
    groupby_func,
    observed,
    using_copy_on_write,
    df_with_cat_col,
):
    # GH#50749
    df = df_with_cat_col
    df["a"] = Categorical(
        ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby(by=by, observed=observed)

    if groupby_series:
        gb = gb["d"]

        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    empty_groups = not observed and any(group.empty for group in gb.groups.values())
    if (
        not observed
        and how != "transform"
        and isinstance(by, list)
        and isinstance(by[0], str)
        and by == ["a", "b"]
    ):
        assert not empty_groups
        # TODO: empty_groups should be true due to unobserved categorical combinations
        empty_groups = True
    if how == "transform":
        # empty groups will be ignored
        empty_groups = False

    klass, msg = {
        "all": (None, ""),
        "any": (None, ""),
        "bfill": (None, ""),
        "corrwith": (
            TypeError,
            r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
        ),
        "count": (None, ""),
        "cumcount": (None, ""),
        "cummax": (
            (NotImplementedError, TypeError),
            "(cummax is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cummax operations)",
        ),
        "cummin": (
            (NotImplementedError, TypeError),
            "(cummin is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cummin operations)",
        ),
        "cumprod": (
            (NotImplementedError, TypeError),
            "(cumprod is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cumprod operations)",
        ),
        "cumsum": (
            (NotImplementedError, TypeError),
            "(cumsum is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cumsum operations)",
        ),
        "diff": (TypeError, "unsupported operand type"),
        "ffill": (None, ""),
        "fillna": (
            TypeError,
            r"Cannot setitem on a Categorical with a new category \(0\), "
            "set the categories first",
        )
        if not using_copy_on_write
        else (None, ""),  # no-op with CoW
        "first": (None, ""),
        "idxmax": (ValueError, "empty group due to unobserved categories")
        if empty_groups
        else (None, ""),
        "idxmin": (ValueError, "empty group due to unobserved categories")
        if empty_groups
        else (None, ""),
        "last": (None, ""),
        "max": (None, ""),
        "mean": (TypeError, "category dtype does not support aggregation 'mean'"),
        "median": (TypeError, "category dtype does not support aggregation 'median'"),
        "min": (None, ""),
        "ngroup": (None, ""),
        "nunique": (None, ""),
        "pct_change": (TypeError, "unsupported operand type"),
        "prod": (TypeError, "category type does not support prod operations"),
        "quantile": (TypeError, "No matching signature found"),
        "rank": (None, ""),
        "sem": (
            TypeError,
            "|".join(
                [
                    "'Categorical' .* does not support reduction 'sem'",
                    "category dtype does not support aggregation 'sem'",
                ]
            ),
        ),
        "shift": (None, ""),
        "size": (None, ""),
        "skew": (
            TypeError,
            "|".join(
                [
                    "category type does not support skew operations",
                    "dtype category does not support reduction 'skew'",
                ]
            ),
        ),
        "std": (
            TypeError,
            "|".join(
                [
                    "'Categorical' .* does not support reduction 'std'",
                    "category dtype does not support aggregation 'std'",
                ]
            ),
        ),
        "sum": (TypeError, "category type does not support sum operations"),
        "var": (
            TypeError,
            "|".join(
                [
                    "'Categorical' .* does not support reduction 'var'",
                    "category dtype does not support aggregation 'var'",
                ]
            ),
        ),
    }[groupby_func]

    if groupby_func == "fillna":
        kind = "Series" if groupby_series else "DataFrame"
        warn_msg = f"{kind}GroupBy.fillna is deprecated"
    else:
        warn_msg = ""
    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)


def test_subsetting_columns_axis_1_raises():
    # GH 35443
    df = DataFrame({"a": [1], "b": [2], "c": [3]})
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby("a", axis=1)
    with pytest.raises(ValueError, match="Cannot subset columns when using axis=1"):
        gb["b"]
lib/python3.11/site-packages/pandas/tests/groupby/test_reductions.py
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,968 @@
"""
test with the TimeGrouper / grouping with datetimes
"""

from datetime import (
    datetime,
    timedelta,
)

import numpy as np
import pytest
import pytz

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
    DataFrame,
    DatetimeIndex,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
    offsets,
)
import pandas._testing as tm
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper


@pytest.fixture
def frame_for_truncated_bingrouper():
    """
    DataFrame used by groupby_with_truncated_bingrouper, made into
    a separate fixture for easier reuse in
    test_groupby_apply_timegrouper_with_nat_apply_squeeze
    """
    df = DataFrame(
        {
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                Timestamp(2013, 9, 1, 13, 0),
                Timestamp(2013, 9, 1, 13, 5),
                Timestamp(2013, 10, 1, 20, 0),
                Timestamp(2013, 10, 3, 10, 0),
                pd.NaT,
                Timestamp(2013, 9, 2, 14, 0),
            ],
        }
    )
    return df


@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy object such that gb._grouper is a BinGrouper and
    len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq)

    Aggregations on this groupby should have

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    as either the index or an index level.
    """
    df = frame_for_truncated_bingrouper

    tdg = Grouper(key="Date", freq="5D")
    gb = df.groupby(tdg)

    # check we're testing the case we're interested in
    assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq)

    return gb


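# Editor's note (illustrative sketch, assumptions flagged): the NaT entry in
# the fixture frame appears to be what makes result_index shorter than
# group_keys_seq, since NaT rows drop out of the 5-day binning (see the
# GH#43500 tests further down). A minimal standalone exploration:
#
#     import pandas as pd
#     df = pd.DataFrame(
#         {"Quantity": [18, 3], "Date": [pd.Timestamp("2013-09-01"), pd.NaT]}
#     )
#     gb = df.groupby(pd.Grouper(key="Date", freq="5D"))
#     gb.sum()  # the NaT row is excluded; only the 2013-09-01 bin remains

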
class TestGroupBy:
    # TODO(infer_string) resample sum introduces 0's
    # https://github.com/pandas-dev/pandas/issues/60229
    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
    def test_groupby_with_timegrouper(self):
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame(
            {
                "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by="Quantity")

        for df in [df_original, df_reordered]:
            df = df.set_index(["Date"])

            exp_dti = date_range(
                "20130901",
                "20131205",
                freq="5D",
                name="Date",
                inclusive="left",
                unit=df.index.unit,
            )
            expected = DataFrame(
                {"Buyer": 0, "Quantity": 0},
                index=exp_dti,
            )
            # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
            expected = expected.astype({"Buyer": object})
            expected.iloc[0, 0] = "CarlCarlCarl"
            expected.iloc[6, 0] = "CarlCarl"
            expected.iloc[18, 0] = "Joe"
            expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")

            result1 = df.resample("5D").sum()
            tm.assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result2, expected)

            result3 = df.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result3, expected)

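    # Editor's note (illustrative): the three results above exercise the
    # equivalence that, for a DatetimeIndex, `df.resample("5D").sum()` and
    # `df.groupby(Grouper(freq="5D")).sum()` produce the same binned frame,
    # whether or not the frame is pre-sorted.
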
@pytest.mark.parametrize("should_sort", [True, False])
|
||||
def test_groupby_with_timegrouper_methods(self, should_sort):
|
||||
# GH 3881
|
||||
# make sure API of timegrouper conforms
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 8, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 13, 0),
|
||||
datetime(2013, 1, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 12, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
if should_sort:
|
||||
df = df.sort_values(by="Quantity", ascending=False)
|
||||
|
||||
df = df.set_index("Date", drop=False)
|
||||
g = df.groupby(Grouper(freq="6ME"))
|
||||
assert g.group_keys
|
||||
|
||||
assert isinstance(g._grouper, BinGrouper)
|
||||
groups = g.groups
|
||||
assert isinstance(groups, dict)
|
||||
assert len(groups) == 3
|
||||
|
||||
    def test_timegrouper_with_reg_groups(self):
        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            msg = "The default value of numeric_only"
            result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            expected = DataFrame(
                {
                    "Buyer": "Carl Mark Carl Joe".split(),
                    "Quantity": [1, 3, 9, 18],
                    "Date": [
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 10, 1, 13, 0),
                    datetime(2013, 10, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 2, 12, 0),
                    datetime(2013, 10, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark Carl Joe".split(),
                    "Quantity": [6, 8, 3, 4, 10],
                    "Date": [
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True)
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            tm.assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
                numeric_only=True
            )
            tm.assert_frame_equal(result, expected)

            with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
                df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum()

            # passing the level
            df = df.set_index("Date")
            result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum(
                numeric_only=True
            )
            tm.assert_frame_equal(result, expected)
            result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum(
                numeric_only=True
            )
            tm.assert_frame_equal(result, expected)

            with pytest.raises(ValueError, match="The level foo is not valid"):
                df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum()

            # multi names
            df = df.copy()
            df["Date"] = df.index + offsets.MonthEnd(2)
            result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
                numeric_only=True
            )
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            tm.assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            msg = "The Grouper cannot specify both a key and a level!"
            with pytest.raises(ValueError, match=msg):
                df.groupby(
                    [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"]
                ).sum()

            # single groupers
            expected = DataFrame(
                [[31]],
                columns=["Quantity"],
                index=DatetimeIndex(
                    [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
                ),
            )
            result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            expected.index = expected.index.shift(1)
            assert expected.index.freq == offsets.MonthEnd()
            result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1ME", key="Date")]).sum(
                numeric_only=True
            )
            tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"])
|
||||
def test_timegrouper_with_reg_groups_freq(self, freq):
|
||||
# GH 6764 multiple grouping with/without sort
|
||||
df = DataFrame(
|
||||
{
|
||||
"date": pd.to_datetime(
|
||||
[
|
||||
"20121002",
|
||||
"20121007",
|
||||
"20130130",
|
||||
"20130202",
|
||||
"20130305",
|
||||
"20121002",
|
||||
"20121207",
|
||||
"20130130",
|
||||
"20130202",
|
||||
"20130305",
|
||||
"20130202",
|
||||
"20130305",
|
||||
]
|
||||
),
|
||||
"user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
|
||||
"whole_cost": [
|
||||
1790,
|
||||
364,
|
||||
280,
|
||||
259,
|
||||
201,
|
||||
623,
|
||||
90,
|
||||
312,
|
||||
359,
|
||||
301,
|
||||
359,
|
||||
801,
|
||||
],
|
||||
"cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
|
||||
}
|
||||
).set_index("date")
|
||||
|
||||
expected = (
|
||||
df.groupby("user_id")["whole_cost"]
|
||||
.resample(freq)
|
||||
.sum(min_count=1) # XXX
|
||||
.dropna()
|
||||
.reorder_levels(["date", "user_id"])
|
||||
.sort_index()
|
||||
.astype("int64")
|
||||
)
|
||||
expected.name = "whole_cost"
|
||||
|
||||
result1 = (
|
||||
df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
|
||||
)
|
||||
tm.assert_series_equal(result1, expected)
|
||||
|
||||
result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
|
||||
tm.assert_series_equal(result2, expected)
|
||||
|
||||
    def test_timegrouper_get_group(self):
        # GH 6914

        df_original = DataFrame(
            {
                "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )
        df_reordered = df_original.sort_values(by="Quantity")

        # single grouping
        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]
        dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="ME", key="Date"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [
            df_original.iloc[[1]],
            df_original.iloc[[3]],
            df_original.iloc[[4]],
        ]
        g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")])
            for (b, t), expected in zip(g_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group((b, dt))
                tm.assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index("Date")
        df_reordered = df_original.sort_values(by="Quantity")

        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="ME"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

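    # Editor's note (illustrative): with a month-end frequency the group keys
    # produced by Grouper(freq="ME") are the bin labels, i.e. the month-end
    # Timestamps ("2013-09-30", ...), which is why get_group above is keyed by
    # Timestamp("2013-09-30") rather than by any observed row value.
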
    def test_timegrouper_apply_return_type_series(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_series(x):
            return Series([x["value"].sum()], ("sum",))

        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series)
        tm.assert_frame_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_timegrouper_apply_return_type_value(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_value(x):
            return x.value.sum()

        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value)
        tm.assert_series_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_groupby_groups_datetimeindex(self):
        # GH#1430
        periods = 1000
        ind = date_range(start="2012/1/1", freq="5min", periods=periods)
        df = DataFrame(
            {"high": np.arange(periods), "low": np.arange(periods)}, index=ind
        )
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(next(iter(groups.keys())), datetime)

    def test_groupby_groups_datetimeindex2(self):
        # GH#11442
        index = date_range("2015/01/01", periods=5, name="date")
        df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level="date").groups
        dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
        expected = {
            Timestamp(date): DatetimeIndex([date], name="date") for date in dates
        }
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level="date")
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, "A"], df.loc[date, "B"]]]
            expected_index = DatetimeIndex(
                [date], name="date", freq="D", dtype=index.dtype
            )
            expected = DataFrame(data, columns=list("AB"), index=expected_index)
            tm.assert_frame_equal(result, expected)

    def test_groupby_groups_datetimeindex_tz(self):
        # GH 3950
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "datetime": dates,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))

        exp_idx1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Pacific",
            name="datetime",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["datetime", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # by level
        didx = DatetimeIndex(dates, tz="Asia/Tokyo")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            tz="Asia/Tokyo",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_frame_datetime64_handling_groupby(self):
        # it works!
        df = DataFrame(
            [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
            columns=["a", "date"],
        )
        result = df.groupby("a").first()
        assert result["date"][3] == Timestamp("2012-07-03")

    def test_groupby_multi_timezone(self):
        # combining multiple / different timezones yields UTC
        df = DataFrame(
            {
                "value": range(5),
                "date": [
                    "2000-01-28 16:47:00",
                    "2000-01-29 16:48:00",
                    "2000-01-30 16:49:00",
                    "2000-01-31 16:50:00",
                    "2000-01-01 16:50:00",
                ],
                "tz": [
                    "America/Chicago",
                    "America/Chicago",
                    "America/Los_Angeles",
                    "America/Chicago",
                    "America/New_York",
                ],
            }
        )

        result = df.groupby("tz", group_keys=False).date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
        )

        expected = Series(
            [
                Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
                Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
            ],
            name="date",
            dtype=object,
        )
        tm.assert_series_equal(result, expected)

        tz = "America/Chicago"
        res_values = df.groupby("tz").date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(
            ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
            index=[0, 1, 3],
            name="date",
        )
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        tm.assert_series_equal(result, expected)

    def test_groupby_groups_periods(self):
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "period": [pd.Period(d, freq="h") for d in dates],
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )

        exp_idx1 = pd.PeriodIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            freq="h",
            name="period",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["period", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq="h")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = pd.PeriodIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            freq="h",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_groupby_first_datetime64(self):
        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
        df[1] = df[1].astype("M8[ns]")

        assert issubclass(df[1].dtype.type, np.datetime64)

        result = df.groupby(level=0).first()
        got_dt = result[1].dtype
        assert issubclass(got_dt.type, np.datetime64)

        result = df[1].groupby(level=0).first()
        got_dt = result.dtype
        assert issubclass(got_dt.type, np.datetime64)

    def test_groupby_max_datetime64(self):
        # GH 5869
        # datetimelike dtype conversion from int
        df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
        # TODO: can we retain second reso in .apply here?
        expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
        result = df.groupby("A")["A"].max()
        tm.assert_series_equal(result, expected)

    def test_groupby_datetime64_32_bit(self):
        # GH 6410 / numpy 4328
        # 32-bit under 1.9-dev indexing issue

        df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
        result = df.groupby("A")["B"].transform("min")
        expected = Series([Timestamp("2000-01-1")] * 2, name="B")
        tm.assert_series_equal(result, expected)

    def test_groupby_with_timezone_selection(self):
        # GH 11616
        # Test that column selection returns output in correct timezone.

        df = DataFrame(
            {
                "factor": np.random.default_rng(2).integers(0, 3, size=60),
                "time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
            }
        )
        df1 = df.groupby("factor").max()["time"]
        df2 = df.groupby("factor")["time"].max()
        tm.assert_series_equal(df1, df2)

    def test_timezone_info(self):
        # see gh-11682: Timezone info lost when broadcasting
        # scalar datetime to DataFrame

        df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
        assert df["b"][0].tzinfo == pytz.utc
        df = DataFrame({"a": [1, 2, 3]})
        df["b"] = datetime.now(pytz.utc)
        assert df["b"][0].tzinfo == pytz.utc

    def test_datetime_count(self):
        df = DataFrame(
            {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")}
        )
        result = df.groupby("a").dates.count()
        expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
        tm.assert_series_equal(result, expected)

    def test_first_last_max_min_on_time_data(self):
        # GH 10295
        # Verify that NaT is not in the result of max, min, first and last on
        # Dataframe with datetime or timedelta values.
        df_test = DataFrame(
            {
                "dt": [
                    np.nan,
                    "2015-07-24 10:10",
                    "2015-07-25 11:11",
                    "2015-07-23 12:12",
                    np.nan,
                ],
                "td": [
                    np.nan,
                    timedelta(days=1),
                    timedelta(days=2),
                    timedelta(days=3),
                    np.nan,
                ],
            }
        )
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test["group"] = "A"
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby("group")
        grouped_ref = df_ref.groupby("group")

        tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
        tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
        tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
        tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())

    def test_nunique_with_timegrouper_and_nat(self):
        # GH 17575
        test = DataFrame(
            {
                "time": [
                    Timestamp("2016-06-28 09:35:35"),
                    pd.NaT,
                    Timestamp("2016-06-28 16:46:28"),
                ],
                "data": ["1", "2", "3"],
            }
        )

        grouper = Grouper(key="time", freq="h")
        result = test.groupby(grouper)["data"].nunique()
        expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
        expected.index = expected.index._with_freq(None)
        tm.assert_series_equal(result, expected)

    def test_scalar_call_versus_list_call(self):
        # Issue: 17530
        data_frame = {
            "location": ["shanghai", "beijing", "shanghai"],
            "time": Series(
                ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
                dtype="datetime64[ns]",
            ),
            "value": [1, 2, 3],
        }
        data_frame = DataFrame(data_frame).set_index("time")
        grouper = Grouper(freq="D")

        grouped = data_frame.groupby(grouper)
        result = grouped.count()
        grouped = data_frame.groupby([grouper])
        expected = grouped.count()

        tm.assert_frame_equal(result, expected)

    def test_grouper_period_index(self):
        # GH 32108
        periods = 2
        index = pd.period_range(
            start="2018-01", periods=periods, freq="M", name="Month"
        )
        period_series = Series(range(periods), index=index)
        result = period_series.groupby(period_series.index.month).sum()

        expected = Series(
            range(periods), index=Index(range(1, periods + 1), name=index.name)
        )
        tm.assert_series_equal(result, expected)

    def test_groupby_apply_timegrouper_with_nat_dict_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq
        # have different lengths that goes through the `isinstance(values[0], dict)`
        # path
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: {"foo": len(x)})

        df = gb.obj
        unit = df["Date"]._values.unit
        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
        mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
        expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_scalar_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # GH#43500 Previously raised ValueError bc used index with incorrect
        # length in wrap_applied_result
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)

        df = gb.obj
        unit = df["Date"]._values.unit
        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
        expected = Series(
            [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
            index=dti._with_freq(None),
            name="Quantity",
        )

        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
        self, frame_for_truncated_bingrouper
    ):
        df = frame_for_truncated_bingrouper

        # We need to create a GroupBy object with only one non-NaT group,
        # so use a huge freq so that all non-NaT dates will be grouped together
        tdg = Grouper(key="Date", freq="100YE")
        gb = df.groupby(tdg)

        # check that we will go through the singular_series path
        # in _wrap_applied_output_series
        assert gb.ngroups == 1
        assert gb._selected_obj._get_axis(gb.axis).nlevels == 1

        # function that returns a Series
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            res = gb.apply(lambda x: x["Quantity"] * 2)

        dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date")
        expected = DataFrame(
            [[36, 6, 6, 10, 2]],
            index=dti,
            columns=Index([0, 1, 5, 2, 3], name="Quantity"),
        )
        tm.assert_frame_equal(res, expected)

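    # Editor's note (illustrative): with freq="100YE" every non-NaT date lands
    # in a single bin, so apply returns one Series and pandas squeezes it into
    # a one-row frame keyed by the bin label. The column order [0, 1, 5, 2, 3]
    # appears to follow the within-bin date order of the original rows (the
    # NaT row at position 4 is excluded from binning).
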
    @pytest.mark.single_cpu
    def test_groupby_agg_numba_timegrouper_with_nat(
        self, groupby_with_truncated_bingrouper
    ):
        pytest.importorskip("numba")

        # See discussion in GH#43487
        gb = groupby_with_truncated_bingrouper

        result = gb["Quantity"].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )

        expected = gb["Quantity"].aggregate("mean")
        tm.assert_series_equal(result, expected)

        result_df = gb[["Quantity"]].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )
        expected_df = gb[["Quantity"]].aggregate("mean")
        tm.assert_frame_equal(result_df, expected_df)
@ -0,0 +1,294 @@
import numpy as np
import pytest

from pandas.compat import is_platform_arm
from pandas.errors import NumbaUtilError

from pandas import (
    DataFrame,
    Series,
    option_context,
)
import pandas._testing as tm
from pandas.util.version import Version

pytestmark = [pytest.mark.single_cpu]

numba = pytest.importorskip("numba")
pytestmark.append(
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
)


def test_correct_function_signature():
    pytest.importorskip("numba")

    def incorrect_function(x):
        return x + 1

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key").transform(incorrect_function, engine="numba")

    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key")["data"].transform(incorrect_function, engine="numba")


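# Editor's sketch of the signature the numba engine expects for a
# groupby-transform UDF: the first two positional arguments must be the
# group's values and its index (the names are free; the positions matter).
#
#     def correct_function(values, index):
#         # operate on the per-group numpy arrays
#         return values + 1
#
#     data.groupby("key").transform(correct_function, engine="numba")

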
def test_check_nopython_kwargs():
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return values + 1

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key").transform(incorrect_function, engine="numba", a=1)

    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key")["data"].transform(incorrect_function, engine="numba", a=1)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    pytest.importorskip("numba")

    def func(values, index):
        return values + 1

    if jit:
        # Test accepted jitted functions
        import numba

        func = numba.jit(func)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")

    tm.assert_equal(result, expected)


@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")
    tm.assert_equal(result, expected)

    result = grouped.transform(func_2, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x * 5, engine="cython")
    tm.assert_equal(result, expected)

    # Retest func_1 which should use the cache
    result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")
    tm.assert_equal(result, expected)


def test_use_global_config():
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)
    expected = grouped.transform(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        result = grouped.transform(func_1, engine=None)
    tm.assert_frame_equal(expected, result)


# TODO: Test more than just reductions (e.g. actually test transformations once we have
@pytest.mark.parametrize(
    "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_string_cython_vs_numba(agg_func, numba_supported_reductions):
    pytest.importorskip("numba")
    agg_func, kwargs = numba_supported_reductions
    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    result = grouped.transform(agg_func, engine="numba", **kwargs)
    expected = grouped.transform(agg_func, engine="cython", **kwargs)
    tm.assert_frame_equal(result, expected)

    result = grouped[1].transform(agg_func, engine="numba", **kwargs)
    expected = grouped[1].transform(agg_func, engine="cython", **kwargs)
    tm.assert_series_equal(result, expected)


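# Editor's note on the test above: the parametrized `agg_func` argument is
# immediately rebound from the `numba_supported_reductions` fixture, so the
# parametrize list only multiplies the number of runs rather than selecting
# the reduction that is actually exercised.

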
def test_args_not_cached():
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]
    result = grouped_x.transform(sum_last, 1, engine="numba")
    expected = Series([1.0] * 4, name="x")
    tm.assert_series_equal(result, expected)

    result = grouped_x.transform(sum_last, 2, engine="numba")
    expected = Series([2.0] * 4, name="x")
    tm.assert_series_equal(result, expected)


def test_index_data_correctly_passed():
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return index - 1

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").transform(f, engine="numba")
    expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3])
    tm.assert_frame_equal(result, expected)


def test_engine_kwargs_not_cached():
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        return nogil + parallel + nopython

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    df = DataFrame({"value": [0, 0, 0]})
    result = df.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)

    nogil = False
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
    tm.assert_frame_equal(result, expected)


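# Editor's note (illustrative): `func_kwargs` closes over the enclosing test's
# nogil/parallel/nopython variables, and numba freezes closure values when it
# compiles. If the jitted function were cached purely by function identity,
# the second call would still return 2.0; recompiling per engine_kwargs set is
# what lets the updated value (nogil = False) surface as 1.0.

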
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby("A").transform(
        numba_func, engine="numba", engine_kwargs=engine_kwargs
    )
    expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
    tm.assert_frame_equal(result, expected)


def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        df.groupby(["A", "B"]).transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )


def test_multilabel_numba_vs_cython(numba_supported_reductions):
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])
    res_agg = gb.transform(reduction, engine="numba", **kwargs)
    expected_agg = gb.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(res_agg, expected_agg)


def test_multilabel_udf_numba_vs_cython():
    pytest.importorskip("numba")
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])
    result = gb.transform(
        lambda values, index: (values - values.min()) / (values.max() - values.min()),
        engine="numba",
    )
    expected = gb.transform(
        lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython"
    )
    tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large
Load Diff