309 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			309 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import numpy as np
 | |
| import pytest
 | |
| 
 | |
| import pandas as pd
 | |
| from pandas import (
 | |
|     DataFrame,
 | |
|     Series,
 | |
|     Timestamp,
 | |
|     date_range,
 | |
| )
 | |
| import pandas._testing as tm
 | |
| 
 | |
| 
 | |
| class TestDataFrameDiff:
 | |
|     def test_diff_requires_integer(self):
 | |
|         df = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
 | |
|         with pytest.raises(ValueError, match="periods must be an integer"):
 | |
|             df.diff(1.5)
 | |
| 
 | |
|     # GH#44572 np.int64 is accepted
 | |
|     @pytest.mark.parametrize("num", [1, np.int64(1)])
 | |
|     def test_diff(self, datetime_frame, num):
 | |
|         df = datetime_frame
 | |
|         the_diff = df.diff(num)
 | |
| 
 | |
|         expected = df["A"] - df["A"].shift(num)
 | |
|         tm.assert_series_equal(the_diff["A"], expected)
 | |
| 
 | |
|     def test_diff_int_dtype(self):
 | |
|         # int dtype
 | |
|         a = 10_000_000_000_000_000
 | |
|         b = a + 1
 | |
|         ser = Series([a, b])
 | |
| 
 | |
|         rs = DataFrame({"s": ser}).diff()
 | |
|         assert rs.s[1] == 1
 | |
| 
 | |
|     def test_diff_mixed_numeric(self, datetime_frame):
 | |
|         # mixed numeric
 | |
|         tf = datetime_frame.astype("float32")
 | |
|         the_diff = tf.diff(1)
 | |
|         tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1))
 | |
| 
 | |
|     def test_diff_axis1_nonconsolidated(self):
 | |
|         # GH#10907
 | |
|         df = DataFrame({"y": Series([2]), "z": Series([3])})
 | |
|         df.insert(0, "x", 1)
 | |
|         result = df.diff(axis=1)
 | |
|         expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)})
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_timedelta64_with_nat(self):
 | |
|         # GH#32441
 | |
|         arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
 | |
|         arr[:, 0] = np.timedelta64("NaT", "ns")
 | |
| 
 | |
|         df = DataFrame(arr)
 | |
|         result = df.diff(1, axis=0)
 | |
| 
 | |
|         expected = DataFrame({0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]})
 | |
|         tm.assert_equal(result, expected)
 | |
| 
 | |
|         result = df.diff(0)
 | |
|         expected = df - df
 | |
|         assert expected[0].isna().all()
 | |
|         tm.assert_equal(result, expected)
 | |
| 
 | |
|         result = df.diff(-1, axis=1)
 | |
|         expected = df * np.nan
 | |
|         tm.assert_equal(result, expected)
 | |
| 
 | |
|     @pytest.mark.parametrize("tz", [None, "UTC"])
 | |
|     def test_diff_datetime_axis0_with_nat(self, tz, unit):
 | |
|         # GH#32441
 | |
|         dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz).as_unit(unit)
 | |
|         ser = Series(dti)
 | |
| 
 | |
|         df = ser.to_frame()
 | |
| 
 | |
|         result = df.diff()
 | |
|         ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]).as_unit(
 | |
|             unit
 | |
|         )
 | |
|         expected = Series(ex_index).to_frame()
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     @pytest.mark.parametrize("tz", [None, "UTC"])
 | |
|     def test_diff_datetime_with_nat_zero_periods(self, tz):
 | |
|         # diff on NaT values should give NaT, not timedelta64(0)
 | |
|         dti = date_range("2016-01-01", periods=4, tz=tz)
 | |
|         ser = Series(dti)
 | |
|         df = ser.to_frame().copy()
 | |
| 
 | |
|         df[1] = ser.copy()
 | |
| 
 | |
|         df.iloc[:, 0] = pd.NaT
 | |
| 
 | |
|         expected = df - df
 | |
|         assert expected[0].isna().all()
 | |
| 
 | |
|         result = df.diff(0, axis=0)
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|         result = df.diff(0, axis=1)
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     @pytest.mark.parametrize("tz", [None, "UTC"])
 | |
|     def test_diff_datetime_axis0(self, tz):
 | |
|         # GH#18578
 | |
|         df = DataFrame(
 | |
|             {
 | |
|                 0: date_range("2010", freq="D", periods=2, tz=tz),
 | |
|                 1: date_range("2010", freq="D", periods=2, tz=tz),
 | |
|             }
 | |
|         )
 | |
| 
 | |
|         result = df.diff(axis=0)
 | |
|         expected = DataFrame(
 | |
|             {
 | |
|                 0: pd.TimedeltaIndex(["NaT", "1 days"]),
 | |
|                 1: pd.TimedeltaIndex(["NaT", "1 days"]),
 | |
|             }
 | |
|         )
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     @pytest.mark.parametrize("tz", [None, "UTC"])
 | |
|     def test_diff_datetime_axis1(self, tz):
 | |
|         # GH#18578
 | |
|         df = DataFrame(
 | |
|             {
 | |
|                 0: date_range("2010", freq="D", periods=2, tz=tz),
 | |
|                 1: date_range("2010", freq="D", periods=2, tz=tz),
 | |
|             }
 | |
|         )
 | |
| 
 | |
|         result = df.diff(axis=1)
 | |
|         expected = DataFrame(
 | |
|             {
 | |
|                 0: pd.TimedeltaIndex(["NaT", "NaT"]),
 | |
|                 1: pd.TimedeltaIndex(["0 days", "0 days"]),
 | |
|             }
 | |
|         )
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_timedelta(self, unit):
 | |
|         # GH#4533
 | |
|         df = DataFrame(
 | |
|             {
 | |
|                 "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
 | |
|                 "value": [1.0, 2.0],
 | |
|             }
 | |
|         )
 | |
|         df["time"] = df["time"].dt.as_unit(unit)
 | |
| 
 | |
|         res = df.diff()
 | |
|         exp = DataFrame(
 | |
|             [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]
 | |
|         )
 | |
|         exp["time"] = exp["time"].dt.as_unit(unit)
 | |
|         tm.assert_frame_equal(res, exp)
 | |
| 
 | |
|     def test_diff_mixed_dtype(self):
 | |
|         df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
 | |
|         df["A"] = np.array([1, 2, 3, 4, 5], dtype=object)
 | |
| 
 | |
|         result = df.diff()
 | |
|         assert result[0].dtype == np.float64
 | |
| 
 | |
|     def test_diff_neg_n(self, datetime_frame):
 | |
|         rs = datetime_frame.diff(-1)
 | |
|         xp = datetime_frame - datetime_frame.shift(-1)
 | |
|         tm.assert_frame_equal(rs, xp)
 | |
| 
 | |
|     def test_diff_float_n(self, datetime_frame):
 | |
|         rs = datetime_frame.diff(1.0)
 | |
|         xp = datetime_frame.diff(1)
 | |
|         tm.assert_frame_equal(rs, xp)
 | |
| 
 | |
|     def test_diff_axis(self):
 | |
|         # GH#9727
 | |
|         df = DataFrame([[1.0, 2.0], [3.0, 4.0]])
 | |
|         tm.assert_frame_equal(
 | |
|             df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]])
 | |
|         )
 | |
|         tm.assert_frame_equal(
 | |
|             df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])
 | |
|         )
 | |
| 
 | |
|     def test_diff_period(self):
 | |
|         # GH#32995 Don't pass an incorrect axis
 | |
|         pi = date_range("2016-01-01", periods=3).to_period("D")
 | |
|         df = DataFrame({"A": pi})
 | |
| 
 | |
|         result = df.diff(1, axis=1)
 | |
| 
 | |
|         expected = (df - pd.NaT).astype(object)
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_axis1_mixed_dtypes(self):
 | |
|         # GH#32995 operate column-wise when we have mixed dtypes and axis=1
 | |
|         df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
 | |
| 
 | |
|         expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2})
 | |
| 
 | |
|         result = df.diff(axis=1)
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|         # GH#21437 mixed-float-dtypes
 | |
|         df = DataFrame(
 | |
|             {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
 | |
|         )
 | |
|         result = df.diff(axis=1)
 | |
|         expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_axis1_mixed_dtypes_large_periods(self):
 | |
|         # GH#32995 operate column-wise when we have mixed dtypes and axis=1
 | |
|         df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
 | |
| 
 | |
|         expected = df * np.nan
 | |
| 
 | |
|         result = df.diff(axis=1, periods=3)
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_axis1_mixed_dtypes_negative_periods(self):
 | |
|         # GH#32995 operate column-wise when we have mixed dtypes and axis=1
 | |
|         df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
 | |
| 
 | |
|         expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan})
 | |
| 
 | |
|         result = df.diff(axis=1, periods=-1)
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_sparse(self):
 | |
|         # GH#28813 .diff() should work for sparse dataframes as well
 | |
|         sparse_df = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]")
 | |
| 
 | |
|         result = sparse_df.diff()
 | |
|         expected = DataFrame(
 | |
|             [[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0)
 | |
|         )
 | |
| 
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     @pytest.mark.parametrize(
 | |
|         "axis,expected",
 | |
|         [
 | |
|             (
 | |
|                 0,
 | |
|                 DataFrame(
 | |
|                     {
 | |
|                         "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0],
 | |
|                         "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan],
 | |
|                         "c": np.repeat(np.nan, 8),
 | |
|                         "d": [np.nan, 3, 5, 7, 9, 11, 13, 15],
 | |
|                     },
 | |
|                     dtype="Int64",
 | |
|                 ),
 | |
|             ),
 | |
|             (
 | |
|                 1,
 | |
|                 DataFrame(
 | |
|                     {
 | |
|                         "a": np.repeat(np.nan, 8),
 | |
|                         "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0],
 | |
|                         "c": np.repeat(np.nan, 8),
 | |
|                         "d": np.repeat(np.nan, 8),
 | |
|                     },
 | |
|                     dtype="Int64",
 | |
|                 ),
 | |
|             ),
 | |
|         ],
 | |
|     )
 | |
|     def test_diff_integer_na(self, axis, expected):
 | |
|         # GH#24171 IntegerNA Support for DataFrame.diff()
 | |
|         df = DataFrame(
 | |
|             {
 | |
|                 "a": np.repeat([0, 1, np.nan, 2], 2),
 | |
|                 "b": np.tile([0, 1, np.nan, 2], 2),
 | |
|                 "c": np.repeat(np.nan, 8),
 | |
|                 "d": np.arange(1, 9) ** 2,
 | |
|             },
 | |
|             dtype="Int64",
 | |
|         )
 | |
| 
 | |
|         # Test case for default behaviour of diff
 | |
|         result = df.diff(axis=axis)
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_readonly(self):
 | |
|         # https://github.com/pandas-dev/pandas/issues/35559
 | |
|         arr = np.random.default_rng(2).standard_normal((5, 2))
 | |
|         arr.flags.writeable = False
 | |
|         df = DataFrame(arr)
 | |
|         result = df.diff()
 | |
|         expected = DataFrame(np.array(df)).diff()
 | |
|         tm.assert_frame_equal(result, expected)
 | |
| 
 | |
|     def test_diff_all_int_dtype(self, any_int_numpy_dtype):
 | |
|         # GH 14773
 | |
|         df = DataFrame(range(5))
 | |
|         df = df.astype(any_int_numpy_dtype)
 | |
|         result = df.diff()
 | |
|         expected_dtype = (
 | |
|             "float32" if any_int_numpy_dtype in ("int8", "int16") else "float64"
 | |
|         )
 | |
|         expected = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=expected_dtype)
 | |
|         tm.assert_frame_equal(result, expected)
 |