A PyQt GUI application for converting InfoLease report outputs into Excel files. It handles parsing and summarizing, learns where files are meant to be stored, and compiles monthly and yearly summaries.
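As a rough sketch of that conversion flow: parse a fixed-width InfoLease report dump into a DataFrame, write it out as an Excel workbook, and roll parsed frames up into a summary. Every specific below is an assumption for illustration, not taken from this repository — the field widths, the column names contract/lessee/amount, the skiprows count, and the helper names report_to_excel and monthly_summary are hypothetical stand-ins, and the GUI and learned storage locations are not sketched.

from pathlib import Path

import pandas as pd


def report_to_excel(report_path: str, out_dir: str) -> Path:
    """Parse one fixed-width report dump and save it as .xlsx (illustrative layout)."""
    df = pd.read_fwf(
        report_path,
        colspecs=[(0, 12), (12, 32), (32, 44)],  # assumed field positions
        names=["contract", "lessee", "amount"],  # assumed column names
        skiprows=4,  # assumed banner lines at the top of the report
    )
    # Amounts arrive as text in the report dump; non-numeric cells become NaN.
    df["amount"] = pd.to_numeric(df["amount"], errors="coerce")
    out_path = Path(out_dir) / (Path(report_path).stem + ".xlsx")
    df.to_excel(out_path, index=False)
    return out_path


def monthly_summary(frames: list[pd.DataFrame]) -> pd.DataFrame:
    """Compile one month's parsed reports into per-contract totals."""
    combined = pd.concat(frames, ignore_index=True)
    return combined.groupby("contract")["amount"].sum().to_frame("monthly_total")

A yearly summary would take the same shape, concatenating the monthly frames and grouping again. The file below is part of the project's bundled virtual environment (pandas' groupby test suite), not application code.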
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/groupby/test_function.py


import builtins
from io import StringIO

import numpy as np
import pytest

from pandas.errors import UnsupportedFunctionCall

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
import pandas.core.nanops as nanops
from pandas.util import _test_decorators as td


@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
    """
    Fixture of dtypes with min and max values used for testing
    cummin and cummax
    """
    dtype = request.param

    np_type = dtype
    if dtype == "Int64":
        np_type = np.int64
    elif dtype == "Float64":
        np_type = np.float64

    min_val = (
        np.iinfo(np_type).min
        if np.dtype(np_type).kind == "i"
        else np.finfo(np_type).min
    )
    max_val = (
        np.iinfo(np_type).max
        if np.dtype(np_type).kind == "i"
        else np.finfo(np_type).max
    )

    return (dtype, min_val, max_val)


def test_intercept_builtin_sum():
    s = Series([1.0, 2.0, np.nan, 3.0])
    grouped = s.groupby([0, 1, 2, 2])

    result = grouped.agg(builtins.sum)
    result2 = grouped.apply(builtins.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result2, expected)


@pytest.mark.parametrize("f", [max, min, sum])
@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]])  # Single key  # Multi-key
def test_builtins_apply(keys, f):
    # see gh-8155
    df = DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"])
    df["jolie"] = np.random.randn(1000)

    gb = df.groupby(keys)

    fname = f.__name__
    result = gb.apply(f)
    ngroups = len(df.drop_duplicates(subset=keys))

    assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
    assert result.shape == (ngroups, 3), assert_msg

    npfunc = getattr(np, fname)  # numpy's equivalent function
    if f in [max, min]:
        warn = FutureWarning
    else:
        warn = None
    msg = "scalar (max|min) over the entire DataFrame"
    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
        # stacklevel can be thrown off because (i think) the stack
        # goes through some of numpy's C code.
        expected = gb.apply(npfunc)
    tm.assert_frame_equal(result, expected)

    with tm.assert_produces_warning(None):
        expected2 = gb.apply(lambda x: npfunc(x, axis=0))
    tm.assert_frame_equal(result, expected2)

    if f != sum:
        expected = gb.agg(fname).reset_index()
        expected.set_index(keys, inplace=True, drop=False)
        tm.assert_frame_equal(result, expected, check_dtype=False)

    tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())


class TestNumericOnly:
    # make sure that we are passing thru kwargs to our agg functions

    @pytest.fixture
    def df(self):
        # GH3668
        # GH5724
        df = DataFrame(
            {
                "group": [1, 1, 2],
                "int": [1, 2, 3],
                "float": [4.0, 5.0, 6.0],
                "string": list("abc"),
                "category_string": Series(list("abc")).astype("category"),
                "category_int": [7, 8, 9],
                "datetime": date_range("20130101", periods=3),
                "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
                "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
            },
            columns=[
                "group",
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ],
        )
        return df

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        # mean / median
        expected_columns_numeric = Index(["int", "float", "category_int"])

        gb = df.groupby("group")
        expected = DataFrame(
            {
                "category_int": [7.5, 9],
                "float": [4.5, 6.0],
                "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
                "int": [1.5, 3],
                "datetime": [
                    Timestamp("2013-01-01 12:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                "datetimetz": [
                    Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                    Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
                ],
            },
            index=Index([1, 2], name="group"),
            columns=[
                "int",
                "float",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ],
        )

        with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
            result = getattr(gb, method)(numeric_only=False)

        tm.assert_frame_equal(result.reindex_like(expected), expected)

        expected_columns = expected.columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        # TODO: min, max *should* handle
        # categorical (ordered) dtype
        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        expected_columns_numeric = Index(["int", "float", "category_int"])
        expected_columns = Index(
            ["int", "float", "string", "category_int", "timedelta"]
        )
        if method == "cumsum":
            # cumsum loses string
            expected_columns = Index(["int", "float", "category_int", "timedelta"])

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        expected_columns = Index(["int", "float", "category_int"])
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        # like min, max, but don't include strings
        expected_columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )

        # GH#15561: numeric_only=False set by default like min/max
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        gb = df.groupby("group")

        # cummin, cummax dont have numeric_only kwarg, always use False
        warn = None
        if method in ["cummin", "cummax"]:
            # these dont have numeric_only kwarg, always use False
            warn = FutureWarning
        elif method in ["min", "max"]:
            # these have numeric_only kwarg, but default to False
            warn = FutureWarning

        with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
            result = getattr(gb, method)()

        tm.assert_index_equal(result.columns, expected_columns_numeric)

        # GH#41475 deprecated silently ignoring nuisance columns
        warn = None
        if len(expected_columns) < len(gb._obj_with_exclusions.columns):
            warn = FutureWarning
        with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
            result = getattr(gb, method)(numeric_only=False)

        tm.assert_index_equal(result.columns, expected_columns)


class TestGroupByNonCythonPaths:
    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        df = DataFrame(
            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
            columns=["A", "B", "C"],
        )
        return df

    @pytest.fixture
    def gb(self, df):
        gb = df.groupby("A")
        return gb

    @pytest.fixture
    def gni(self, df):
        gni = df.groupby("A", as_index=False)
        return gni

    # TODO: non-unique columns, as_index=False

    @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
    def test_idxmax(self, gb):
        # object dtype so idxmax goes through _aggregate_item_by_item
        # GH#5610
        # non-cython calls should not include the grouper
        expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
        expected.index.name = "A"
        result = gb.idxmax()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
    def test_idxmin(self, gb):
        # object dtype so idxmax goes through _aggregate_item_by_item
        # GH#5610
        # non-cython calls should not include the grouper
        expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
        expected.index.name = "A"
        result = gb.idxmin()
        tm.assert_frame_equal(result, expected)

    def test_mad(self, gb, gni):
        # mad
        expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
        expected.index.name = "A"
        result = gb.mad()
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
        result = gni.mad()
        tm.assert_frame_equal(result, expected)

    def test_describe(self, df, gb, gni):
        # describe
        expected_index = Index([1, 3], name="A")
        expected_col = MultiIndex(
            levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
            codes=[[0] * 8, list(range(8))],
        )
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=expected_index,
            columns=expected_col,
        )
        result = gb.describe()
        tm.assert_frame_equal(result, expected)

        expected = pd.concat(
            [
                df[df.A == 1].describe().unstack().to_frame().T,
                df[df.A == 3].describe().unstack().to_frame().T,
            ]
        )
        expected.index = Index([0, 1])
        result = gni.describe()
        tm.assert_frame_equal(result, expected)


def test_cython_api2():
    # this takes the fast apply path

    # cumsum (GH5614)
    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    result = df.groupby("A").cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    result = df.groupby("A", as_index=False).cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 13994
    result = df.groupby("A").cumsum(axis=1)
    expected = df.cumsum(axis=1)
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A").cumprod(axis=1)
    expected = df.cumprod(axis=1)
    tm.assert_frame_equal(result, expected)


def test_cython_median():
    df = DataFrame(np.random.randn(1000))
    df.values[::2] = np.nan

    labels = np.random.randint(0, 50, size=1000).astype(float)
    labels[::17] = np.nan

    result = df.groupby(labels).median()
    exp = df.groupby(labels).agg(nanops.nanmedian)
    tm.assert_frame_equal(result, exp)

    df = DataFrame(np.random.randn(1000, 5))
    rs = df.groupby(labels).agg(np.median)
    xp = df.groupby(labels).median()
    tm.assert_frame_equal(rs, xp)


def test_median_empty_bins(observed):
    df = DataFrame(np.random.randint(0, 44, 500))

    grps = range(0, 55, 5)
    bins = pd.cut(df[0], grps)

    result = df.groupby(bins, observed=observed).median()
    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
)
@pytest.mark.parametrize(
    "method,data",
    [
        ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
        ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
        ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
        ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
        ("nth", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}], "args": [1]}),
        ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
    ],
)
def test_groupby_non_arithmetic_agg_types(dtype, method, data):
    # GH9311, GH6620
    df = DataFrame(
        [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
    )

    df["b"] = df.b.astype(dtype)

    if "args" not in data:
        data["args"] = []

    if "out_type" in data:
        out_type = data["out_type"]
    else:
        out_type = dtype

    exp = data["df"]
    df_out = DataFrame(exp)

    df_out["b"] = df_out.b.astype(out_type)
    df_out.set_index("a", inplace=True)

    grpd = df.groupby("a")
    t = getattr(grpd, method)(*data["args"])
    tm.assert_frame_equal(t, df_out)


@pytest.mark.parametrize(
    "i",
    [
        (
            Timestamp("2011-01-15 12:50:28.502376"),
            Timestamp("2011-01-20 12:50:28.593448"),
        ),
        (24650000000000001, 24650000000000002),
    ],
)
def test_groupby_non_arithmetic_agg_int_like_precision(i):
    # see gh-6620, gh-9311
    df = DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}])

    grp_exp = {
        "first": {"expected": i[0]},
        "last": {"expected": i[1]},
        "min": {"expected": i[0]},
        "max": {"expected": i[1]},
        "nth": {"expected": i[1], "args": [1]},
        "count": {"expected": 2},
    }

    for method, data in grp_exp.items():
        if "args" not in data:
            data["args"] = []

        grouped = df.groupby("a")
        res = getattr(grouped, method)(*data["args"])

        assert res.iloc[0].b == data["expected"]


@pytest.mark.parametrize(
    "func, values",
    [
        ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}),
        ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
    ],
)
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_idxmin_idxmax_returns_int_types(func, values):
    # GH 25444
    df = DataFrame(
        {
            "name": ["A", "A", "B", "B"],
            "c_int": [1, 2, 3, 4],
            "c_float": [4.02, 3.03, 2.04, 1.05],
            "c_date": ["2019", "2018", "2016", "2017"],
        }
    )
    df["c_date"] = pd.to_datetime(df["c_date"])
    df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific")
    df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0]
    df["c_period"] = df["c_date"].dt.to_period("W")
    df["c_Integer"] = df["c_int"].astype("Int64")
    df["c_Floating"] = df["c_float"].astype("Float64")

    result = getattr(df.groupby("name"), func)()

    expected = DataFrame(values, index=Index(["A", "B"], name="name"))
    expected["c_date_tz"] = expected["c_date"]
    expected["c_timedelta"] = expected["c_date"]
    expected["c_period"] = expected["c_date"]
    expected["c_Integer"] = expected["c_int"]
    expected["c_Floating"] = expected["c_float"]

    tm.assert_frame_equal(result, expected)


def test_idxmin_idxmax_axis1():
    df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
    df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]

    gb = df.groupby("A")

    res = gb.idxmax(axis=1)

    alt = df.iloc[:, 1:].idxmax(axis=1)
    indexer = res.index.get_level_values(1)

    tm.assert_series_equal(alt[indexer], res.droplevel("A"))

    df["E"] = date_range("2016-01-01", periods=10)
    gb2 = df.groupby("A")

    msg = "reduction operation 'argmax' not allowed for this dtype"
    with pytest.raises(TypeError, match=msg):
        gb2.idxmax(axis=1)


def test_groupby_cumprod():
    # GH 4095
    df = DataFrame({"key": ["b"] * 10, "value": 2})

    actual = df.groupby("key")["value"].cumprod()
    expected = df.groupby("key")["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)

    df = DataFrame({"key": ["b"] * 100, "value": 2})
    actual = df.groupby("key")["value"].cumprod()
    # if overflows, groupby product casts to float
    # while numpy passes back invalid values
    df["value"] = df["value"].astype(float)
    expected = df.groupby("key")["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)


def scipy_sem(*args, **kwargs):
    from scipy.stats import sem

    return sem(*args, ddof=1, **kwargs)


@pytest.mark.parametrize(
    "op,targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("std", np.std),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
        ("count", np.size),
        pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy),
    ],
)
def test_ops_general(op, targop):
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    result = getattr(df.groupby(labels), op)()
    expected = df.groupby(labels).agg(targop)

    tm.assert_frame_equal(result, expected)


def test_max_nan_bug():
    raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""

    df = pd.read_csv(StringIO(raw), parse_dates=[0])
    gb = df.groupby("Date")
    r = gb[["File"]].max()
    e = gb["File"].max().to_frame()
    tm.assert_frame_equal(r, e)
    assert not r["File"].isna().any()


def test_nlargest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nlargest(3)
    e = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(gb.nlargest(3, keep="last"), e)


def test_nlargest_mi_grouper():
    # see gh-21411
    npr = np.random.RandomState(123456789)

    dts = date_range("20180101", periods=10)
    iterables = [dts, ["one", "two"]]

    idx = MultiIndex.from_product(iterables, names=["first", "second"])
    s = Series(npr.randn(20), index=idx)

    result = s.groupby("first").nlargest(1)

    exp_idx = MultiIndex.from_tuples(
        [
            (dts[0], dts[0], "one"),
            (dts[1], dts[1], "one"),
            (dts[2], dts[2], "one"),
            (dts[3], dts[3], "two"),
            (dts[4], dts[4], "one"),
            (dts[5], dts[5], "one"),
            (dts[6], dts[6], "one"),
            (dts[7], dts[7], "one"),
            (dts[8], dts[8], "two"),
            (dts[9], dts[9], "one"),
        ],
        names=["first", "first", "second"],
    )
    exp_values = [
        2.2129019979039612,
        1.8417114045748335,
        0.858963679564603,
        1.3759151378258088,
        0.9430284594687134,
        0.5296914208183142,
        0.8318045593815487,
        -0.8476703342910327,
        0.3804446884133735,
        -0.8028845810770998,
    ]

    expected = Series(exp_values, index=exp_idx)
    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)


def test_nsmallest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)


@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, method):
    # GH 15272, GH 16345, GH 29129
    # Test nlargest/smallest when it results in a noop,
    # i.e. input is sorted and group size <= n
    if method == "nlargest":
        data = list(reversed(data))
    ser = Series(data, name="a")
    result = getattr(ser.groupby(groups), method)(n=2)
    expected = Series(data, index=MultiIndex.from_arrays([groups, ser.index]), name="a")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
    # see gh-12811
    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    g = df.groupby("A")

    msg = "numpy operations are not valid with groupby"

    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(1, 2, 3)
    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(foo=1)


def test_cummin(dtypes_for_minmax):
    dtype = dtypes_for_minmax[0]
    min_val = dtypes_for_minmax[1]

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_mins}).astype(dtype)
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ min value for dtype
    df.loc[[2, 6], "B"] = min_val
    df.loc[[1, 5], "B"] = min_val + 1
    expected.loc[[2, 3, 6, 7], "B"] = min_val
    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected, check_exact=True)
    expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected, check_exact=True)

    # Test nan in some values
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
    result = base_df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummin()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
    result = df.groupby("a").b.cummin()
    expected = Series([1, 2, 1], name="b")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
    base_df["B"] = base_df["B"].astype(dtype)
    grouped = base_df.groupby("A")

    expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)
    result = getattr(grouped, method)()
    tm.assert_frame_equal(expected, result)
    result = getattr(grouped["B"], method)().to_frame()
    tm.assert_frame_equal(expected, result)


def test_cummax(dtypes_for_minmax):
    dtype = dtypes_for_minmax[0]
    max_val = dtypes_for_minmax[2]

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_maxs}).astype(dtype)
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ max value for dtype
    df.loc[[2, 6], "B"] = max_val
    expected.loc[[2, 3, 6, 7], "B"] = max_val
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test nan in some values
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
    result = base_df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummax()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
    result = df.groupby("a").b.cummax()
    expected = Series([2, 1, 2], name="b")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
    "groups,expected_data",
    [
        ([1, 1, 1], [1, None, None]),
        ([1, 2, 3], [1, None, 2]),
        ([1, 3, 3], [1, None, None]),
    ],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
    # GH-34047
    df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
    gb = df.groupby(groups)["a"]

    result = getattr(gb, method)(skipna=False)
    expected = Series(expected_data, dtype=dtype, name="a")

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
    # Ensure missing value in "a" doesn't cause "b" to be nan-filled
    df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
    gb = df.groupby([1, 1, 1])[["a", "b"]]

    result = getattr(gb, method)(skipna=False)
    expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})

    tm.assert_frame_equal(result, expected)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
    data = [val, pd.NA]
    df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
    grouped = df.groupby("grp")

    result = grouped.transform(method)
    expected = DataFrame({"b": data}, dtype=dtype)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }
    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_increasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly increasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }
    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)


# describe
# --------------------------------


def test_apply_describe_bug(mframe):
    grouped = mframe.groupby(level="first")
    grouped.describe()  # it works!


def test_series_describe_multikey():
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
    tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
    tm.assert_series_equal(result["min"], grouped.min(), check_names=False)


def test_series_describe_single():
    ts = tm.makeTimeSeries()
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x.describe())
    expected = grouped.describe().stack()
    tm.assert_series_equal(result, expected)


def test_series_index_name(df):
    grouped = df.loc[:, ["C"]].groupby(df["A"])
    result = grouped.agg(lambda x: x.mean())
    assert result.index.name == "A"


def test_frame_describe_multikey(tsframe):
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    desc_groups = []
    for col in tsframe:
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = MultiIndex(
            levels=[[col], group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))],
        )
        group = DataFrame(group.values, columns=group_col, index=group.index)
        desc_groups.append(group)
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    tm.assert_frame_equal(result, expected)


def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={"k": "key"})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby("k").describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby("key").describe()


def test_frame_describe_unstacked_format():
    # GH 4792
    prices = {
        Timestamp("2011-01-06 10:59:05", tz=None): 24990,
        Timestamp("2011-01-06 12:43:33", tz=None): 25499,
        Timestamp("2011-01-06 12:54:09", tz=None): 25499,
    }
    volumes = {
        Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
        Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
        Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
    }
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})
    result = df.groupby("PRICE").VOLUME.describe()
    data = [
        df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
        df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
    ]
    expected = DataFrame(
        data,
        index=Index([24990, 25499], name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
def test_describe_with_duplicate_output_column_names(as_index):
    # GH 35314
    df = DataFrame(
        {
            "a": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a", "b", "b"],
        copy=False,
    )

    expected = (
        DataFrame.from_records(
            [
                ("a", "count", 3.0, 3.0),
                ("a", "mean", 88.0, 99.0),
                ("a", "std", 0.0, 0.0),
                ("a", "min", 88.0, 99.0),
                ("a", "25%", 88.0, 99.0),
                ("a", "50%", 88.0, 99.0),
                ("a", "75%", 88.0, 99.0),
                ("a", "max", 88.0, 99.0),
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
            ],
        )
        .set_index([0, 1])
        .T
    )
    expected.columns.names = [None, None]
    expected.index = Index([88, 99], name="a")

    if as_index:
        expected = expected.drop(columns=["a"], level=0)
    else:
        expected = expected.reset_index(drop=True)

    result = df.groupby("a", as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)


def test_groupby_mean_no_overflow():
    # Regression test for (#22487)
    df = DataFrame(
        {
            "user": ["A", "A", "A", "A", "A"],
            "connections": [4970, 4749, 4719, 4704, 18446744073699999744],
        }
    )
    assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840


@pytest.mark.parametrize(
    "values",
    [
        {
            "a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
            "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
        },
        {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
    ],
)
@pytest.mark.parametrize("function", ["mean", "median", "var"])
def test_apply_to_nullable_integer_returns_float(values, function):
    # https://github.com/pandas-dev/pandas/issues/32219
    output = 0.5 if function == "var" else 1.5
    arr = np.array([output] * 3, dtype=float)
    idx = Index([1, 2, 3], name="a", dtype="Int64")
    expected = DataFrame({"b": arr}, index=idx).astype("Float64")

    groups = DataFrame(values, dtype="Int64").groupby("a")

    result = getattr(groups, function)()
    tm.assert_frame_equal(result, expected)

    result = groups.agg(function)
    tm.assert_frame_equal(result, expected)

    result = groups.agg([function])
    expected.columns = MultiIndex.from_tuples([("b", function)])
    tm.assert_frame_equal(result, expected)


def test_groupby_sum_below_mincount_nullable_integer():
    # https://github.com/pandas-dev/pandas/issues/32861
    df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
    grouped = df.groupby("a")
    idx = Index([0, 1, 2], name="a", dtype="Int64")

    result = grouped["b"].sum(min_count=2)
    expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")
    tm.assert_series_equal(result, expected)

    result = grouped.sum(min_count=2)
    expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx)
    tm.assert_frame_equal(result, expected)


def test_mean_on_timedelta():
    # GH 17382
    df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5})
    result = df.groupby("cat")["time"].mean()
    expected = Series(
        pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat")
    )
    tm.assert_series_equal(result, expected)


def test_groupby_sum_timedelta_with_nat():
    # GH#42659
    df = DataFrame(
        {
            "a": [1, 1, 2, 2],
            "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT],
        }
    )
    td3 = pd.Timedelta(days=3)

    gb = df.groupby("a")

    res = gb.sum()
    expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a"))
    tm.assert_frame_equal(res, expected)

    res = gb["b"].sum()
    tm.assert_series_equal(res, expected["b"])

    res = gb["b"].sum(min_count=2)
    expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index)
    tm.assert_series_equal(res, expected)