A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/reductions/test_reductions.py

1533 lines
50 KiB

from datetime import (
datetime,
timedelta,
)
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
DatetimeIndex,
Index,
NaT,
Period,
PeriodIndex,
RangeIndex,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
date_range,
isna,
timedelta_range,
to_timedelta,
)
import pandas._testing as tm
from pandas.core import nanops
def get_objs():
indexes = [
tm.makeBoolIndex(10, name="a"),
tm.makeIntIndex(10, name="a"),
tm.makeFloatIndex(10, name="a"),
tm.makeDateIndex(10, name="a"),
tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"),
tm.makePeriodIndex(10, name="a"),
tm.makeStringIndex(10, name="a"),
tm.makeUnicodeIndex(10, name="a"),
]
arr = np.random.randn(10)
series = [Series(arr, index=idx, name="a") for idx in indexes]
objs = indexes + series
return objs
objs = get_objs()
class TestReductions:
@pytest.mark.parametrize("opname", ["max", "min"])
@pytest.mark.parametrize("obj", objs)
def test_ops(self, opname, obj):
result = getattr(obj, opname)()
if not isinstance(obj, PeriodIndex):
expected = getattr(obj.values, opname)()
else:
expected = Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq)
if getattr(obj, "tz", None) is not None:
# We need to de-localize before comparing to the numpy-produced result
expected = expected.astype("M8[ns]").astype("int64")
assert result.value == expected
else:
assert result == expected
@pytest.mark.parametrize("opname", ["max", "min"])
@pytest.mark.parametrize(
"dtype, val",
[
("object", 2.0),
("float64", 2.0),
("datetime64[ns]", datetime(2011, 11, 1)),
("Int64", 2),
("boolean", True),
],
)
def test_nanminmax(self, opname, dtype, val, index_or_series):
# GH#7261
klass = index_or_series
def check_missing(res):
if dtype == "datetime64[ns]":
return res is NaT
elif dtype in ["Int64", "boolean"]:
return res is pd.NA
else:
return isna(res)
obj = klass([None], dtype=dtype)
assert check_missing(getattr(obj, opname)())
assert check_missing(getattr(obj, opname)(skipna=False))
obj = klass([], dtype=dtype)
assert check_missing(getattr(obj, opname)())
assert check_missing(getattr(obj, opname)(skipna=False))
if dtype == "object":
# generic test with object only works for empty / all NaN
return
obj = klass([None, val], dtype=dtype)
assert getattr(obj, opname)() == val
assert check_missing(getattr(obj, opname)(skipna=False))
obj = klass([None, val, None], dtype=dtype)
assert getattr(obj, opname)() == val
assert check_missing(getattr(obj, opname)(skipna=False))
@pytest.mark.parametrize("opname", ["max", "min"])
def test_nanargminmax(self, opname, index_or_series):
# GH#7261
klass = index_or_series
arg_op = "arg" + opname if klass is Index else "idx" + opname
obj = klass([NaT, datetime(2011, 11, 1)])
assert getattr(obj, arg_op)() == 1
result = getattr(obj, arg_op)(skipna=False)
if klass is Series:
assert np.isnan(result)
else:
assert result == -1
obj = klass([NaT, datetime(2011, 11, 1), NaT])
# check DatetimeIndex non-monotonic path
assert getattr(obj, arg_op)() == 1
result = getattr(obj, arg_op)(skipna=False)
if klass is Series:
assert np.isnan(result)
else:
assert result == -1
@pytest.mark.parametrize("opname", ["max", "min"])
@pytest.mark.parametrize("dtype", ["M8[ns]", "datetime64[ns, UTC]"])
def test_nanops_empty_object(self, opname, index_or_series, dtype):
klass = index_or_series
arg_op = "arg" + opname if klass is Index else "idx" + opname
obj = klass([], dtype=dtype)
assert getattr(obj, opname)() is NaT
assert getattr(obj, opname)(skipna=False) is NaT
with pytest.raises(ValueError, match="empty sequence"):
getattr(obj, arg_op)()
with pytest.raises(ValueError, match="empty sequence"):
getattr(obj, arg_op)(skipna=False)
def test_argminmax(self):
obj = Index(np.arange(5, dtype="int64"))
assert obj.argmin() == 0
assert obj.argmax() == 4
obj = Index([np.nan, 1, np.nan, 2])
assert obj.argmin() == 1
assert obj.argmax() == 3
assert obj.argmin(skipna=False) == -1
assert obj.argmax(skipna=False) == -1
obj = Index([np.nan])
assert obj.argmin() == -1
assert obj.argmax() == -1
assert obj.argmin(skipna=False) == -1
assert obj.argmax(skipna=False) == -1
obj = Index([NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), NaT])
assert obj.argmin() == 1
assert obj.argmax() == 2
assert obj.argmin(skipna=False) == -1
assert obj.argmax(skipna=False) == -1
obj = Index([NaT])
assert obj.argmin() == -1
assert obj.argmax() == -1
assert obj.argmin(skipna=False) == -1
assert obj.argmax(skipna=False) == -1
@pytest.mark.parametrize("op, expected_col", [["max", "a"], ["min", "b"]])
def test_same_tz_min_max_axis_1(self, op, expected_col):
# GH 10390
df = DataFrame(
date_range("2016-01-01 00:00:00", periods=3, tz="UTC"), columns=["a"]
)
df["b"] = df.a.subtract(Timedelta(seconds=3600))
result = getattr(df, op)(axis=1)
expected = df[expected_col].rename(None)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("func", ["maximum", "minimum"])
def test_numpy_reduction_with_tz_aware_dtype(self, tz_aware_fixture, func):
# GH 15552
tz = tz_aware_fixture
arg = pd.to_datetime(["2019"]).tz_localize(tz)
expected = Series(arg)
result = getattr(np, func)(expected, expected)
tm.assert_series_equal(result, expected)
class TestIndexReductions:
# Note: the name TestIndexReductions indicates these tests
# were moved from a Index-specific test file, _not_ that these tests are
# intended long-term to be Index-specific
@pytest.mark.parametrize(
"start,stop,step",
[
(0, 400, 3),
(500, 0, -6),
(-(10**6), 10**6, 4),
(10**6, -(10**6), -4),
(0, 10, 20),
],
)
def test_max_min_range(self, start, stop, step):
# GH#17607
idx = RangeIndex(start, stop, step)
expected = idx._values.max()
result = idx.max()
assert result == expected
# skipna should be irrelevant since RangeIndex should never have NAs
result2 = idx.max(skipna=False)
assert result2 == expected
expected = idx._values.min()
result = idx.min()
assert result == expected
# skipna should be irrelevant since RangeIndex should never have NAs
result2 = idx.min(skipna=False)
assert result2 == expected
# empty
idx = RangeIndex(start, stop, -step)
assert isna(idx.max())
assert isna(idx.min())
def test_minmax_timedelta64(self):
# monotonic
idx1 = TimedeltaIndex(["1 days", "2 days", "3 days"])
assert idx1.is_monotonic
# non-monotonic
idx2 = TimedeltaIndex(["1 days", np.nan, "3 days", "NaT"])
assert not idx2.is_monotonic
for idx in [idx1, idx2]:
assert idx.min() == Timedelta("1 days")
assert idx.max() == Timedelta("3 days")
assert idx.argmin() == 0
assert idx.argmax() == 2
@pytest.mark.parametrize("op", ["min", "max"])
def test_minmax_timedelta_empty_or_na(self, op):
# Return NaT
obj = TimedeltaIndex([])
assert getattr(obj, op)() is NaT
obj = TimedeltaIndex([NaT])
assert getattr(obj, op)() is NaT
obj = TimedeltaIndex([NaT, NaT, NaT])
assert getattr(obj, op)() is NaT
def test_numpy_minmax_timedelta64(self):
td = timedelta_range("16815 days", "16820 days", freq="D")
assert np.min(td) == Timedelta("16815 days")
assert np.max(td) == Timedelta("16820 days")
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.min(td, out=0)
with pytest.raises(ValueError, match=errmsg):
np.max(td, out=0)
assert np.argmin(td) == 0
assert np.argmax(td) == 5
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.argmin(td, out=0)
with pytest.raises(ValueError, match=errmsg):
np.argmax(td, out=0)
def test_timedelta_ops(self):
# GH#4984
# make sure ops return Timedelta
s = Series(
[Timestamp("20130101") + timedelta(seconds=i * i) for i in range(10)]
)
td = s.diff()
result = td.mean()
expected = to_timedelta(timedelta(seconds=9))
assert result == expected
result = td.to_frame().mean()
assert result[0] == expected
result = td.quantile(0.1)
expected = Timedelta(np.timedelta64(2600, "ms"))
assert result == expected
result = td.median()
expected = to_timedelta("00:00:09")
assert result == expected
result = td.to_frame().median()
assert result[0] == expected
# GH#6462
# consistency in returned values for sum
result = td.sum()
expected = to_timedelta("00:01:21")
assert result == expected
result = td.to_frame().sum()
assert result[0] == expected
# std
result = td.std()
expected = to_timedelta(Series(td.dropna().values).std())
assert result == expected
result = td.to_frame().std()
assert result[0] == expected
# GH#10040
# make sure NaT is properly handled by median()
s = Series([Timestamp("2015-02-03"), Timestamp("2015-02-07")])
assert s.diff().median() == timedelta(days=4)
s = Series(
[Timestamp("2015-02-03"), Timestamp("2015-02-07"), Timestamp("2015-02-15")]
)
assert s.diff().median() == timedelta(days=6)
@pytest.mark.parametrize("opname", ["skew", "kurt", "sem", "prod", "var"])
def test_invalid_td64_reductions(self, opname):
s = Series(
[Timestamp("20130101") + timedelta(seconds=i * i) for i in range(10)]
)
td = s.diff()
msg = "|".join(
[
f"reduction operation '{opname}' not allowed for this dtype",
rf"cannot perform {opname} with type timedelta64\[ns\]",
f"does not support reduction '{opname}'",
]
)
with pytest.raises(TypeError, match=msg):
getattr(td, opname)()
with pytest.raises(TypeError, match=msg):
getattr(td.to_frame(), opname)(numeric_only=False)
def test_minmax_tz(self, tz_naive_fixture):
tz = tz_naive_fixture
# monotonic
idx1 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz=tz)
assert idx1.is_monotonic
# non-monotonic
idx2 = DatetimeIndex(
["2011-01-01", NaT, "2011-01-03", "2011-01-02", NaT], tz=tz
)
assert not idx2.is_monotonic
for idx in [idx1, idx2]:
assert idx.min() == Timestamp("2011-01-01", tz=tz)
assert idx.max() == Timestamp("2011-01-03", tz=tz)
assert idx.argmin() == 0
assert idx.argmax() == 2
@pytest.mark.parametrize("op", ["min", "max"])
def test_minmax_nat_datetime64(self, op):
# Return NaT
obj = DatetimeIndex([])
assert isna(getattr(obj, op)())
obj = DatetimeIndex([NaT])
assert isna(getattr(obj, op)())
obj = DatetimeIndex([NaT, NaT, NaT])
assert isna(getattr(obj, op)())
def test_numpy_minmax_integer(self):
# GH#26125
idx = Index([1, 2, 3])
expected = idx.values.max()
result = np.max(idx)
assert result == expected
expected = idx.values.min()
result = np.min(idx)
assert result == expected
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.min(idx, out=0)
with pytest.raises(ValueError, match=errmsg):
np.max(idx, out=0)
expected = idx.values.argmax()
result = np.argmax(idx)
assert result == expected
expected = idx.values.argmin()
result = np.argmin(idx)
assert result == expected
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.argmin(idx, out=0)
with pytest.raises(ValueError, match=errmsg):
np.argmax(idx, out=0)
def test_numpy_minmax_range(self):
# GH#26125
idx = RangeIndex(0, 10, 3)
result = np.max(idx)
assert result == 9
result = np.min(idx)
assert result == 0
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.min(idx, out=0)
with pytest.raises(ValueError, match=errmsg):
np.max(idx, out=0)
# No need to test again argmax/argmin compat since the implementation
# is the same as basic integer index
def test_numpy_minmax_datetime64(self):
dr = date_range(start="2016-01-15", end="2016-01-20")
assert np.min(dr) == Timestamp("2016-01-15 00:00:00")
assert np.max(dr) == Timestamp("2016-01-20 00:00:00")
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.min(dr, out=0)
with pytest.raises(ValueError, match=errmsg):
np.max(dr, out=0)
assert np.argmin(dr) == 0
assert np.argmax(dr) == 5
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.argmin(dr, out=0)
with pytest.raises(ValueError, match=errmsg):
np.argmax(dr, out=0)
def test_minmax_period(self):
# monotonic
idx1 = PeriodIndex([NaT, "2011-01-01", "2011-01-02", "2011-01-03"], freq="D")
assert not idx1.is_monotonic
assert idx1[1:].is_monotonic
# non-monotonic
idx2 = PeriodIndex(
["2011-01-01", NaT, "2011-01-03", "2011-01-02", NaT], freq="D"
)
assert not idx2.is_monotonic
for idx in [idx1, idx2]:
assert idx.min() == Period("2011-01-01", freq="D")
assert idx.max() == Period("2011-01-03", freq="D")
assert idx1.argmin() == 1
assert idx2.argmin() == 0
assert idx1.argmax() == 3
assert idx2.argmax() == 2
for op in ["min", "max"]:
# Return NaT
obj = PeriodIndex([], freq="M")
result = getattr(obj, op)()
assert result is NaT
obj = PeriodIndex([NaT], freq="M")
result = getattr(obj, op)()
assert result is NaT
obj = PeriodIndex([NaT, NaT, NaT], freq="M")
result = getattr(obj, op)()
assert result is NaT
def test_numpy_minmax_period(self):
pr = pd.period_range(start="2016-01-15", end="2016-01-20")
assert np.min(pr) == Period("2016-01-15", freq="D")
assert np.max(pr) == Period("2016-01-20", freq="D")
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.min(pr, out=0)
with pytest.raises(ValueError, match=errmsg):
np.max(pr, out=0)
assert np.argmin(pr) == 0
assert np.argmax(pr) == 5
errmsg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=errmsg):
np.argmin(pr, out=0)
with pytest.raises(ValueError, match=errmsg):
np.argmax(pr, out=0)
def test_min_max_categorical(self):
ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
msg = (
r"Categorical is not ordered for operation min\n"
r"you can use .as_ordered\(\) to change the Categorical to an ordered one\n"
)
with pytest.raises(TypeError, match=msg):
ci.min()
msg = (
r"Categorical is not ordered for operation max\n"
r"you can use .as_ordered\(\) to change the Categorical to an ordered one\n"
)
with pytest.raises(TypeError, match=msg):
ci.max()
ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=True)
assert ci.min() == "c"
assert ci.max() == "b"
class TestSeriesReductions:
# Note: the name TestSeriesReductions indicates these tests
# were moved from a series-specific test file, _not_ that these tests are
# intended long-term to be series-specific
def test_sum_inf(self):
s = Series(np.random.randn(10))
s2 = s.copy()
s[5:8] = np.inf
s2[5:8] = np.nan
assert np.isinf(s.sum())
arr = np.random.randn(100, 100).astype("f4")
arr[:, 2] = np.inf
with pd.option_context("mode.use_inf_as_na", True):
tm.assert_almost_equal(s.sum(), s2.sum())
res = nanops.nansum(arr, axis=1)
assert np.isinf(res).all()
@pytest.mark.parametrize(
"dtype", ["float64", "Float32", "Int64", "boolean", "object"]
)
@pytest.mark.parametrize("use_bottleneck", [True, False])
@pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
def test_empty(self, method, unit, use_bottleneck, dtype):
with pd.option_context("use_bottleneck", use_bottleneck):
# GH#9422 / GH#18921
# Entirely empty
s = Series([], dtype=dtype)
# NA by default
result = getattr(s, method)()
assert result == unit
# Explicit
result = getattr(s, method)(min_count=0)
assert result == unit
result = getattr(s, method)(min_count=1)
assert isna(result)
# Skipna, default
result = getattr(s, method)(skipna=True)
result == unit
# Skipna, explicit
result = getattr(s, method)(skipna=True, min_count=0)
assert result == unit
result = getattr(s, method)(skipna=True, min_count=1)
assert isna(result)
result = getattr(s, method)(skipna=False, min_count=0)
assert result == unit
result = getattr(s, method)(skipna=False, min_count=1)
assert isna(result)
# All-NA
s = Series([np.nan], dtype=dtype)
# NA by default
result = getattr(s, method)()
assert result == unit
# Explicit
result = getattr(s, method)(min_count=0)
assert result == unit
result = getattr(s, method)(min_count=1)
assert isna(result)
# Skipna, default
result = getattr(s, method)(skipna=True)
result == unit
# skipna, explicit
result = getattr(s, method)(skipna=True, min_count=0)
assert result == unit
result = getattr(s, method)(skipna=True, min_count=1)
assert isna(result)
# Mix of valid, empty
s = Series([np.nan, 1], dtype=dtype)
# Default
result = getattr(s, method)()
assert result == 1.0
# Explicit
result = getattr(s, method)(min_count=0)
assert result == 1.0
result = getattr(s, method)(min_count=1)
assert result == 1.0
# Skipna
result = getattr(s, method)(skipna=True)
assert result == 1.0
result = getattr(s, method)(skipna=True, min_count=0)
assert result == 1.0
# GH#844 (changed in GH#9422)
df = DataFrame(np.empty((10, 0)), dtype=dtype)
assert (getattr(df, method)(1) == unit).all()
s = Series([1], dtype=dtype)
result = getattr(s, method)(min_count=2)
assert isna(result)
result = getattr(s, method)(skipna=False, min_count=2)
assert isna(result)
s = Series([np.nan], dtype=dtype)
result = getattr(s, method)(min_count=2)
assert isna(result)
s = Series([np.nan, 1], dtype=dtype)
result = getattr(s, method)(min_count=2)
assert isna(result)
@pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
def test_empty_multi(self, method, unit):
s = Series(
[1, np.nan, np.nan, np.nan],
index=pd.MultiIndex.from_product([("a", "b"), (0, 1)]),
)
# 1 / 0 by default
with tm.assert_produces_warning(FutureWarning):
result = getattr(s, method)(level=0)
expected = Series([1, unit], index=["a", "b"])
tm.assert_series_equal(result, expected)
# min_count=0
with tm.assert_produces_warning(FutureWarning):
result = getattr(s, method)(level=0, min_count=0)
expected = Series([1, unit], index=["a", "b"])
tm.assert_series_equal(result, expected)
# min_count=1
with tm.assert_produces_warning(FutureWarning):
result = getattr(s, method)(level=0, min_count=1)
expected = Series([1, np.nan], index=["a", "b"])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["mean"])
@pytest.mark.parametrize("dtype", ["Float64", "Int64", "boolean"])
def test_ops_consistency_on_empty_nullable(self, method, dtype):
# GH#34814
# consistency for nullable dtypes on empty or ALL-NA mean
# empty series
eser = Series([], dtype=dtype)
result = getattr(eser, method)()
assert result is pd.NA
# ALL-NA series
nser = Series([np.nan], dtype=dtype)
result = getattr(nser, method)()
assert result is pd.NA
@pytest.mark.parametrize("method", ["mean", "median", "std", "var"])
def test_ops_consistency_on_empty(self, method):
# GH#7869
# consistency on empty
# float
result = getattr(Series(dtype=float), method)()
assert isna(result)
# timedelta64[ns]
tdser = Series([], dtype="m8[ns]")
if method == "var":
msg = "|".join(
[
"operation 'var' not allowed",
r"cannot perform var with type timedelta64\[ns\]",
"does not support reduction 'var'",
]
)
with pytest.raises(TypeError, match=msg):
getattr(tdser, method)()
else:
result = getattr(tdser, method)()
assert result is NaT
def test_nansum_buglet(self):
ser = Series([1.0, np.nan], index=[0, 1])
result = np.nansum(ser)
tm.assert_almost_equal(result, 1)
@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_sum_overflow(self, use_bottleneck):
with pd.option_context("use_bottleneck", use_bottleneck):
# GH#6915
# overflowing on the smaller int dtypes
for dtype in ["int32", "int64"]:
v = np.arange(5000000, dtype=dtype)
s = Series(v)
result = s.sum(skipna=False)
assert int(result) == v.sum(dtype="int64")
result = s.min(skipna=False)
assert int(result) == 0
result = s.max(skipna=False)
assert int(result) == v[-1]
for dtype in ["float32", "float64"]:
v = np.arange(5000000, dtype=dtype)
s = Series(v)
result = s.sum(skipna=False)
assert result == v.sum(dtype=dtype)
result = s.min(skipna=False)
assert np.allclose(float(result), 0.0)
result = s.max(skipna=False)
assert np.allclose(float(result), v[-1])
def test_empty_timeseries_reductions_return_nat(self):
# covers GH#11245
for dtype in ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]"):
assert Series([], dtype=dtype).min() is NaT
assert Series([], dtype=dtype).max() is NaT
assert Series([], dtype=dtype).min(skipna=False) is NaT
assert Series([], dtype=dtype).max(skipna=False) is NaT
def test_numpy_argmin(self):
# See GH#16830
data = np.arange(1, 11)
s = Series(data, index=data)
result = np.argmin(s)
expected = np.argmin(data)
assert result == expected
result = s.argmin()
assert result == expected
msg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argmin(s, out=data)
def test_numpy_argmax(self):
# See GH#16830
data = np.arange(1, 11)
s = Series(data, index=data)
result = np.argmax(s)
expected = np.argmax(data)
assert result == expected
result = s.argmax()
assert result == expected
msg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argmax(s, out=data)
def test_idxmin(self):
# test idxmin
# _check_stat_op approach can not be used here because of isna check.
string_series = tm.makeStringSeries().rename("series")
# add some NaNs
string_series[5:15] = np.NaN
# skipna or no
assert string_series[string_series.idxmin()] == string_series.min()
assert isna(string_series.idxmin(skipna=False))
# no NaNs
nona = string_series.dropna()
assert nona[nona.idxmin()] == nona.min()
assert nona.index.values.tolist().index(nona.idxmin()) == nona.values.argmin()
# all NaNs
allna = string_series * np.nan
assert isna(allna.idxmin())
# datetime64[ns]
s = Series(date_range("20130102", periods=6))
result = s.idxmin()
assert result == 0
s[0] = np.nan
result = s.idxmin()
assert result == 1
def test_idxmax(self):
# test idxmax
# _check_stat_op approach can not be used here because of isna check.
string_series = tm.makeStringSeries().rename("series")
# add some NaNs
string_series[5:15] = np.NaN
# skipna or no
assert string_series[string_series.idxmax()] == string_series.max()
assert isna(string_series.idxmax(skipna=False))
# no NaNs
nona = string_series.dropna()
assert nona[nona.idxmax()] == nona.max()
assert nona.index.values.tolist().index(nona.idxmax()) == nona.values.argmax()
# all NaNs
allna = string_series * np.nan
assert isna(allna.idxmax())
from pandas import date_range
s = Series(date_range("20130102", periods=6))
result = s.idxmax()
assert result == 5
s[5] = np.nan
result = s.idxmax()
assert result == 4
# Float64Index
# GH#5914
s = Series([1, 2, 3], [1.1, 2.1, 3.1])
result = s.idxmax()
assert result == 3.1
result = s.idxmin()
assert result == 1.1
s = Series(s.index, s.index)
result = s.idxmax()
assert result == 3.1
result = s.idxmin()
assert result == 1.1
def test_all_any(self):
ts = tm.makeTimeSeries()
bool_series = ts > 0
assert not bool_series.all()
assert bool_series.any()
# Alternative types, with implicit 'object' dtype.
s = Series(["abc", True])
assert s.any()
@pytest.mark.parametrize("klass", [Index, Series])
def test_numpy_all_any(self, klass):
# GH#40180
idx = klass([0, 1, 2])
assert not np.all(idx)
assert np.any(idx)
idx = Index([1, 2, 3])
assert np.all(idx)
def test_all_any_params(self):
# Check skipna, with implicit 'object' dtype.
s1 = Series([np.nan, True])
s2 = Series([np.nan, False])
assert s1.all(skipna=False) # nan && True => True
assert s1.all(skipna=True)
assert s2.any(skipna=False)
assert not s2.any(skipna=True)
# Check level.
s = Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2])
with tm.assert_produces_warning(FutureWarning):
tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
with tm.assert_produces_warning(FutureWarning):
tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
msg = "Option bool_only is not implemented with option level"
with pytest.raises(NotImplementedError, match=msg):
with tm.assert_produces_warning(FutureWarning):
s.any(bool_only=True, level=0)
with pytest.raises(NotImplementedError, match=msg):
with tm.assert_produces_warning(FutureWarning):
s.all(bool_only=True, level=0)
# GH#38810 bool_only is not implemented alone.
msg = "Series.any does not implement bool_only"
with pytest.raises(NotImplementedError, match=msg):
s.any(bool_only=True)
msg = "Series.all does not implement bool_only."
with pytest.raises(NotImplementedError, match=msg):
s.all(bool_only=True)
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
def test_any_all_object_dtype(self, bool_agg_func, skipna):
# GH#12863
ser = Series(["a", "b", "c", "d", "e"], dtype=object)
result = getattr(ser, bool_agg_func)(skipna=skipna)
expected = True
assert result == expected
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize(
"data", [[False, None], [None, False], [False, np.nan], [np.nan, False]]
)
def test_any_all_object_dtype_missing(self, data, bool_agg_func):
# GH#27709
ser = Series(data)
result = getattr(ser, bool_agg_func)(skipna=False)
# None is treated is False, but np.nan is treated as True
expected = bool_agg_func == "any" and None not in data
assert result == expected
@pytest.mark.parametrize("dtype", ["boolean", "Int64", "UInt64", "Float64"])
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
# expected_data indexed as [[skipna=False/any, skipna=False/all],
# [skipna=True/any, skipna=True/all]]
"data,expected_data",
[
([0, 0, 0], [[False, False], [False, False]]),
([1, 1, 1], [[True, True], [True, True]]),
([pd.NA, pd.NA, pd.NA], [[pd.NA, pd.NA], [False, True]]),
([0, pd.NA, 0], [[pd.NA, False], [False, False]]),
([1, pd.NA, 1], [[True, pd.NA], [True, True]]),
([1, pd.NA, 0], [[True, False], [True, False]]),
],
)
def test_any_all_nullable_kleene_logic(
self, bool_agg_func, skipna, data, dtype, expected_data
):
# GH-37506, GH-41967
ser = Series(data, dtype=dtype)
expected = expected_data[skipna][bool_agg_func == "all"]
result = getattr(ser, bool_agg_func)(skipna=skipna)
assert (result is pd.NA and expected is pd.NA) or result == expected
@pytest.mark.parametrize(
"bool_agg_func,expected",
[("all", [False, True, False]), ("any", [False, True, True])],
)
def test_any_all_boolean_level(self, bool_agg_func, expected):
# GH#33449
ser = Series(
[False, False, True, True, False, True],
index=[0, 0, 1, 1, 2, 2],
dtype="boolean",
)
with tm.assert_produces_warning(FutureWarning):
result = getattr(ser, bool_agg_func)(level=0)
expected = Series(expected, dtype="boolean")
tm.assert_series_equal(result, expected)
def test_any_axis1_bool_only(self):
# GH#32432
df = DataFrame({"A": [True, False], "B": [1, 2]})
result = df.any(axis=1, bool_only=True)
expected = Series([True, False])
tm.assert_series_equal(result, expected)
def test_any_all_datetimelike(self):
# GH#38723 these may not be the desired long-term behavior (GH#34479)
# but in the interim should be internally consistent
dta = date_range("1995-01-02", periods=3)._data
ser = Series(dta)
df = DataFrame(ser)
assert dta.all()
assert dta.any()
assert ser.all()
assert ser.any()
assert df.any().all()
assert df.all().all()
dta = dta.tz_localize("UTC")
ser = Series(dta)
df = DataFrame(ser)
assert dta.all()
assert dta.any()
assert ser.all()
assert ser.any()
assert df.any().all()
assert df.all().all()
tda = dta - dta[0]
ser = Series(tda)
df = DataFrame(ser)
assert tda.any()
assert not tda.all()
assert ser.any()
assert not ser.all()
assert df.any().all()
assert not df.all().any()
def test_timedelta64_analytics(self):
# index min/max
dti = date_range("2012-1-1", periods=3, freq="D")
td = Series(dti) - Timestamp("20120101")
result = td.idxmin()
assert result == 0
result = td.idxmax()
assert result == 2
# GH#2982
# with NaT
td[0] = np.nan
result = td.idxmin()
assert result == 1
result = td.idxmax()
assert result == 2
# abs
s1 = Series(date_range("20120101", periods=3))
s2 = Series(date_range("20120102", periods=3))
expected = Series(s2 - s1)
result = np.abs(s1 - s2)
tm.assert_series_equal(result, expected)
result = (s1 - s2).abs()
tm.assert_series_equal(result, expected)
# max/min
result = td.max()
expected = Timedelta("2 days")
assert result == expected
result = td.min()
expected = Timedelta("1 days")
assert result == expected
@pytest.mark.parametrize(
"test_input,error_type",
[
(Series([], dtype="float64"), ValueError),
# For strings, or any Series with dtype 'O'
(Series(["foo", "bar", "baz"]), TypeError),
(Series([(1,), (2,)]), TypeError),
# For mixed data types
(Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]), TypeError),
],
)
def test_assert_idxminmax_raises(self, test_input, error_type):
"""
Cases where ``Series.argmax`` and related should raise an exception
"""
msg = (
"reduction operation 'argmin' not allowed for this dtype|"
"attempt to get argmin of an empty sequence"
)
with pytest.raises(error_type, match=msg):
test_input.idxmin()
with pytest.raises(error_type, match=msg):
test_input.idxmin(skipna=False)
msg = (
"reduction operation 'argmax' not allowed for this dtype|"
"attempt to get argmax of an empty sequence"
)
with pytest.raises(error_type, match=msg):
test_input.idxmax()
with pytest.raises(error_type, match=msg):
test_input.idxmax(skipna=False)
def test_idxminmax_with_inf(self):
# For numeric data with NA and Inf (GH #13595)
s = Series([0, -np.inf, np.inf, np.nan])
assert s.idxmin() == 1
assert np.isnan(s.idxmin(skipna=False))
assert s.idxmax() == 2
assert np.isnan(s.idxmax(skipna=False))
# Using old-style behavior that treats floating point nan, -inf, and
# +inf as missing
with pd.option_context("mode.use_inf_as_na", True):
assert s.idxmin() == 0
assert np.isnan(s.idxmin(skipna=False))
assert s.idxmax() == 0
np.isnan(s.idxmax(skipna=False))
class TestDatetime64SeriesReductions:
# Note: the name TestDatetime64SeriesReductions indicates these tests
# were moved from a series-specific test file, _not_ that these tests are
# intended long-term to be series-specific
@pytest.mark.parametrize(
"nat_ser",
[
Series([NaT, NaT]),
Series([NaT, Timedelta("nat")]),
Series([Timedelta("nat"), Timedelta("nat")]),
],
)
def test_minmax_nat_series(self, nat_ser):
# GH#23282
assert nat_ser.min() is NaT
assert nat_ser.max() is NaT
assert nat_ser.min(skipna=False) is NaT
assert nat_ser.max(skipna=False) is NaT
@pytest.mark.parametrize(
"nat_df",
[
DataFrame([NaT, NaT]),
DataFrame([NaT, Timedelta("nat")]),
DataFrame([Timedelta("nat"), Timedelta("nat")]),
],
)
def test_minmax_nat_dataframe(self, nat_df):
# GH#23282
assert nat_df.min()[0] is NaT
assert nat_df.max()[0] is NaT
assert nat_df.min(skipna=False)[0] is NaT
assert nat_df.max(skipna=False)[0] is NaT
def test_min_max(self):
rng = date_range("1/1/2000", "12/31/2000")
rng2 = rng.take(np.random.permutation(len(rng)))
the_min = rng2.min()
the_max = rng2.max()
assert isinstance(the_min, Timestamp)
assert isinstance(the_max, Timestamp)
assert the_min == rng[0]
assert the_max == rng[-1]
assert rng.min() == rng[0]
assert rng.max() == rng[-1]
def test_min_max_series(self):
rng = date_range("1/1/2000", periods=10, freq="4h")
lvls = ["A", "A", "A", "B", "B", "B", "C", "C", "C", "C"]
df = DataFrame({"TS": rng, "V": np.random.randn(len(rng)), "L": lvls})
result = df.TS.max()
exp = Timestamp(df.TS.iat[-1])
assert isinstance(result, Timestamp)
assert result == exp
result = df.TS.min()
exp = Timestamp(df.TS.iat[0])
assert isinstance(result, Timestamp)
assert result == exp
class TestCategoricalSeriesReductions:
# Note: the name TestCategoricalSeriesReductions indicates these tests
# were moved from a series-specific test file, _not_ that these tests are
# intended long-term to be series-specific
@pytest.mark.parametrize("function", ["min", "max"])
def test_min_max_unordered_raises(self, function):
# unordered cats have no min/max
cat = Series(Categorical(["a", "b", "c", "d"], ordered=False))
msg = f"Categorical is not ordered for operation {function}"
with pytest.raises(TypeError, match=msg):
getattr(cat, function)()
@pytest.mark.parametrize(
"values, categories",
[
(list("abc"), list("abc")),
(list("abc"), list("cba")),
(list("abc") + [np.nan], list("cba")),
([1, 2, 3], [3, 2, 1]),
([1, 2, 3, np.nan], [3, 2, 1]),
],
)
@pytest.mark.parametrize("function", ["min", "max"])
def test_min_max_ordered(self, values, categories, function):
# GH 25303
cat = Series(Categorical(values, categories=categories, ordered=True))
result = getattr(cat, function)(skipna=True)
expected = categories[0] if function == "min" else categories[2]
assert result == expected
@pytest.mark.parametrize("function", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_ordered_with_nan_only(self, function, skipna):
# https://github.com/pandas-dev/pandas/issues/33450
cat = Series(Categorical([np.nan], categories=[1, 2], ordered=True))
result = getattr(cat, function)(skipna=skipna)
assert result is np.nan
@pytest.mark.parametrize("function", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_skipna(self, function, skipna):
cat = Series(
Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True)
)
result = getattr(cat, function)(skipna=skipna)
if skipna is True:
expected = "b" if function == "min" else "a"
assert result == expected
else:
assert result is np.nan
class TestSeriesMode:
# Note: the name TestSeriesMode indicates these tests
# were moved from a series-specific test file, _not_ that these tests are
# intended long-term to be series-specific
@pytest.mark.parametrize(
"dropna, expected",
[(True, Series([], dtype=np.float64)), (False, Series([], dtype=np.float64))],
)
def test_mode_empty(self, dropna, expected):
s = Series([], dtype=np.float64)
result = s.mode(dropna)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dropna, data, expected",
[
(True, [1, 1, 1, 2], [1]),
(True, [1, 1, 1, 2, 3, 3, 3], [1, 3]),
(False, [1, 1, 1, 2], [1]),
(False, [1, 1, 1, 2, 3, 3, 3], [1, 3]),
],
)
@pytest.mark.parametrize(
"dt", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_mode_numerical(self, dropna, data, expected, dt):
s = Series(data, dtype=dt)
result = s.mode(dropna)
expected = Series(expected, dtype=dt)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dropna, expected", [(True, [1.0]), (False, [1, np.nan])])
def test_mode_numerical_nan(self, dropna, expected):
s = Series([1, 1, 2, np.nan, np.nan])
result = s.mode(dropna)
expected = Series(expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dropna, expected1, expected2, expected3",
[(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])],
)
def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
# Test string and object types.
data = ["a"] * 2 + ["b"] * 3
s = Series(data, dtype="c")
result = s.mode(dropna)
expected1 = Series(expected1, dtype="c")
tm.assert_series_equal(result, expected1)
data = ["foo", "bar", "bar", np.nan, np.nan, np.nan]
s = Series(data, dtype=object)
result = s.mode(dropna)
expected2 = Series(expected2, dtype=object)
tm.assert_series_equal(result, expected2)
data = ["foo", "bar", "bar", np.nan, np.nan, np.nan]
s = Series(data, dtype=object).astype(str)
result = s.mode(dropna)
expected3 = Series(expected3, dtype=str)
tm.assert_series_equal(result, expected3)
@pytest.mark.parametrize(
"dropna, expected1, expected2",
[(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])],
)
def test_mode_mixeddtype(self, dropna, expected1, expected2):
s = Series([1, "foo", "foo"])
result = s.mode(dropna)
expected = Series(expected1)
tm.assert_series_equal(result, expected)
s = Series([1, "foo", "foo", np.nan, np.nan, np.nan])
result = s.mode(dropna)
expected = Series(expected2, dtype=object)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dropna, expected1, expected2",
[
(
True,
["1900-05-03", "2011-01-03", "2013-01-02"],
["2011-01-03", "2013-01-02"],
),
(False, [np.nan], [np.nan, "2011-01-03", "2013-01-02"]),
],
)
def test_mode_datetime(self, dropna, expected1, expected2):
s = Series(
["2011-01-03", "2013-01-02", "1900-05-03", "nan", "nan"], dtype="M8[ns]"
)
result = s.mode(dropna)
expected1 = Series(expected1, dtype="M8[ns]")
tm.assert_series_equal(result, expected1)
s = Series(
[
"2011-01-03",
"2013-01-02",
"1900-05-03",
"2011-01-03",
"2013-01-02",
"nan",
"nan",
],
dtype="M8[ns]",
)
result = s.mode(dropna)
expected2 = Series(expected2, dtype="M8[ns]")
tm.assert_series_equal(result, expected2)
@pytest.mark.parametrize(
"dropna, expected1, expected2",
[
(True, ["-1 days", "0 days", "1 days"], ["2 min", "1 day"]),
(False, [np.nan], [np.nan, "2 min", "1 day"]),
],
)
def test_mode_timedelta(self, dropna, expected1, expected2):
# gh-5986: Test timedelta types.
s = Series(
["1 days", "-1 days", "0 days", "nan", "nan"], dtype="timedelta64[ns]"
)
result = s.mode(dropna)
expected1 = Series(expected1, dtype="timedelta64[ns]")
tm.assert_series_equal(result, expected1)
s = Series(
[
"1 day",
"1 day",
"-1 day",
"-1 day 2 min",
"2 min",
"2 min",
"nan",
"nan",
],
dtype="timedelta64[ns]",
)
result = s.mode(dropna)
expected2 = Series(expected2, dtype="timedelta64[ns]")
tm.assert_series_equal(result, expected2)
@pytest.mark.parametrize(
"dropna, expected1, expected2, expected3",
[
(
True,
Categorical([1, 2], categories=[1, 2]),
Categorical(["a"], categories=[1, "a"]),
Categorical([3, 1], categories=[3, 2, 1], ordered=True),
),
(
False,
Categorical([np.nan], categories=[1, 2]),
Categorical([np.nan, "a"], categories=[1, "a"]),
Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True),
),
],
)
def test_mode_category(self, dropna, expected1, expected2, expected3):
s = Series(Categorical([1, 2, np.nan, np.nan]))
result = s.mode(dropna)
expected1 = Series(expected1, dtype="category")
tm.assert_series_equal(result, expected1)
s = Series(Categorical([1, "a", "a", np.nan, np.nan]))
result = s.mode(dropna)
expected2 = Series(expected2, dtype="category")
tm.assert_series_equal(result, expected2)
s = Series(
Categorical(
[1, 1, 2, 3, 3, np.nan, np.nan], categories=[3, 2, 1], ordered=True
)
)
result = s.mode(dropna)
expected3 = Series(expected3, dtype="category")
tm.assert_series_equal(result, expected3)
@pytest.mark.parametrize(
"dropna, expected1, expected2",
[(True, [2**63], [1, 2**63]), (False, [2**63], [1, 2**63])],
)
def test_mode_intoverflow(self, dropna, expected1, expected2):
# Test for uint64 overflow.
s = Series([1, 2**63, 2**63], dtype=np.uint64)
result = s.mode(dropna)
expected1 = Series(expected1, dtype=np.uint64)
tm.assert_series_equal(result, expected1)
s = Series([1, 2**63], dtype=np.uint64)
result = s.mode(dropna)
expected2 = Series(expected2, dtype=np.uint64)
tm.assert_series_equal(result, expected2)
def test_mode_sortwarning(self):
# Check for the warning that is raised when the mode
# results cannot be sorted
expected = Series(["foo", np.nan])
s = Series([1, "foo", "foo", np.nan, np.nan])
with tm.assert_produces_warning(UserWarning):
result = s.mode(dropna=False)
result = result.sort_values().reset_index(drop=True)
tm.assert_series_equal(result, expected)
def test_mode_boolean_with_na(self):
# GH#42107
ser = Series([True, False, True, pd.NA], dtype="boolean")
result = ser.mode()
expected = Series({0: True}, dtype="boolean")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"array,expected,dtype",
[
(
[0, 1j, 1, 1, 1 + 1j, 1 + 2j],
Series([1], dtype=np.complex128),
np.complex128,
),
(
[0, 1j, 1, 1, 1 + 1j, 1 + 2j],
Series([1], dtype=np.complex64),
np.complex64,
),
(
[1 + 1j, 2j, 1 + 1j],
Series([1 + 1j], dtype=np.complex128),
np.complex128,
),
],
)
def test_single_mode_value_complex(self, array, expected, dtype):
result = Series(array, dtype=dtype).mode()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"array,expected,dtype",
[
(
# no modes
[0, 1j, 1, 1 + 1j, 1 + 2j],
Series([0j, 1j, 1 + 0j, 1 + 1j, 1 + 2j], dtype=np.complex128),
np.complex128,
),
(
[1 + 1j, 2j, 1 + 1j, 2j, 3],
Series([2j, 1 + 1j], dtype=np.complex64),
np.complex64,
),
],
)
def test_multimode_complex(self, array, expected, dtype):
# GH 17927
# mode tries to sort multimodal series.
# Complex numbers are sorted by their magnitude
result = Series(array, dtype=dtype).mode()
tm.assert_series_equal(result, expected)