A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/frame/methods/test_rank.py

498 lines
17 KiB

from datetime import (
datetime,
timedelta,
)
import numpy as np
import pytest
from pandas._libs.algos import (
Infinity,
NegInfinity,
)
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestRank:
s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
df = DataFrame({"A": s, "B": s})
results = {
"average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
"min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
"max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
"first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
"dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
}
@pytest.fixture(params=["average", "min", "max", "first", "dense"])
def method(self, request):
"""
Fixture for trying all rank methods
"""
return request.param
@td.skip_if_no_scipy
def test_rank(self, float_frame):
import scipy.stats # noqa:F401
from scipy.stats import rankdata
float_frame["A"][::2] = np.nan
float_frame["B"][::3] = np.nan
float_frame["C"][::4] = np.nan
float_frame["D"][::5] = np.nan
ranks0 = float_frame.rank()
ranks1 = float_frame.rank(1)
mask = np.isnan(float_frame.values)
fvals = float_frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp0[mask] = np.nan
exp1 = np.apply_along_axis(rankdata, 1, fvals)
exp1[mask] = np.nan
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# integers
df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
result = df.rank()
exp = df.astype(float).rank()
tm.assert_frame_equal(result, exp)
result = df.rank(1)
exp = df.astype(float).rank(1)
tm.assert_frame_equal(result, exp)
def test_rank2(self):
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
result = df.rank(1, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = df.rank(0) / 2.0
result = df.rank(0, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
# f7u12, this does not work without extensive workaround
data = [
[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
]
df = DataFrame(data)
# check the rank
expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
result = df.rank(1, numeric_only=False, ascending=True)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
result = df.rank(1, numeric_only=False, ascending=False)
tm.assert_frame_equal(result, expected)
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
tm.assert_frame_equal(df.rank(), exp)
def test_rank_does_not_mutate(self):
# GH#18521
# Check rank does not mutate DataFrame
df = DataFrame(np.random.randn(10, 3), dtype="float64")
expected = df.copy()
df.rank()
result = df
tm.assert_frame_equal(result, expected)
def test_rank_mixed_frame(self, float_string_frame):
float_string_frame["datetime"] = datetime.now()
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
with tm.assert_produces_warning(FutureWarning, match="numeric_only=None"):
float_string_frame.rank(numeric_only=None)
with tm.assert_produces_warning(FutureWarning, match="Dropping of nuisance"):
result = float_string_frame.rank(1)
expected = float_string_frame.rank(1, numeric_only=True)
tm.assert_frame_equal(result, expected)
@td.skip_if_no_scipy
def test_rank_na_option(self, float_frame):
import scipy.stats # noqa:F401
from scipy.stats import rankdata
float_frame["A"][::2] = np.nan
float_frame["B"][::3] = np.nan
float_frame["C"][::4] = np.nan
float_frame["D"][::5] = np.nan
# bottom
ranks0 = float_frame.rank(na_option="bottom")
ranks1 = float_frame.rank(1, na_option="bottom")
fvals = float_frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp1 = np.apply_along_axis(rankdata, 1, fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# top
ranks0 = float_frame.rank(na_option="top")
ranks1 = float_frame.rank(1, na_option="top")
fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
fval1 = float_frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fval0)
exp1 = np.apply_along_axis(rankdata, 1, fval1)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# bottom
ranks0 = float_frame.rank(na_option="top", ascending=False)
ranks1 = float_frame.rank(1, na_option="top", ascending=False)
fvals = float_frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# top
ranks0 = float_frame.rank(na_option="bottom", ascending=False)
ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)
fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
fval1 = float_frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
tm.assert_numpy_array_equal(ranks0.values, exp0)
tm.assert_numpy_array_equal(ranks1.values, exp1)
# bad values throw error
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
with pytest.raises(ValueError, match=msg):
float_frame.rank(na_option="bad", ascending=False)
# invalid type
with pytest.raises(ValueError, match=msg):
float_frame.rank(na_option=True, ascending=False)
def test_rank_axis(self):
# check if using axes' names gives the same result
df = DataFrame([[2, 1], [4, 3]])
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index"))
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns"))
@td.skip_if_no_scipy
def test_rank_methods_frame(self):
import scipy.stats # noqa:F401
from scipy.stats import rankdata
xs = np.random.randint(0, 21, (100, 26))
xs = (xs - 10.0) / 10.0
cols = [chr(ord("z") - i) for i in range(xs.shape[1])]
for vals in [xs, xs + 1e6, xs * 1e-6]:
df = DataFrame(vals, columns=cols)
for ax in [0, 1]:
for m in ["average", "min", "max", "first", "dense"]:
result = df.rank(axis=ax, method=m)
sprank = np.apply_along_axis(
rankdata, ax, vals, m if m != "first" else "ordinal"
)
sprank = sprank.astype(np.float64)
expected = DataFrame(sprank, columns=cols).astype("float64")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_rank_descending(self, method, dtype):
if "i" in dtype:
df = self.df.dropna().astype(dtype)
else:
df = self.df.astype(dtype)
res = df.rank(ascending=False)
expected = (df.max() - df).rank()
tm.assert_frame_equal(res, expected)
expected = (df.max() - df).rank(method=method)
if dtype != "O":
res2 = df.rank(method=method, ascending=False, numeric_only=True)
tm.assert_frame_equal(res2, expected)
res3 = df.rank(method=method, ascending=False, numeric_only=False)
tm.assert_frame_equal(res3, expected)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("dtype", [None, object])
def test_rank_2d_tie_methods(self, method, axis, dtype):
df = self.df
def _check2d(df, expected, method="average", axis=0):
exp_df = DataFrame({"A": expected, "B": expected})
if axis == 1:
df = df.T
exp_df = exp_df.T
result = df.rank(method=method, axis=axis)
tm.assert_frame_equal(result, exp_df)
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, self.results[method], method=method, axis=axis)
@pytest.mark.parametrize(
"method,exp",
[
("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]),
(
"min",
[
[1.0 / 3, 1.0, 1.0],
[1.0 / 3, 1.0 / 3, 2.0 / 3],
[1.0 / 3, 1.0 / 3, 1.0 / 3],
],
),
(
"max",
[[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]],
),
(
"average",
[[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]],
),
(
"first",
[
[1.0 / 3, 1.0, 1.0],
[2.0 / 3, 1.0 / 3, 2.0 / 3],
[3.0 / 3, 2.0 / 3, 1.0 / 3],
],
),
],
)
def test_rank_pct_true(self, method, exp):
# see gh-15630.
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
result = df.rank(method=method, pct=True)
expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)
@pytest.mark.single_cpu
@pytest.mark.high_memory
def test_pct_max_many_rows(self):
# GH 18271
df = DataFrame(
{"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)}
)
result = df.rank(pct=True).max()
assert (result == 1).all()
@pytest.mark.parametrize(
"contents,dtype",
[
(
[
-np.inf,
-50,
-1,
-1e-20,
-1e-25,
-1e-50,
0,
1e-40,
1e-20,
1e-10,
2,
40,
np.inf,
],
"float64",
),
(
[
-np.inf,
-50,
-1,
-1e-20,
-1e-25,
-1e-45,
0,
1e-40,
1e-20,
1e-10,
2,
40,
np.inf,
],
"float32",
),
([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
(
[
np.iinfo(np.int64).min,
-100,
0,
1,
9999,
100000,
1e10,
np.iinfo(np.int64).max,
],
"int64",
),
([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
(
[datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
"datetime64",
),
],
)
def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
dtype_na_map = {
"float64": np.nan,
"float32": np.nan,
"object": None,
"datetime64": np.datetime64("nat"),
}
# Insert nans at random positions if underlying dtype has missing
# value. Then adjust the expected order by adding nans accordingly
# This is for testing whether rank calculation is affected
# when values are interwined with nan values.
values = np.array(contents, dtype=dtype)
exp_order = np.array(range(len(values)), dtype="float64") + 1.0
if dtype in dtype_na_map:
na_value = dtype_na_map[dtype]
nan_indices = np.random.choice(range(len(values)), 5)
values = np.insert(values, nan_indices, na_value)
exp_order = np.insert(exp_order, nan_indices, np.nan)
# Shuffle the testing array and expected results in the same way
random_order = np.random.permutation(len(values))
obj = frame_or_series(values[random_order])
expected = frame_or_series(exp_order[random_order], dtype="float64")
result = obj.rank()
tm.assert_equal(result, expected)
def test_df_series_inf_nan_consistency(self):
# GH#32593
index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
df = DataFrame(
data={
"col1": col1,
"col2": col2,
},
index=index,
dtype="f8",
)
df_result = df.rank()
series_result = df.copy()
series_result["col1"] = df["col1"].rank()
series_result["col2"] = df["col2"].rank()
tm.assert_frame_equal(df_result, series_result)
def test_rank_both_inf(self):
# GH#32593
df = DataFrame({"a": [-np.inf, 0, np.inf]})
expected = DataFrame({"a": [1.0, 2.0, 3.0]})
result = df.rank()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_option,ascending,expected",
[
("top", True, [3.0, 1.0, 2.0]),
("top", False, [2.0, 1.0, 3.0]),
("bottom", True, [2.0, 3.0, 1.0]),
("bottom", False, [1.0, 3.0, 2.0]),
],
)
def test_rank_inf_nans_na_option(
self, frame_or_series, method, na_option, ascending, expected
):
obj = frame_or_series([np.inf, np.nan, -np.inf])
result = obj.rank(method=method, na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"na_option,ascending,expected",
[
("bottom", True, [1.0, 2.0, 4.0, 3.0]),
("bottom", False, [1.0, 2.0, 4.0, 3.0]),
("top", True, [2.0, 3.0, 1.0, 4.0]),
("top", False, [2.0, 3.0, 1.0, 4.0]),
],
)
def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
obj = frame_or_series(["foo", "foo", None, "foo"])
result = obj.rank(method="first", na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"data,expected",
[
({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})),
({"a": [1, 2, "a"]}, DataFrame(index=range(3))),
],
)
def test_rank_mixed_axis_zero(self, data, expected):
df = DataFrame(data)
msg = "Dropping of nuisance columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.rank()
tm.assert_frame_equal(result, expected)