A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/indexing/test_categorical.py

555 lines
19 KiB

import re
import numpy as np
import pytest
from pandas.core.dtypes.common import is_categorical_dtype
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Index,
Interval,
Series,
Timedelta,
Timestamp,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype as CDT
class TestCategoricalIndex:
def setup_method(self, method):
self.df = DataFrame(
{
"A": np.arange(6, dtype="int64"),
},
index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"),
)
self.df2 = DataFrame(
{
"A": np.arange(6, dtype="int64"),
},
index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
)
def test_loc_scalar(self):
dtype = CDT(list("cab"))
result = self.df.loc["a"]
bidx = Series(list("aaa"), name="B").astype(dtype)
assert bidx.dtype == dtype
expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
tm.assert_frame_equal(result, expected)
df = self.df.copy()
df.loc["a"] = 20
bidx2 = Series(list("aabbca"), name="B").astype(dtype)
assert bidx2.dtype == dtype
expected = DataFrame(
{
"A": [20, 20, 2, 3, 4, 20],
},
index=Index(bidx2),
)
tm.assert_frame_equal(df, expected)
# value not in the categories
with pytest.raises(KeyError, match=r"^'d'$"):
df.loc["d"]
df2 = df.copy()
expected = df2.copy()
expected.index = expected.index.astype(object)
expected.loc["d"] = 10
df2.loc["d"] = 10
tm.assert_frame_equal(df2, expected)
def test_loc_setitem_with_expansion_non_category(self):
# Setting-with-expansion with a new key "d" that is not among caegories
df = self.df
df.loc["a"] = 20
# Setting a new row on an existing column
df3 = df.copy()
df3.loc["d", "A"] = 10
bidx3 = Index(list("aabbcad"), name="B")
expected3 = DataFrame(
{
"A": [20, 20, 2, 3, 4, 20, 10.0],
},
index=Index(bidx3),
)
tm.assert_frame_equal(df3, expected3)
# Settig a new row _and_ new column
df4 = df.copy()
df4.loc["d", "C"] = 10
expected3 = DataFrame(
{
"A": [20, 20, 2, 3, 4, 20, np.nan],
"C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
},
index=Index(bidx3),
)
tm.assert_frame_equal(df4, expected3)
def test_loc_getitem_scalar_non_category(self):
with pytest.raises(KeyError, match="^1$"):
self.df.loc[1]
def test_slicing(self):
cat = Series(Categorical([1, 2, 3, 4]))
reverse = cat[::-1]
exp = np.array([4, 3, 2, 1], dtype=np.int64)
tm.assert_numpy_array_equal(reverse.__array__(), exp)
df = DataFrame({"value": (np.arange(100) + 1).astype("int64")})
df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10)
result = df.iloc[10]
tm.assert_series_equal(result, expected)
expected = DataFrame(
{"value": np.arange(11, 21).astype("int64")},
index=np.arange(10, 20).astype("int64"),
)
expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
result = df.iloc[10:20]
tm.assert_frame_equal(result, expected)
expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8)
result = df.loc[8]
tm.assert_series_equal(result, expected)
def test_slicing_and_getting_ops(self):
# systematically test the slicing operations:
# for all slicing ops:
# - returning a dataframe
# - returning a column
# - returning a row
# - returning a single value
cats = Categorical(
["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]
)
idx = Index(["h", "i", "j", "k", "l", "m", "n"])
values = [1, 2, 3, 4, 5, 6, 7]
df = DataFrame({"cats": cats, "values": values}, index=idx)
# the expected values
cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
idx2 = Index(["j", "k"])
values2 = [3, 4]
# 2:4,: | "j":"k",:
exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)
# :,"cats" | :,0
exp_col = Series(cats, index=idx, name="cats")
# "j",: | 2,:
exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j")
# "j","cats | 2,0
exp_val = "b"
# iloc
# frame
res_df = df.iloc[2:4, :]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"].dtype)
# row
res_row = df.iloc[2, :]
tm.assert_series_equal(res_row, exp_row)
assert isinstance(res_row["cats"], str)
# col
res_col = df.iloc[:, 0]
tm.assert_series_equal(res_col, exp_col)
assert is_categorical_dtype(res_col.dtype)
# single value
res_val = df.iloc[2, 0]
assert res_val == exp_val
# loc
# frame
res_df = df.loc["j":"k", :]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"].dtype)
# row
res_row = df.loc["j", :]
tm.assert_series_equal(res_row, exp_row)
assert isinstance(res_row["cats"], str)
# col
res_col = df.loc[:, "cats"]
tm.assert_series_equal(res_col, exp_col)
assert is_categorical_dtype(res_col.dtype)
# single value
res_val = df.loc["j", "cats"]
assert res_val == exp_val
# single value
res_val = df.loc["j", df.columns[0]]
assert res_val == exp_val
# iat
res_val = df.iat[2, 0]
assert res_val == exp_val
# at
res_val = df.at["j", "cats"]
assert res_val == exp_val
# fancy indexing
exp_fancy = df.iloc[[2]]
res_fancy = df[df["cats"] == "b"]
tm.assert_frame_equal(res_fancy, exp_fancy)
res_fancy = df[df["values"] == 3]
tm.assert_frame_equal(res_fancy, exp_fancy)
# get_value
res_val = df.at["j", "cats"]
assert res_val == exp_val
# i : int, slice, or sequence of integers
res_row = df.iloc[2]
tm.assert_series_equal(res_row, exp_row)
assert isinstance(res_row["cats"], str)
res_df = df.iloc[slice(2, 4)]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"].dtype)
res_df = df.iloc[[2, 3]]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"].dtype)
res_col = df.iloc[:, 0]
tm.assert_series_equal(res_col, exp_col)
assert is_categorical_dtype(res_col.dtype)
res_df = df.iloc[:, slice(0, 2)]
tm.assert_frame_equal(res_df, df)
assert is_categorical_dtype(res_df["cats"].dtype)
res_df = df.iloc[:, [0, 1]]
tm.assert_frame_equal(res_df, df)
assert is_categorical_dtype(res_df["cats"].dtype)
def test_slicing_doc_examples(self):
# GH 7918
cats = Categorical(
["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]
)
idx = Index(["h", "i", "j", "k", "l", "m", "n"])
values = [1, 2, 2, 2, 3, 4, 5]
df = DataFrame({"cats": cats, "values": values}, index=idx)
result = df.iloc[2:4, :]
expected = DataFrame(
{
"cats": Categorical(["b", "b"], categories=["a", "b", "c"]),
"values": [2, 2],
},
index=["j", "k"],
)
tm.assert_frame_equal(result, expected)
result = df.iloc[2:4, :].dtypes
expected = Series(["category", "int64"], ["cats", "values"])
tm.assert_series_equal(result, expected)
result = df.loc["h":"j", "cats"]
expected = Series(
Categorical(["a", "b", "b"], categories=["a", "b", "c"]),
index=["h", "i", "j"],
name="cats",
)
tm.assert_series_equal(result, expected)
result = df.loc["h":"j", df.columns[0:1]]
expected = DataFrame(
{"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])},
index=["h", "i", "j"],
)
tm.assert_frame_equal(result, expected)
def test_loc_getitem_listlike_labels(self):
# list of labels
result = self.df.loc[["c", "a"]]
expected = self.df.iloc[[4, 0, 1, 5]]
tm.assert_frame_equal(result, expected, check_index_type=True)
def test_loc_getitem_listlike_unused_category(self):
# GH#37901 a label that is in index.categories but not in index
# listlike containing an element in the categories but not in the values
with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
self.df2.loc[["a", "b", "e"]]
def test_loc_getitem_label_unused_category(self):
# element in the categories but not in the values
with pytest.raises(KeyError, match=r"^'e'$"):
self.df2.loc["e"]
def test_loc_getitem_non_category(self):
# not all labels in the categories
with pytest.raises(KeyError, match=re.escape("['d'] not in index")):
self.df2.loc[["a", "d"]]
def test_loc_setitem_expansion_label_unused_category(self):
# assigning with a label that is in the categories but not in the index
df = self.df2.copy()
df.loc["e"] = 20
result = df.loc[["a", "b", "e"]]
exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
tm.assert_frame_equal(result, expected)
def test_loc_listlike_dtypes(self):
# GH 11586
# unique categories and codes
index = CategoricalIndex(["a", "b", "c"])
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
# unique slice
res = df.loc[["a", "b"]]
exp_index = CategoricalIndex(["a", "b"], categories=index.categories)
exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp, check_index_type=True)
# duplicated slice
res = df.loc[["a", "a", "b"]]
exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories)
exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp, check_index_type=True)
with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
df.loc[["a", "x"]]
def test_loc_listlike_dtypes_duplicated_categories_and_codes(self):
# duplicated categories and codes
index = CategoricalIndex(["a", "b", "a"])
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
# unique slice
res = df.loc[["a", "b"]]
exp = DataFrame(
{"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", "a", "b"])
)
tm.assert_frame_equal(res, exp, check_index_type=True)
# duplicated slice
res = df.loc[["a", "a", "b"]]
exp = DataFrame(
{"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]},
index=CategoricalIndex(["a", "a", "a", "a", "b"]),
)
tm.assert_frame_equal(res, exp, check_index_type=True)
with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
df.loc[["a", "x"]]
def test_loc_listlike_dtypes_unused_category(self):
# contains unused category
index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
res = df.loc[["a", "b"]]
exp = DataFrame(
{"A": [1, 3, 2], "B": [5, 7, 6]},
index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")),
)
tm.assert_frame_equal(res, exp, check_index_type=True)
# duplicated slice
res = df.loc[["a", "a", "b"]]
exp = DataFrame(
{"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]},
index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")),
)
tm.assert_frame_equal(res, exp, check_index_type=True)
with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
df.loc[["a", "x"]]
def test_loc_getitem_listlike_unused_category_raises_keyerror(self):
# key that is an *unused* category raises
index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
with pytest.raises(KeyError, match="e"):
# For comparison, check the scalar behavior
df.loc["e"]
with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
df.loc[["a", "e"]]
def test_ix_categorical_index(self):
# GH 12531
df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ"))
cdf = df.copy()
cdf.index = CategoricalIndex(df.index)
cdf.columns = CategoricalIndex(df.columns)
expect = Series(df.loc["A", :], index=cdf.columns, name="A")
tm.assert_series_equal(cdf.loc["A", :], expect)
expect = Series(df.loc[:, "X"], index=cdf.index, name="X")
tm.assert_series_equal(cdf.loc[:, "X"], expect)
exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"])
expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index)
tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"])
expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
def test_ix_categorical_index_non_unique(self):
# non-unique
df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX"))
cdf = df.copy()
cdf.index = CategoricalIndex(df.index)
cdf.columns = CategoricalIndex(df.columns)
exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
tm.assert_frame_equal(cdf.loc["A", :], expect)
exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
tm.assert_frame_equal(cdf.loc[:, "X"], expect)
expect = DataFrame(
df.loc[["A", "B"], :],
columns=cdf.columns,
index=CategoricalIndex(list("AAB")),
)
tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
expect = DataFrame(
df.loc[:, ["X", "Y"]],
index=cdf.index,
columns=CategoricalIndex(list("XXY")),
)
tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
def test_loc_slice(self):
# GH9748
msg = (
"cannot do slice indexing on CategoricalIndex with these "
r"indexers \[1\] of type int"
)
with pytest.raises(TypeError, match=msg):
self.df.loc[1:5]
result = self.df.loc["b":"c"]
expected = self.df.iloc[[2, 3, 4]]
tm.assert_frame_equal(result, expected)
def test_loc_and_at_with_categorical_index(self):
# GH 20629
df = DataFrame(
[[1, 2], [3, 4], [5, 6]], index=CategoricalIndex(["A", "B", "C"])
)
s = df[0]
assert s.loc["A"] == 1
assert s.at["A"] == 1
assert df.loc["B", 1] == 4
assert df.at["B", 1] == 4
@pytest.mark.parametrize(
"idx_values",
[
# python types
[1, 2, 3],
[-1, -2, -3],
[1.5, 2.5, 3.5],
[-1.5, -2.5, -3.5],
# numpy int/uint
*(np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_NUMPY_DTYPES),
# numpy floats
*(np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in tm.FLOAT_NUMPY_DTYPES),
# numpy object
np.array([1, "b", 3.5], dtype=object),
# pandas scalars
[Interval(1, 4), Interval(4, 6), Interval(6, 9)],
[Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)],
[Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")],
# pandas Integer arrays
*(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES),
# other pandas arrays
pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array,
pd.date_range("2019-01-01", periods=3).array,
pd.timedelta_range(start="1d", periods=3).array,
],
)
def test_loc_getitem_with_non_string_categories(self, idx_values, ordered):
# GH-17569
cat_idx = CategoricalIndex(idx_values, ordered=ordered)
df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx)
sl = slice(idx_values[0], idx_values[1])
# scalar selection
result = df.loc[idx_values[0]]
expected = Series(["foo"], index=["A"], name=idx_values[0])
tm.assert_series_equal(result, expected)
# list selection
result = df.loc[idx_values[:2]]
expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
tm.assert_frame_equal(result, expected)
# slice selection
result = df.loc[sl]
expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
tm.assert_frame_equal(result, expected)
# scalar assignment
result = df.copy()
result.loc[idx_values[0]] = "qux"
expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx)
tm.assert_frame_equal(result, expected)
# list assignment
result = df.copy()
result.loc[idx_values[:2], "A"] = ["qux", "qux2"]
expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
tm.assert_frame_equal(result, expected)
# slice assignment
result = df.copy()
result.loc[sl, "A"] = ["qux", "qux2"]
expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
tm.assert_frame_equal(result, expected)
def test_getitem_categorical_with_nan(self):
# GH#41933
ci = CategoricalIndex(["A", "B", np.nan])
ser = Series(range(3), index=ci)
assert ser[np.nan] == 2
assert ser.loc[np.nan] == 2
df = DataFrame(ser)
assert df.loc[np.nan, 0] == 2
assert df.loc[np.nan][0] == 2