A PyQt GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be stored and compiles monthly and yearly summaries.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/reshape/test_pivot.py

2311 lines
77 KiB

from datetime import (
date,
datetime,
timedelta,
)
from itertools import product
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Grouper,
Index,
MultiIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.reshape import reshape as reshape_lib
from pandas.core.reshape.pivot import pivot_table
@pytest.fixture(params=[True, False])
def dropna(request):
    """Parametrized fixture yielding each value of the ``dropna`` flag."""
    flag = request.param
    return flag
@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))])
def interval_values(request, closed):
    """Return a Categorical of intervals built from the parametrized bounds.

    ``request.param`` is a ``(left, right)`` pair of bound sequences;
    ``closed`` is supplied by another fixture and is passed straight through
    to ``IntervalIndex.from_arrays``.
    """
    lower, upper = request.param
    intervals = pd.IntervalIndex.from_arrays(lower, upper, closed)
    return Categorical(intervals)
class TestPivotTable:
def setup_method(self, method):
    """Build the 11-row frame stored on ``self.data`` for the tests below.

    Columns "A", "B" and "C" hold fixed string labels; "D", "E" and "F" are
    filled with fresh random normal draws.
    """
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ]
    col_c = [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ]
    self.data = DataFrame(
        {
            "A": col_a,
            "B": col_b,
            "C": col_c,
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )
def test_pivot_table(self, observed):
index = ["A", "B"]
columns = "C"
table = pivot_table(
self.data, values="D", index=index, columns=columns, observed=observed
)
table2 = self.data.pivot_table(
values="D", index=index, columns=columns, observed=observed
)
tm.assert_frame_equal(table, table2)
# this works
pivot_table(self.data, values="D", index=index, observed=observed)
if len(index) > 1:
assert table.index.names == tuple(index)
else:
assert table.index.name == index[0]
if len(columns) > 1:
assert table.columns.names == columns
else:
assert table.columns.name == columns[0]
expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack()
tm.assert_frame_equal(table, expected)
def test_pivot_table_categorical_observed_equal(self, observed):
# issue #24923
df = DataFrame(
{"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]}
)
expected = df.pivot_table(
index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0
)
expected.index = expected.index.astype("category")
expected.columns = expected.columns.astype("category")
df.col1 = df.col1.astype("category")
df.col2 = df.col2.astype("category")
result = df.pivot_table(
index="col1",
values="col3",
columns="col2",
aggfunc=np.sum,
fill_value=0,
observed=observed,
)
tm.assert_frame_equal(result, expected)
def test_pivot_table_nocols(self):
df = DataFrame(
{"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]}
)
rs = df.pivot_table(columns="cols", aggfunc=np.sum)
xp = df.pivot_table(index="cols", aggfunc=np.sum).T
tm.assert_frame_equal(rs, xp)
rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"})
xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T
tm.assert_frame_equal(rs, xp)
def test_pivot_table_dropna(self):
df = DataFrame(
{
"amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000},
"customer": {0: "A", 1: "A", 2: "B", 3: "C"},
"month": {0: 201307, 1: 201309, 2: 201308, 3: 201310},
"product": {0: "a", 1: "b", 2: "c", 3: "d"},
"quantity": {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
}
)
pv_col = df.pivot_table(
"quantity", "month", ["customer", "product"], dropna=False
)
pv_ind = df.pivot_table(
"quantity", ["customer", "product"], "month", dropna=False
)
m = MultiIndex.from_tuples(
[
("A", "a"),
("A", "b"),
("A", "c"),
("A", "d"),
("B", "a"),
("B", "b"),
("B", "c"),
("B", "d"),
("C", "a"),
("C", "b"),
("C", "c"),
("C", "d"),
],
names=["customer", "product"],
)
tm.assert_index_equal(pv_col.columns, m)
tm.assert_index_equal(pv_ind.index, m)
def test_pivot_table_categorical(self):
    """Pivot on two ordered categoricals that each carry one unused category."""
    first = Categorical(
        ["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True
    )
    second = Categorical(
        ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True
    )
    frame = DataFrame({"A": first, "B": second, "values": [1, 2, 3, 4]})

    result = pivot_table(frame, values="values", index=["A", "B"], dropna=True)

    expected_index = MultiIndex.from_arrays([first, second], names=["A", "B"])
    expected = DataFrame({"values": [1, 2, 3, 4]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
def test_pivot_table_dropna_categoricals(self, dropna):
# GH 15193
categories = ["a", "b", "c", "d"]
df = DataFrame(
{
"A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
"B": [1, 2, 3, 1, 2, 3, 1, 2, 3],
"C": range(0, 9),
}
)
df["A"] = df["A"].astype(CDT(categories, ordered=False))
result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna)
expected_columns = Series(["a", "b", "c"], name="A")
expected_columns = expected_columns.astype(CDT(categories, ordered=False))
expected_index = Series([1, 2, 3], name="B")
expected = DataFrame(
[[0, 3, 6], [1, 4, 7], [2, 5, 8]],
index=expected_index,
columns=expected_columns,
)
if not dropna:
# add back the non observed to compare
expected = expected.reindex(columns=Categorical(categories)).astype("float")
tm.assert_frame_equal(result, expected)
def test_pivot_with_non_observable_dropna(self, dropna):
    """Pivot on an ordered categorical index, first with a NaN entry, then without."""
    # gh-21133
    with_nan = DataFrame(
        {
            "A": Categorical(
                [np.nan, "low", "high", "low", "high"],
                categories=["low", "high"],
                ordered=True,
            ),
            "B": [0.0, 1.0, 2.0, 3.0, 4.0],
        }
    )
    result = with_nan.pivot_table(index="A", values="B", dropna=dropna)
    two_level_index = Index(
        Categorical.from_codes([0, 1], categories=["low", "high"], ordered=True),
        name="A",
    )
    expected = DataFrame({"B": [2.0, 3.0]}, index=two_level_index)
    tm.assert_frame_equal(result, expected)

    # gh-21378
    three_levels = DataFrame(
        {
            "A": Categorical(
                ["left", "low", "high", "low", "high"],
                categories=["low", "high", "left"],
                ordered=True,
            ),
            "B": range(5),
        }
    )
    result = three_levels.pivot_table(index="A", values="B", dropna=dropna)
    three_level_index = Index(
        Categorical.from_codes(
            [0, 1, 2], categories=["low", "high", "left"], ordered=True
        ),
        name="A",
    )
    expected = DataFrame({"B": [2, 3, 0]}, index=three_level_index)
    if not dropna:
        expected["B"] = expected["B"].astype(float)
    tm.assert_frame_equal(result, expected)
def test_pivot_with_interval_index(self, interval_values, dropna):
# GH 25814
df = DataFrame({"A": interval_values, "B": 1})
result = df.pivot_table(index="A", values="B", dropna=dropna)
expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A"))
if not dropna:
expected = expected.astype(float)
tm.assert_frame_equal(result, expected)
def test_pivot_with_interval_index_margins(self):
# GH 25815
ordered_cat = pd.IntervalIndex.from_arrays([0, 0, 1, 1], [1, 1, 2, 2])
df = DataFrame(
{
"A": np.arange(4, 0, -1, dtype=np.intp),
"B": ["a", "b", "a", "b"],
"C": Categorical(ordered_cat, ordered=True).sort_values(
ascending=False
),
}
)
pivot_tab = pivot_table(
df, index="C", columns="B", values="A", aggfunc="sum", margins=True
)
result = pivot_tab["All"]
expected = Series(
[3, 7, 10],
index=Index([pd.Interval(0, 1), pd.Interval(1, 2), "All"], name="C"),
name="All",
dtype=np.intp,
)
tm.assert_series_equal(result, expected)
def test_pass_array(self):
result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C)
expected = self.data.pivot_table("D", index="A", columns="C")
tm.assert_frame_equal(result, expected)
def test_pass_function(self):
result = self.data.pivot_table("D", index=lambda x: x // 5, columns=self.data.C)
expected = self.data.pivot_table("D", index=self.data.index // 5, columns="C")
tm.assert_frame_equal(result, expected)
def test_pivot_table_multiple(self):
index = ["A", "B"]
columns = "C"
table = pivot_table(self.data, index=index, columns=columns)
expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
tm.assert_frame_equal(table, expected)
def test_pivot_dtypes(self):
# can convert dtypes
f = DataFrame(
{
"a": ["cat", "bat", "cat", "bat"],
"v": [1, 2, 3, 4],
"i": ["a", "b", "a", "b"],
}
)
assert f.dtypes["v"] == "int64"
z = pivot_table(
f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum
)
result = z.dtypes
expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i"))
tm.assert_series_equal(result, expected)
# cannot convert dtypes
f = DataFrame(
{
"a": ["cat", "bat", "cat", "bat"],
"v": [1.5, 2.5, 3.5, 4.5],
"i": ["a", "b", "a", "b"],
}
)
assert f.dtypes["v"] == "float64"
z = pivot_table(
f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean
)
result = z.dtypes
expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), name="i"))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "columns,values",
    [
        ("bool1", ["float1", "float2"]),
        ("bool1", ["float1", "float2", "bool1"]),
        ("bool2", ["float1", "float2", "bool1"]),
    ],
)
def test_pivot_preserve_dtypes(self, columns, values):
    """Every column of the pivoted frame is expected to be float64."""
    # GH 7142 regression test
    base = np.arange(5, dtype=np.float64)
    frame = DataFrame(
        {
            "float1": base,
            "float2": base + 2.0,
            "bool1": base <= 2,
            "bool2": base <= 3,
        }
    )
    pivoted = frame.reset_index().pivot_table(
        index="index", columns=columns, values=values
    )

    result = dict(pivoted.dtypes)
    expected = dict.fromkeys(pivoted, np.dtype("float64"))
    assert result == expected
def test_pivot_no_values(self):
    """Pivot without ``values`` using datetime-derived keys and Groupers."""
    # GH 14380
    stamps = pd.DatetimeIndex(
        ["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"]
    )

    # Month of the index against day of the index.
    frame = DataFrame({"A": [1, 2, 3, 4, 5]}, index=stamps)
    result = frame.pivot_table(index=frame.index.month, columns=frame.index.day)
    day_columns = MultiIndex.from_tuples([("A", 1), ("A", 2)])
    expected = DataFrame(
        [[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=day_columns
    )
    tm.assert_frame_equal(result, expected)

    # Month of the index against a monthly Grouper on the "dt" column.
    frame = DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "dt": date_range("2011-01-01", freq="D", periods=5),
        },
        index=stamps,
    )
    monthly = Grouper(key="dt", freq="M")
    result = frame.pivot_table(index=frame.index.month, columns=monthly)
    month_columns = MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))])
    month_columns.names = [None, "dt"]
    expected = DataFrame([3.25, 2.0], index=[1, 2], columns=month_columns)
    tm.assert_frame_equal(result, expected)

    # Annual Grouper on the index against the monthly Grouper on "dt".
    result = frame.pivot_table(
        index=Grouper(freq="A"), columns=Grouper(key="dt", freq="M")
    )
    expected = DataFrame(
        [3],
        index=pd.DatetimeIndex(["2011-12-31"], freq="A"),
        columns=month_columns,
    )
    tm.assert_frame_equal(result, expected)
def test_pivot_multi_values(self):
result = pivot_table(
self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0
)
expected = pivot_table(
self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0
)
tm.assert_frame_equal(result, expected)
def test_pivot_multi_functions(self):
f = lambda func: pivot_table(
self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func
)
result = f([np.mean, np.std])
means = f(np.mean)
stds = f(np.std)
expected = concat([means, stds], keys=["mean", "std"], axis=1)
tm.assert_frame_equal(result, expected)
# margins not supported??
f = lambda func: pivot_table(
self.data,
values=["D", "E"],
index=["A", "B"],
columns="C",
aggfunc=func,
margins=True,
)
result = f([np.mean, np.std])
means = f(np.mean)
stds = f(np.std)
expected = concat([means, stds], keys=["mean", "std"], axis=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_index_with_nan(self, method):
    """Pivot frames whose index/column keys contain NaN.

    ``method`` selects between ``DataFrame.pivot`` (True) and the top-level
    ``pd.pivot`` (False). ``index``/``columns``/``values`` are passed by
    keyword: the positional form is deprecated and later keyword-only.
    """
    # GH 3588
    nan = np.nan
    df = DataFrame(
        {
            "a": ["R1", "R2", nan, "R4"],
            "b": ["C1", "C2", "C3", "C4"],
            "c": [10, 15, 17, 20],
        }
    )
    if method:
        result = df.pivot(index="a", columns="b", values="c")
    else:
        result = pd.pivot(df, index="a", columns="b", values="c")
    expected = DataFrame(
        [
            [nan, nan, 17, nan],
            [10, nan, nan, nan],
            [nan, 15, nan, nan],
            [nan, nan, nan, 20],
        ],
        index=Index([nan, "R1", "R2", "R4"], name="a"),
        columns=Index(["C1", "C2", "C3", "C4"], name="b"),
    )
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(
        df.pivot(index="b", columns="a", values="c"), expected.T
    )

    # GH9491
    df = DataFrame(
        {
            "a": date_range("2014-02-01", periods=6, freq="D"),
            "c": 100 + np.arange(6),
        }
    )
    df["b"] = df["a"] - pd.Timestamp("2014-02-02")
    df.loc[1, "a"] = df.loc[3, "a"] = nan
    df.loc[1, "b"] = df.loc[4, "b"] = nan

    if method:
        pv = df.pivot(index="a", columns="b", values="c")
    else:
        pv = pd.pivot(df, index="a", columns="b", values="c")
    # Every source row must land in exactly one cell of the pivot.
    assert pv.notna().values.sum() == len(df)

    for _, row in df.iterrows():
        assert pv.loc[row["a"], row["b"]] == row["c"]

    if method:
        result = df.pivot(index="b", columns="a", values="c")
    else:
        result = pd.pivot(df, index="b", columns="a", values="c")
    tm.assert_frame_equal(result, pv.T)
@pytest.mark.filterwarnings("ignore:Timestamp.freq is deprecated:FutureWarning")
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_tz(self, method):
    """Pivot on tz-aware datetime index/columns, with and without ``values``."""
    # GH 5878
    df = DataFrame(
        {
            "dt1": [datetime(2013, 1, day, 9, 0) for day in (1, 2, 1, 2)],
            "dt2": [datetime(2014, 1, day, 9, 0) for day in (1, 1, 2, 2)],
            "data1": np.arange(4, dtype="int64"),
            "data2": np.arange(4, dtype="int64"),
        }
    )
    df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific"))
    df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo"))

    def do_pivot(**kwargs):
        # Dispatch to the method or the top-level function form.
        if method:
            return df.pivot(**kwargs)
        return pd.pivot(df, **kwargs)

    pacific_index = pd.DatetimeIndex(
        ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific"
    )

    # No ``values``: both data columns are pivoted under a two-level header.
    top_level = Index(["data1", "data1", "data2", "data2"])
    tokyo_level = pd.DatetimeIndex(
        ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo"
    )
    expected = DataFrame(
        [[0, 2, 0, 2], [1, 3, 1, 3]],
        index=pacific_index,
        columns=MultiIndex.from_arrays([top_level, tokyo_level]),
    )
    pv = do_pivot(index="dt1", columns="dt2")
    tm.assert_frame_equal(pv, expected)

    # Single ``values`` column: flat tz-aware columns.
    expected = DataFrame(
        [[0, 2], [1, 3]],
        index=pacific_index,
        columns=pd.DatetimeIndex(
            ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo"
        ),
    )
    pv = do_pivot(index="dt1", columns="dt2", values="data1")
    tm.assert_frame_equal(pv, expected)
def test_pivot_tz_in_values(self):
# GH 14948
df = DataFrame(
[
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"),
},
]
)
df = df.set_index("ts").reset_index()
mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0))
result = pivot_table(
df.set_index("ts").reset_index(),
values="ts",
index=["uid"],
columns=[mins],
aggfunc=np.min,
)
expected = DataFrame(
[
[
pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"),
pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"),
]
],
index=Index(["aa"], name="uid"),
columns=pd.DatetimeIndex(
[
pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"),
pd.Timestamp("2016-08-25 00:00:00", tz="US/Pacific"),
],
name="ts",
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_periods(self, method):
    """Pivot on Period-valued index/columns, with and without ``values``."""
    df = DataFrame(
        {
            "p1": [
                pd.Period(day, "D")
                for day in ("2013-01-01", "2013-01-02", "2013-01-01", "2013-01-02")
            ],
            "p2": [
                pd.Period(month, "M")
                for month in ("2013-01", "2013-01", "2013-02", "2013-02")
            ],
            "data1": np.arange(4, dtype="int64"),
            "data2": np.arange(4, dtype="int64"),
        }
    )

    def do_pivot(**kwargs):
        # Dispatch to the method or the top-level function form.
        if method:
            return df.pivot(**kwargs)
        return pd.pivot(df, **kwargs)

    daily_index = pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D")

    # No ``values``: both data columns under a two-level header.
    top_level = Index(["data1", "data1", "data2", "data2"])
    month_level = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M")
    expected = DataFrame(
        [[0, 2, 0, 2], [1, 3, 1, 3]],
        index=daily_index,
        columns=MultiIndex.from_arrays([top_level, month_level]),
    )
    pv = do_pivot(index="p1", columns="p2")
    tm.assert_frame_equal(pv, expected)

    # Single ``values`` column: flat monthly columns.
    expected = DataFrame(
        [[0, 2], [1, 3]],
        index=daily_index,
        columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"),
    )
    pv = do_pivot(index="p1", columns="p2", values="data1")
    tm.assert_frame_equal(pv, expected)
def test_pivot_periods_with_margins(self):
# GH 28323
df = DataFrame(
{
"a": [1, 1, 2, 2],
"b": [
pd.Period("2019Q1"),
pd.Period("2019Q2"),
pd.Period("2019Q1"),
pd.Period("2019Q2"),
],
"x": 1.0,
}
)
expected = DataFrame(
data=1.0,
index=Index([1, 2, "All"], name="a"),
columns=Index([pd.Period("2019Q1"), pd.Period("2019Q2"), "All"], name="b"),
)
result = df.pivot_table(index="a", columns="b", values="x", margins=True)
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
    "values",
    [
        ["baz", "zoo"],
        np.array(["baz", "zoo"]),
        Series(["baz", "zoo"]),
        Index(["baz", "zoo"]),
    ],
)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_list_like_values(self, values, method):
    """``values`` may be any list-like of column labels."""
    # issue #17160
    frame = DataFrame(
        {
            "foo": ["one", "one", "one", "two", "two", "two"],
            "bar": ["A", "B", "C", "A", "B", "C"],
            "baz": [1, 2, 3, 4, 5, 6],
            "zoo": ["x", "y", "z", "q", "w", "t"],
        }
    )

    pivot_kwargs = {"index": "foo", "columns": "bar", "values": values}
    if method:
        result = frame.pivot(**pivot_kwargs)
    else:
        result = pd.pivot(frame, **pivot_kwargs)

    expected_columns = MultiIndex(
        levels=[["baz", "zoo"], ["A", "B", "C"]],
        codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
        names=[None, "bar"],
    )
    expected = DataFrame(
        data=[[1, 2, 3, "x", "y", "z"], [4, 5, 6, "q", "w", "t"]],
        index=Index(data=["one", "two"], name="foo"),
        columns=expected_columns,
        dtype="object",
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "values",
    [
        ["bar", "baz"],
        np.array(["bar", "baz"]),
        Series(["bar", "baz"]),
        Index(["bar", "baz"]),
    ],
)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_list_like_values_nans(self, values, method):
    """List-like ``values`` where the pivoted result contains missing cells."""
    # issue #17160
    frame = DataFrame(
        {
            "foo": ["one", "one", "one", "two", "two", "two"],
            "bar": ["A", "B", "C", "A", "B", "C"],
            "baz": [1, 2, 3, 4, 5, 6],
            "zoo": ["x", "y", "z", "q", "w", "t"],
        }
    )

    pivot_kwargs = {"index": "zoo", "columns": "foo", "values": values}
    if method:
        result = frame.pivot(**pivot_kwargs)
    else:
        result = pd.pivot(frame, **pivot_kwargs)

    missing = np.nan
    expected_rows = [
        [missing, "A", missing, 4],
        [missing, "C", missing, 6],
        [missing, "B", missing, 5],
        ["A", missing, 1, missing],
        ["B", missing, 2, missing],
        ["C", missing, 3, missing],
    ]
    expected_columns = MultiIndex(
        levels=[["bar", "baz"], ["one", "two"]],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
        names=[None, "foo"],
    )
    expected = DataFrame(
        data=expected_rows,
        index=Index(data=["q", "t", "w", "x", "y", "z"], name="zoo"),
        columns=expected_columns,
        dtype="object",
    )
    tm.assert_frame_equal(result, expected)
def test_pivot_columns_none_raise_error(self):
    """Calling ``pivot`` without ``columns`` must raise TypeError."""
    # GH 30924
    frame = DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]}
    )
    expected_message = r"pivot\(\) missing 1 required argument: 'columns'"
    with pytest.raises(TypeError, match=expected_message):
        frame.pivot(index="col1", values="col3")
@pytest.mark.xfail(
    reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966"
)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_multiindex(self, method):
    """Pivot a frame with MultiIndex columns using tuple labels (marked xfail)."""
    # issue #17160
    rows = [
        ["one", "A", 1, "x"],
        ["one", "B", 2, "y"],
        ["one", "C", 3, "z"],
        ["two", "A", 4, "q"],
        ["two", "B", 5, "w"],
        ["two", "C", 6, "t"],
    ]
    header = MultiIndex(
        levels=[["bar", "baz"], ["first", "second"]],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
    )
    frame = DataFrame(
        data=rows,
        index=Index(data=[0, 1, 2, 3, 4, 5]),
        columns=header,
        dtype="object",
    )

    pivot_kwargs = {
        "index": ("bar", "first"),
        "columns": ("bar", "second"),
        "values": ("baz", "first"),
    }
    if method:
        result = frame.pivot(**pivot_kwargs)
    else:
        result = pd.pivot(frame, **pivot_kwargs)

    row_labels = ["one", "two"]
    expected = DataFrame(
        {
            "A": Series([1, 4], index=row_labels),
            "B": Series([2, 5], index=row_labels),
            "C": Series([3, 6], index=row_labels),
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_tuple_of_values(self, method):
    """A tuple passed as ``values`` is looked up as one label and raises KeyError."""
    # issue #17160
    frame = DataFrame(
        {
            "foo": ["one", "one", "one", "two", "two", "two"],
            "bar": ["A", "B", "C", "A", "B", "C"],
            "baz": [1, 2, 3, 4, 5, 6],
            "zoo": ["x", "y", "z", "q", "w", "t"],
        }
    )
    pivot_kwargs = {"index": "zoo", "columns": "foo", "values": ("bar", "baz")}
    with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"):
        # tuple is seen as a single column name
        if method:
            frame.pivot(**pivot_kwargs)
        else:
            pd.pivot(frame, **pivot_kwargs)
def test_margins(self):
    """Check row, column and grand-total margins of mean pivots on ``self.data``."""

    def _check_output(
        result, values_col, index=["A", "B"], columns=["C"], margins_col="All"
    ):
        # Margin column (all rows but the last) vs. per-index-group means.
        body_rows = result.index[:-1]
        col_margins = result.loc[body_rows, margins_col]
        expected_col_margins = self.data.groupby(index)[values_col].mean()
        tm.assert_series_equal(col_margins, expected_col_margins, check_names=False)
        assert col_margins.name == margins_col

        # Margin row (all entries but the last) vs. per-column-group means.
        result = result.sort_index()
        margin_key = (margins_col, "")
        index_margins = result.loc[margin_key].iloc[:-1]
        expected_ix_margins = self.data.groupby(columns)[values_col].mean()
        tm.assert_series_equal(
            index_margins, expected_ix_margins, check_names=False
        )
        assert index_margins.name == margin_key

        # Corner cell vs. the overall mean.
        grand_total_margins = result.loc[margin_key, margins_col]
        expected_total_margins = self.data[values_col].mean()
        assert grand_total_margins == expected_total_margins

    row_keys = ["A", "B"]

    # column specified
    result = self.data.pivot_table(
        values="D", index=row_keys, columns="C", margins=True, aggfunc=np.mean
    )
    _check_output(result, "D")

    # Set a different margins_name (not 'All')
    result = self.data.pivot_table(
        values="D",
        index=row_keys,
        columns="C",
        margins=True,
        aggfunc=np.mean,
        margins_name="Totals",
    )
    _check_output(result, "D", margins_col="Totals")

    # no column specified
    table = self.data.pivot_table(
        index=row_keys, columns="C", margins=True, aggfunc=np.mean
    )
    for value_col in table.columns.levels[0]:
        _check_output(table[value_col], value_col)

    # no col

    # to help with a buglet
    self.data.columns = [name * 2 for name in self.data.columns]
    doubled_keys = ["AA", "BB"]
    table = self.data.pivot_table(index=doubled_keys, margins=True, aggfunc=np.mean)
    for value_col in table.columns:
        totals = table.loc[("All", ""), value_col]
        assert totals == self.data[value_col].mean()

    table = self.data.pivot_table(index=doubled_keys, margins=True, aggfunc="mean")
    for item in ["DD", "EE", "FF"]:
        totals = table.loc[("All", ""), item]
        assert totals == self.data[item].mean()
@pytest.mark.parametrize(
    "columns, aggfunc, values, expected_columns",
    [
        (
            "A",
            np.mean,
            [[5.5, 5.5, 2.2, 2.2], [8.0, 8.0, 4.4, 4.4]],
            Index(["bar", "All", "foo", "All"], name="A"),
        ),
        (
            ["A", "B"],
            "sum",
            [[9, 13, 22, 5, 6, 11], [14, 18, 32, 11, 11, 22]],
            MultiIndex.from_tuples(
                [
                    ("bar", "one"),
                    ("bar", "two"),
                    ("bar", "All"),
                    ("foo", "one"),
                    ("foo", "two"),
                    ("foo", "All"),
                ],
                names=["A", "B"],
            ),
        ),
    ],
)
def test_margin_with_only_columns_defined(
    self, columns, aggfunc, values, expected_columns
):
    """Margins when only ``columns`` (no ``index``) is given to pivot_table."""
    # GH 31016
    frame = DataFrame(
        {
            "A": "foo foo foo foo foo bar bar bar bar".split(),
            "B": "one one one two two one one two two".split(),
            "C": "small large large small small large small small large".split(),
            "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
            "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
        }
    )

    result = frame.pivot_table(columns=columns, margins=True, aggfunc=aggfunc)
    expected = DataFrame(
        values, index=Index(["D", "E"]), columns=expected_columns
    )
    tm.assert_frame_equal(result, expected)
def test_margins_dtype(self):
# GH 17013
df = self.data.copy()
df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3).astype("i8")
mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")]
mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))
expected = DataFrame(
{"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi
).rename_axis("C", axis=1)
expected["All"] = expected["dull"] + expected["shiny"]
result = df.pivot_table(
values="D",
index=["A", "B"],
columns="C",
margins=True,
aggfunc=np.sum,
fill_value=0,
)
tm.assert_frame_equal(expected, result)
def test_margins_dtype_len(self):
mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")]
mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))
expected = DataFrame(
{"dull": [1, 1, 2, 1, 5], "shiny": [2, 0, 2, 2, 6]}, index=mi
).rename_axis("C", axis=1)
expected["All"] = expected["dull"] + expected["shiny"]
result = self.data.pivot_table(
values="D",
index=["A", "B"],
columns="C",
margins=True,
aggfunc=len,
fill_value=0,
)
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)])
def test_pivot_table_multiindex_only(self, cols):
    """Pivot with only ``columns`` given, for int, str and mixed column labels."""
    # GH 17038
    first, second = cols[0], cols[1]
    frame = DataFrame({first: [1, 2, 3], second: [1, 2, 3], "v": [4, 5, 6]})

    result = frame.pivot_table(values="v", columns=cols)

    diagonal = MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols)
    expected = DataFrame([[4, 5, 6]], columns=diagonal, index=Index(["v"]))
    tm.assert_frame_equal(result, expected)
def test_pivot_table_retains_tz(self):
dti = date_range("2016-01-01", periods=3, tz="Europe/Amsterdam")
df = DataFrame({"A": np.random.randn(3), "B": np.random.randn(3), "C": dti})
result = df.pivot_table(index=["B", "C"], dropna=False)
# check tz retention
assert result.index.levels[1].equals(dti)
def test_pivot_integer_columns(self):
# caused by upstream bug in unstack
d = date.min
data = list(
product(
["foo", "bar"],
["A", "B", "C"],
["x1", "x2"],
[d + timedelta(i) for i in range(20)],
[1.0],
)
)
df = DataFrame(data)
table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])
df2 = df.rename(columns=str)
table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"])
tm.assert_frame_equal(table, table2, check_names=False)
def test_pivot_no_level_overlap(self):
# GH #1181
data = DataFrame(
{
"a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2,
"b": [0, 0, 0, 0, 1, 1, 1, 1] * 2,
"c": (["foo"] * 4 + ["bar"] * 4) * 2,
"value": np.random.randn(16),
}
)
table = data.pivot_table("value", index="a", columns=["b", "c"])
grouped = data.groupby(["a", "b", "c"])["value"].mean()
expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all")
tm.assert_frame_equal(table, expected)
def test_pivot_columns_lexsorted(self):
n = 10000
dtype = np.dtype(
[
("Index", object),
("Symbol", object),
("Year", int),
("Month", int),
("Day", int),
("Quantity", int),
("Price", float),
]
)
products = np.array(
[
("SP500", "ADBE"),
("SP500", "NVDA"),
("SP500", "ORCL"),
("NDQ100", "AAPL"),
("NDQ100", "MSFT"),
("NDQ100", "GOOG"),
("FTSE", "DGE.L"),
("FTSE", "TSCO.L"),
("FTSE", "GSK.L"),
],
dtype=[("Index", object), ("Symbol", object)],
)
items = np.empty(n, dtype=dtype)
iproduct = np.random.randint(0, len(products), n)
items["Index"] = products["Index"][iproduct]
items["Symbol"] = products["Symbol"][iproduct]
dr = date_range(date(2000, 1, 1), date(2010, 12, 31))
dates = dr[np.random.randint(0, len(dr), n)]
items["Year"] = dates.year
items["Month"] = dates.month
items["Day"] = dates.day
items["Price"] = np.random.lognormal(4.0, 2.0, n)
df = DataFrame(items)
pivoted = df.pivot_table(
"Price",
index=["Month", "Day"],
columns=["Index", "Symbol", "Year"],
aggfunc="mean",
)
assert pivoted.columns.is_monotonic
def test_pivot_complex_aggfunc(self):
f = {"D": ["std"], "E": ["sum"]}
expected = self.data.groupby(["A", "B"]).agg(f).unstack("B")
result = self.data.pivot_table(index="A", columns="B", aggfunc=f)
tm.assert_frame_equal(result, expected)
def test_margins_no_values_no_cols(self):
# Regression test on pivot table: no values or cols passed.
result = self.data[["A", "B"]].pivot_table(
index=["A", "B"], aggfunc=len, margins=True
)
result_list = result.tolist()
assert sum(result_list[:-1]) == result_list[-1]
def test_margins_no_values_two_rows(self):
# Regression test on pivot table: no values passed but rows are a
# multi-index
result = self.data[["A", "B", "C"]].pivot_table(
index=["A", "B"], columns="C", aggfunc=len, margins=True
)
assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
def test_margins_no_values_one_row_one_col(self):
# Regression test on pivot table: no values passed but row and col
# defined
result = self.data[["A", "B"]].pivot_table(
index="A", columns="B", aggfunc=len, margins=True
)
assert result.All.tolist() == [4.0, 7.0, 11.0]
def test_margins_no_values_two_row_two_cols(self):
# Regression test on pivot table: no values passed but rows and cols
# are multi-indexed
self.data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
result = self.data[["A", "B", "C", "D"]].pivot_table(
index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True
)
assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
@pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]])
def test_pivot_table_with_margins_set_margin_name(self, margin_name):
    """Conflicting or non-string ``margins_name`` values must raise ValueError."""
    # see gh-3335
    msg = (
        f'Conflicting name "{margin_name}" in margins|'
        "margins_name argument must be a string"
    )
    layouts = [
        (["A", "B"], ["C"]),  # multi-index index
        (["C"], ["A", "B"]),  # multi-index column
        (["A"], ["B"]),  # non-multi-index index/column
    ]
    for index, columns in layouts:
        with pytest.raises(ValueError, match=msg):
            pivot_table(
                self.data,
                values="D",
                index=index,
                columns=columns,
                margins=True,
                margins_name=margin_name,
            )
def test_pivot_timegrouper(self, using_array_manager):
    """Exercise ``pivot_table`` with ``Grouper`` objects as index and/or columns.

    Covers grouping on the frame's own index, by ``key`` and by ``level``,
    the errors raised for an unknown key or level, and two groupers used
    together. Every successful layout is also checked in transposed form.
    """

    def sum_quantity(frame, index, columns):
        # Every pivot in this test sums the "Quantity" column.
        return pivot_table(
            frame, index=index, columns=columns, values="Quantity", aggfunc=np.sum
        )

    def check_buyer_layouts(frame, grouper_kwargs, expected):
        # A fresh Grouper is built per call: first as index, then as columns.
        result = sum_quantity(frame, Grouper(**grouper_kwargs), "Buyer")
        tm.assert_frame_equal(result, expected)
        result = sum_quantity(frame, "Buyer", Grouper(**grouper_kwargs))
        tm.assert_frame_equal(result, expected.T)

    def check_buyer_layouts_raise(frame, grouper_kwargs, exc_type, msg):
        with pytest.raises(exc_type, match=msg):
            sum_quantity(frame, Grouper(**grouper_kwargs), "Buyer")
        with pytest.raises(exc_type, match=msg):
            sum_quantity(frame, "Buyer", Grouper(**grouper_kwargs))

    def month_groupers():
        return [Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")]

    nan = np.nan
    branches = "A A A A A A A B".split()
    buyers = "Carl Mark Carl Carl Joe Joe Joe Carl".split()
    quantities = [1, 3, 5, 1, 8, 1, 9, 3]
    buyer_labels = "Carl Joe Mark".split()

    df = DataFrame(
        {
            "Branch": branches,
            "Buyer": buyers,
            "Quantity": quantities,
            "Date": [
                datetime(2013, 1, 1),
                datetime(2013, 1, 1),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 12, 2),
                datetime(2013, 12, 2),
            ],
        }
    ).set_index("Date")

    # grouping the index by year
    annual = DataFrame(
        np.array([[10, 18, 3]], dtype="int64"),
        index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="A"),
        columns=buyer_labels,
    )
    annual.index.name = "Date"
    annual.columns.name = "Buyer"
    check_buyer_layouts(df, {"freq": "A"}, annual)

    # grouping the index by half year
    half_yearly = DataFrame(
        np.array([[1, nan, 3], [9, 18, nan]]),
        index=pd.DatetimeIndex(
            [datetime(2013, 1, 1), datetime(2013, 7, 1)], freq="6MS"
        ),
        columns=buyer_labels,
    )
    half_yearly.index.name = "Date"
    half_yearly.columns.name = "Buyer"
    if using_array_manager:
        # INFO(ArrayManager) column without NaNs can preserve int dtype
        half_yearly["Carl"] = half_yearly["Carl"].astype("int64")
    check_buyer_layouts(df, {"freq": "6MS"}, half_yearly)

    # passing the name
    df = df.reset_index()
    check_buyer_layouts(df, {"freq": "6MS", "key": "Date"}, half_yearly)
    check_buyer_layouts_raise(
        df,
        {"freq": "6MS", "key": "foo"},
        KeyError,
        "'The grouper name foo is not found'",
    )

    # passing the level
    df = df.set_index("Date")
    check_buyer_layouts(df, {"freq": "6MS", "level": "Date"}, half_yearly)
    check_buyer_layouts_raise(
        df,
        {"freq": "6MS", "level": "foo"},
        ValueError,
        "The level foo is not valid",
    )

    # double grouper
    df = DataFrame(
        {
            "Branch": branches,
            "Buyer": buyers,
            "Quantity": quantities,
            "Date": [
                datetime(2013, 11, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 11, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 12, 5, 14, 0),
            ],
            "PayDay": [
                datetime(2013, 10, 4, 0, 0),
                datetime(2013, 10, 15, 13, 5),
                datetime(2013, 9, 5, 20, 0),
                datetime(2013, 11, 2, 10, 0),
                datetime(2013, 10, 7, 20, 0),
                datetime(2013, 9, 5, 10, 0),
                datetime(2013, 12, 30, 12, 0),
                datetime(2013, 11, 20, 14, 0),
            ],
        }
    )
    month_ends = [
        datetime(2013, 9, 30),
        datetime(2013, 10, 31),
        datetime(2013, 11, 30),
        datetime(2013, 12, 31),
    ]
    by_month = DataFrame(
        np.array(
            [
                [nan, 3, nan, nan],
                [6, nan, 1, 9],
                [nan, 9, nan, nan],
                [nan, nan, 3, nan],
            ]
        ),
        index=pd.DatetimeIndex(month_ends, freq="M"),
        columns=pd.DatetimeIndex(month_ends, freq="M"),
    )
    by_month.index.name = "Date"
    by_month.columns.name = "PayDay"

    result = sum_quantity(
        df, Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")
    )
    tm.assert_frame_equal(result, by_month)

    result = sum_quantity(
        df, Grouper(freq="M", key="PayDay"), Grouper(freq="M", key="Date")
    )
    tm.assert_frame_equal(result, by_month.T)

    # both groupers on one axis, branch on the other
    tuples = [
        (datetime(2013, 9, 30), datetime(2013, 10, 31)),
        (datetime(2013, 10, 31), datetime(2013, 9, 30)),
        (datetime(2013, 10, 31), datetime(2013, 11, 30)),
        (datetime(2013, 10, 31), datetime(2013, 12, 31)),
        (datetime(2013, 11, 30), datetime(2013, 10, 31)),
        (datetime(2013, 12, 31), datetime(2013, 11, 30)),
    ]
    by_branch = DataFrame(
        np.array([[3, nan], [6, nan], [1, nan], [9, nan], [9, nan], [nan, 3]]),
        index=MultiIndex.from_tuples(tuples, names=["Date", "PayDay"]),
        columns=["A", "B"],
    )
    by_branch.columns.name = "Branch"

    result = sum_quantity(df, month_groupers(), ["Branch"])
    tm.assert_frame_equal(result, by_branch)

    result = sum_quantity(df, ["Branch"], month_groupers())
    tm.assert_frame_equal(result, by_branch.T)
def test_pivot_datetime_tz(self):
dates1 = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
dates2 = [
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"dt1": dates1,
"dt2": dates2,
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific"))
df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo"))
exp_idx = pd.DatetimeIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
tz="US/Pacific",
name="dt1",
)
exp_col1 = Index(["value1", "value1"])
exp_col2 = Index(["a", "b"], name="label")
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col)
result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"])
tm.assert_frame_equal(result, expected)
exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"])
exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2)
exp_col3 = pd.DatetimeIndex(
["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4,
tz="Asia/Tokyo",
name="dt2",
)
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
expected = DataFrame(
np.array(
[
[0, 3, 1, 2, 0, 3, 1, 2],
[1, 4, 2, 1, 1, 4, 2, 1],
[2, 5, 1, 2, 2, 5, 1, 2],
],
dtype="int64",
),
index=exp_idx,
columns=exp_col,
)
result = pivot_table(
df,
index=["dt1"],
columns=["dt2"],
values=["value1", "value2"],
aggfunc=[np.sum, np.mean],
)
tm.assert_frame_equal(result, expected)
def test_pivot_dtaccessor(self):
# GH 8103
dates1 = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
dates2 = [
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"dt1": dates1,
"dt2": dates2,
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d))
df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d))
result = pivot_table(
df, index="label", columns=df["dt1"].dt.hour, values="value1"
)
exp_idx = Index(["a", "b"], name="label")
expected = DataFrame(
{7: [0, 3], 8: [1, 4], 9: [2, 5]},
index=exp_idx,
columns=Index([7, 8, 9], name="dt1"),
)
tm.assert_frame_equal(result, expected)
result = pivot_table(
df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1"
)
expected = DataFrame(
{7: [0, 3], 8: [1, 4], 9: [2, 5]},
index=Index([1, 2], name="dt2"),
columns=Index([7, 8, 9], name="dt1"),
)
tm.assert_frame_equal(result, expected)
result = pivot_table(
df,
index=df["dt2"].dt.year.values,
columns=[df["dt1"].dt.hour, df["dt2"].dt.month],
values="value1",
)
exp_col = MultiIndex.from_arrays(
[[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"]
)
expected = DataFrame(
np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col
)
tm.assert_frame_equal(result, expected)
result = pivot_table(
df,
index=np.array(["X", "X", "X", "X", "Y", "Y"]),
columns=[df["dt1"].dt.hour, df["dt2"].dt.month],
values="value1",
)
expected = DataFrame(
np.array(
[[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]]
),
index=["X", "Y"],
columns=exp_col,
)
tm.assert_frame_equal(result, expected)
def test_daily(self):
rng = date_range("1/1/2000", "12/31/2004", freq="D")
ts = Series(np.random.randn(len(rng)), index=rng)
annual = pivot_table(
DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear
)
annual.columns = annual.columns.droplevel(0)
doy = np.asarray(ts.index.dayofyear)
for i in range(1, 367):
subset = ts[doy == i]
subset.index = subset.index.year
result = annual[i].dropna()
tm.assert_series_equal(result, subset, check_names=False)
assert result.name == i
def test_monthly(self):
rng = date_range("1/1/2000", "12/31/2004", freq="M")
ts = Series(np.random.randn(len(rng)), index=rng)
annual = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month)
annual.columns = annual.columns.droplevel(0)
month = ts.index.month
for i in range(1, 13):
subset = ts[month == i]
subset.index = subset.index.year
result = annual[i].dropna()
tm.assert_series_equal(result, subset, check_names=False)
assert result.name == i
def test_pivot_table_with_iterator_values(self):
# GH 12017
aggs = {"D": "sum", "E": "mean"}
pivot_values_list = pivot_table(
self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs
)
pivot_values_keys = pivot_table(
self.data, index=["A"], values=aggs.keys(), aggfunc=aggs
)
tm.assert_frame_equal(pivot_values_keys, pivot_values_list)
agg_values_gen = (value for value in aggs.keys())
pivot_values_gen = pivot_table(
self.data, index=["A"], values=agg_values_gen, aggfunc=aggs
)
tm.assert_frame_equal(pivot_values_gen, pivot_values_list)
def test_pivot_table_margins_name_with_aggfunc_list(self):
# GH 13354
margins_name = "Weekly"
costs = DataFrame(
{
"item": ["bacon", "cheese", "bacon", "cheese"],
"cost": [2.5, 4.5, 3.2, 3.3],
"day": ["M", "M", "T", "T"],
}
)
table = costs.pivot_table(
index="item",
columns="day",
margins=True,
margins_name=margins_name,
aggfunc=[np.mean, max],
)
ix = Index(["bacon", "cheese", margins_name], dtype="object", name="item")
tups = [
("mean", "cost", "M"),
("mean", "cost", "T"),
("mean", "cost", margins_name),
("max", "cost", "M"),
("max", "cost", "T"),
("max", "cost", margins_name),
]
cols = MultiIndex.from_tuples(tups, names=[None, None, "day"])
expected = DataFrame(table.values, index=ix, columns=cols)
tm.assert_frame_equal(table, expected)
def test_categorical_margins(self, observed, request):
if observed:
request.node.add_marker(
pytest.mark.xfail(
reason="GH#17035 (np.mean of ints is casted back to ints)"
)
)
# GH 10989
df = DataFrame(
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
)
expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
expected.index = Index([0, 1, "All"], name="y")
expected.columns = Index([0, 1, "All"], name="z")
table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
tm.assert_frame_equal(table, expected)
def test_categorical_margins_category(self, observed, request):
if observed:
request.node.add_marker(
pytest.mark.xfail(
reason="GH#17035 (np.mean of ints is casted back to ints)"
)
)
df = DataFrame(
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
)
expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
expected.index = Index([0, 1, "All"], name="y")
expected.columns = Index([0, 1, "All"], name="z")
df.y = df.y.astype("category")
df.z = df.z.astype("category")
table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
tm.assert_frame_equal(table, expected)
def test_margins_casted_to_float(self, observed):
# GH 24893
df = DataFrame(
{
"A": [2, 4, 6, 8],
"B": [1, 4, 5, 8],
"C": [1, 3, 4, 6],
"D": ["X", "X", "Y", "Y"],
}
)
result = pivot_table(df, index="D", margins=True)
expected = DataFrame(
{"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]},
index=Index(["X", "Y", "All"], name="D"),
)
tm.assert_frame_equal(result, expected)
def test_pivot_with_categorical(self, observed, ordered):
# gh-21370
idx = [np.nan, "low", "high", "low", np.nan]
col = [np.nan, "A", "B", np.nan, "A"]
df = DataFrame(
{
"In": Categorical(idx, categories=["low", "high"], ordered=ordered),
"Col": Categorical(col, categories=["A", "B"], ordered=ordered),
"Val": range(1, 6),
}
)
# case with index/columns/value
result = df.pivot_table(
index="In", columns="Col", values="Val", observed=observed
)
expected_cols = pd.CategoricalIndex(["A", "B"], ordered=ordered, name="Col")
expected = DataFrame(data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols)
expected.index = Index(
Categorical(["low", "high"], categories=["low", "high"], ordered=ordered),
name="In",
)
tm.assert_frame_equal(result, expected)
# case with columns/value
result = df.pivot_table(columns="Col", values="Val", observed=observed)
expected = DataFrame(
data=[[3.5, 3.0]], columns=expected_cols, index=Index(["Val"])
)
tm.assert_frame_equal(result, expected)
def test_categorical_aggfunc(self, observed):
# GH 9534
df = DataFrame(
{"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]}
)
df["C1"] = df["C1"].astype("category")
result = df.pivot_table(
"V", index="C1", columns="C2", dropna=observed, aggfunc="count"
)
expected_index = pd.CategoricalIndex(
["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
)
expected_columns = Index(["a", "b"], name="C2")
expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
expected = DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
tm.assert_frame_equal(result, expected)
def test_categorical_pivot_index_ordering(self, observed):
# GH 8731
df = DataFrame(
{
"Sales": [100, 120, 220],
"Month": ["January", "January", "January"],
"Year": [2013, 2014, 2013],
}
)
months = [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December",
]
df["Month"] = df["Month"].astype("category").cat.set_categories(months)
result = df.pivot_table(
values="Sales",
index="Month",
columns="Year",
observed=observed,
aggfunc="sum",
)
expected_columns = Index([2013, 2014], name="Year", dtype="int64")
expected_index = pd.CategoricalIndex(
months, categories=months, ordered=False, name="Month"
)
expected_data = [[320, 120]] + [[0, 0]] * 11
expected = DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
if observed:
expected = expected.loc[["January"]]
tm.assert_frame_equal(result, expected)
def test_pivot_table_not_series(self):
# GH 4386
# pivot_table always returns a DataFrame
# when values is not list like and columns is None
# and aggfunc is not instance of list
df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]})
result = df.pivot_table("col1", index=["col3", "col2"], aggfunc=np.sum)
m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"])
expected = DataFrame([3, 4, 5], index=m, columns=["col1"])
tm.assert_frame_equal(result, expected)
result = df.pivot_table("col1", index="col3", columns="col2", aggfunc=np.sum)
expected = DataFrame(
[[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]],
index=Index([1, 3, 9], name="col3"),
columns=Index(["C", "D", "E"], name="col2"),
)
tm.assert_frame_equal(result, expected)
result = df.pivot_table("col1", index="col3", aggfunc=[np.sum])
m = MultiIndex.from_arrays([["sum"], ["col1"]])
expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m)
tm.assert_frame_equal(result, expected)
def test_pivot_margins_name_unicode(self):
# issue #13292
greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae"
frame = DataFrame({"foo": [1, 2, 3]})
table = pivot_table(
frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek
)
index = Index([1, 2, 3, greek], dtype="object", name="foo")
expected = DataFrame(index=index)
tm.assert_frame_equal(table, expected)
def test_pivot_string_as_func(self):
# GH #18713
# for correctness purposes
data = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": range(11),
}
)
result = pivot_table(data, index="A", columns="B", aggfunc="sum")
mi = MultiIndex(
levels=[["C"], ["one", "two"]], codes=[[0, 0], [0, 1]], names=[None, "B"]
)
expected = DataFrame(
{("C", "one"): {"bar": 15, "foo": 13}, ("C", "two"): {"bar": 7, "foo": 20}},
columns=mi,
).rename_axis("A")
tm.assert_frame_equal(result, expected)
result = pivot_table(data, index="A", columns="B", aggfunc=["sum", "mean"])
mi = MultiIndex(
levels=[["sum", "mean"], ["C"], ["one", "two"]],
codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]],
names=[None, None, "B"],
)
expected = DataFrame(
{
("mean", "C", "one"): {"bar": 5.0, "foo": 3.25},
("mean", "C", "two"): {"bar": 7.0, "foo": 6.666666666666667},
("sum", "C", "one"): {"bar": 15, "foo": 13},
("sum", "C", "two"): {"bar": 7, "foo": 20},
},
columns=mi,
).rename_axis("A")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "f, f_numpy",
    [
        ("sum", np.sum),
        ("mean", np.mean),
        ("std", np.std),
        (["sum", "mean"], [np.sum, np.mean]),
        (["sum", "std"], [np.sum, np.std]),
        (["std", "mean"], [np.std, np.mean]),
    ],
)
def test_pivot_string_func_vs_func(self, f, f_numpy):
    """A string ``aggfunc`` gives the same table as its NumPy counterpart."""
    # GH #18713
    # for consistency purposes
    pivot_kwargs = {"index": "A", "columns": "B"}
    from_names = pivot_table(self.data, aggfunc=f, **pivot_kwargs)
    from_numpy = pivot_table(self.data, aggfunc=f_numpy, **pivot_kwargs)
    tm.assert_frame_equal(from_names, from_numpy)
@pytest.mark.slow
def test_pivot_number_of_levels_larger_than_int32(self, monkeypatch):
    """An oversized unstack emits ``PerformanceWarning``.

    ``_Unstacker`` is swapped for a subclass that raises straight after the
    parent initialiser, so the final result is never computed.
    """
    # GH 20601
    # GH 26314: Change ValueError to PerformanceWarning
    class RaisingUnstacker(reshape_lib._Unstacker):
        def __init__(self, *args, **kwargs):
            # the parent __init__ will raise the warning
            super().__init__(*args, **kwargs)
            raise Exception("Don't compute final result.")

    size = 2**16
    df = DataFrame({"ind1": np.arange(size), "ind2": np.arange(size), "count": 0})
    warning_msg = "The following operation may generate"

    with monkeypatch.context() as patcher:
        patcher.setattr(reshape_lib, "_Unstacker", RaisingUnstacker)

        with tm.assert_produces_warning(PerformanceWarning, match=warning_msg):
            with pytest.raises(Exception, match="Don't compute final result."):
                df.pivot_table(
                    index="ind1", columns="ind2", values="count", aggfunc="count"
                )
def test_pivot_table_aggfunc_dropna(self, dropna):
# GH 22159
df = DataFrame(
{
"fruit": ["apple", "peach", "apple"],
"size": [1, 1, 2],
"taste": [7, 6, 6],
}
)
def ret_one(x):
return 1
def ret_sum(x):
return sum(x)
def ret_none(x):
return np.nan
result = pivot_table(
df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna
)
data = [[3, 1, np.nan, np.nan, 1, 1], [13, 6, np.nan, np.nan, 1, 1]]
col = MultiIndex.from_product(
[["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]],
names=[None, "fruit"],
)
expected = DataFrame(data, index=["size", "taste"], columns=col)
if dropna:
expected = expected.dropna(axis="columns")
tm.assert_frame_equal(result, expected)
def test_pivot_table_aggfunc_scalar_dropna(self, dropna):
# GH 22159
df = DataFrame(
{"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]}
)
result = pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna)
data = [[2.5, np.nan], [1, np.nan]]
col = Index(["one", "two"], name="A")
expected = DataFrame(data, index=["x", "y"], columns=col)
if dropna:
expected = expected.dropna(axis="columns")
tm.assert_frame_equal(result, expected)
def test_pivot_table_empty_aggfunc(self):
# GH 9186 & GH 13483
df = DataFrame(
{
"A": [2, 2, 3, 3, 2],
"id": [5, 6, 7, 8, 9],
"C": ["p", "q", "q", "p", "q"],
"D": [None, None, None, None, None],
}
)
result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
expected = DataFrame(index=Index([], dtype="int64", name="A"))
expected.columns.name = "D"
tm.assert_frame_equal(result, expected)
def test_pivot_table_no_column_raises(self):
    """Asking for a missing ``values`` column raises ``KeyError`` naming it."""
    # GH 10326
    def mean_of(arr):
        return np.mean(arr)

    frame = DataFrame({"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]})
    with pytest.raises(KeyError, match="notpresent"):
        frame.pivot_table("notpresent", "X", "Y", aggfunc=mean_of)
def test_pivot_table_multiindex_columns_doctest_case(self):
# The relevant characteristic is that the call
# to maybe_downcast_to_dtype(agged[v], data[v].dtype) in
# __internal_pivot_table has `agged[v]` a DataFrame instead of Series,
# In this case this is because agged.columns is a MultiIndex and 'v'
# is only indexing on its first level.
df = DataFrame(
{
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
"C": [
"small",
"large",
"large",
"small",
"small",
"large",
"small",
"small",
"large",
],
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
}
)
table = pivot_table(
df,
values=["D", "E"],
index=["A", "C"],
aggfunc={"D": np.mean, "E": [min, max, np.mean]},
)
cols = MultiIndex.from_tuples(
[("D", "mean"), ("E", "max"), ("E", "mean"), ("E", "min")]
)
index = MultiIndex.from_tuples(
[("bar", "large"), ("bar", "small"), ("foo", "large"), ("foo", "small")],
names=["A", "C"],
)
vals = np.array(
[
[5.5, 9.0, 7.5, 6.0],
[5.5, 9.0, 8.5, 8.0],
[2.0, 5.0, 4.5, 4.0],
[2.33333333, 6.0, 4.33333333, 2.0],
]
)
expected = DataFrame(vals, columns=cols, index=index)
expected[("E", "min")] = expected[("E", "min")].astype(np.int64)
expected[("E", "max")] = expected[("E", "max")].astype(np.int64)
tm.assert_frame_equal(table, expected)
def test_pivot_table_sort_false(self):
# GH#39143
df = DataFrame(
{
"a": ["d1", "d4", "d3"],
"col": ["a", "b", "c"],
"num": [23, 21, 34],
"year": ["2018", "2018", "2019"],
}
)
result = df.pivot_table(
index=["a", "col"], columns="year", values="num", aggfunc="sum", sort=False
)
expected = DataFrame(
[[23, np.nan], [21, np.nan], [np.nan, 34]],
columns=Index(["2018", "2019"], name="year"),
index=MultiIndex.from_arrays(
[["d1", "d4", "d3"], ["a", "b", "c"]], names=["a", "col"]
),
)
tm.assert_frame_equal(result, expected)
def test_pivot_table_with_margins_and_numeric_columns(self):
# GH 26568
df = DataFrame([["a", "x", 1], ["a", "y", 2], ["b", "y", 3], ["b", "z", 4]])
df.columns = [10, 20, 30]
result = df.pivot_table(
index=10, columns=20, values=30, aggfunc="sum", fill_value=0, margins=True
)
expected = DataFrame([[1, 2, 0, 3], [0, 3, 4, 7], [1, 5, 4, 10]])
expected.columns = ["x", "y", "z", "All"]
expected.index = ["a", "b", "All"]
expected.columns.name = 20
expected.index.name = 10
tm.assert_frame_equal(result, expected)
class TestPivot:
def test_pivot(self):
data = {
"index": ["A", "B", "C", "C", "B", "A"],
"columns": ["One", "One", "One", "Two", "Two", "Two"],
"values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
}
frame = DataFrame(data)
pivoted = frame.pivot(index="index", columns="columns", values="values")
expected = DataFrame(
{
"One": {"A": 1.0, "B": 2.0, "C": 3.0},
"Two": {"A": 1.0, "B": 2.0, "C": 3.0},
}
)
expected.index.name, expected.columns.name = "index", "columns"
tm.assert_frame_equal(pivoted, expected)
# name tracking
assert pivoted.index.name == "index"
assert pivoted.columns.name == "columns"
# don't specify values
pivoted = frame.pivot(index="index", columns="columns")
assert pivoted.index.name == "index"
assert pivoted.columns.names == (None, "columns")
def test_pivot_duplicates(self):
    """``DataFrame.pivot`` raises ``ValueError`` on duplicate index/column pairs.

    The frame below holds the pair ("foo", "one") twice.
    """
    data = DataFrame(
        {
            "a": ["bar", "bar", "foo", "foo", "foo"],
            "b": ["one", "two", "one", "one", "two"],
            "c": [1.0, 2.0, 3.0, 3.0, 4.0],
        }
    )
    with pytest.raises(ValueError, match="duplicate entries"):
        # Keyword arguments: positional use of pivot() is deprecated, and the
        # other pivot() calls in this class already pass keywords.
        data.pivot(index="a", columns="b", values="c")
def test_pivot_empty(self):
df = DataFrame(columns=["a", "b", "c"])
result = df.pivot("a", "b", "c")
expected = DataFrame()
tm.assert_frame_equal(result, expected, check_names=False)
def test_pivot_integer_bug(self):
df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
result = df.pivot(index=1, columns=0, values=2)
repr(result)
tm.assert_index_equal(result.columns, Index(["A", "B"], name=0))
def test_pivot_index_none(self):
# GH#3962
data = {
"index": ["A", "B", "C", "C", "B", "A"],
"columns": ["One", "One", "One", "Two", "Two", "Two"],
"values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
}
frame = DataFrame(data).set_index("index")
result = frame.pivot(columns="columns", values="values")
expected = DataFrame(
{
"One": {"A": 1.0, "B": 2.0, "C": 3.0},
"Two": {"A": 1.0, "B": 2.0, "C": 3.0},
}
)
expected.index.name, expected.columns.name = "index", "columns"
tm.assert_frame_equal(result, expected)
# omit values
result = frame.pivot(columns="columns")
expected.columns = MultiIndex.from_tuples(
[("values", "One"), ("values", "Two")], names=[None, "columns"]
)
expected.index.name = "index"
tm.assert_frame_equal(result, expected, check_names=False)
assert result.index.name == "index"
assert result.columns.names == (None, "columns")
expected.columns = expected.columns.droplevel(0)
result = frame.pivot(columns="columns", values="values")
expected.columns.name = "columns"
tm.assert_frame_equal(result, expected)
def test_pivot_index_list_values_none_immutable_args(self):
# GH37635
df = DataFrame(
{
"lev1": [1, 1, 1, 2, 2, 2],
"lev2": [1, 1, 2, 1, 1, 2],
"lev3": [1, 2, 1, 2, 1, 2],
"lev4": [1, 2, 3, 4, 5, 6],
"values": [0, 1, 2, 3, 4, 5],
}
)
index = ["lev1", "lev2"]
columns = ["lev3"]
result = df.pivot(index=index, columns=columns, values=None)
expected = DataFrame(
np.array(
[
[1.0, 2.0, 0.0, 1.0],
[3.0, np.nan, 2.0, np.nan],
[5.0, 4.0, 4.0, 3.0],
[np.nan, 6.0, np.nan, 5.0],
]
),
index=MultiIndex.from_arrays(
[(1, 1, 2, 2), (1, 2, 1, 2)], names=["lev1", "lev2"]
),
columns=MultiIndex.from_arrays(
[("lev4", "lev4", "values", "values"), (1, 2, 1, 2)],
names=[None, "lev3"],
),
)
tm.assert_frame_equal(result, expected)
assert index == ["lev1", "lev2"]
assert columns == ["lev3"]