A PyQt GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be stored and compiles monthly and yearly summaries.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge.py

2633 lines
89 KiB

from datetime import (
date,
datetime,
timedelta,
)
import random
import re
import numpy as np
import pytest
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_object_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
IntervalIndex,
MultiIndex,
PeriodIndex,
RangeIndex,
Series,
TimedeltaIndex,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.api import (
Float64Index,
Int64Index,
UInt64Index,
)
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import (
MergeError,
merge,
)
N = 50
NGROUPS = 8


def get_test_data(ngroups=NGROUPS, n=N):
    """
    Build a shuffled 1-D array of ``n`` group labels taken from
    ``range(ngroups)``.

    The labels are tiled ``n // ngroups`` times; if that is shorter than
    ``n``, the leading labels are appended to reach length ``n``. The array
    is then shuffled in place with the stdlib ``random`` module.
    """
    labels = list(range(ngroups))
    tiled = np.asarray(np.tile(labels, n // ngroups))
    shortfall = n - len(tiled)
    if shortfall > 0:
        # pad with the first few labels so the result has exactly n entries
        tiled = np.asarray(list(tiled) + labels[:shortfall])
    random.shuffle(tiled)
    return tiled
def get_series():
    """
    Return a list of single-element Series, one per dtype exercised by the
    dtype-parametrized fixtures.
    """
    stamp = "2018-01-01"
    samples = [
        Series([1], dtype="int64"),
        Series([1], dtype="Int64"),
        Series([1.23]),
        Series(["foo"]),
        Series([True]),
    ]
    samples.append(Series([pd.Timestamp(stamp)]))
    samples.append(Series([pd.Timestamp(stamp, tz="US/Eastern")]))
    return samples
def get_series_na():
    """
    Return a list of single-element Series whose only value is missing,
    covering several dtypes.
    """
    typed = [Series([np.nan], dtype=kind) for kind in ("Int64", "float", "object")]
    return typed + [Series([pd.NaT])]
@pytest.fixture(params=get_series(), ids=lambda ser: ser.dtype.name)
def series_of_dtype(request):
    """
    Parametrized fixture handing out, one at a time, each Series produced
    by ``get_series``; the test id is the Series' dtype name
    """
    current = request.param
    return current
@pytest.fixture(params=get_series(), ids=lambda ser: ser.dtype.name)
def series_of_dtype2(request):
    """
    Second copy of the series_of_dtype fixture, letting one test function
    be parametrized over two independent dtypes
    """
    current = request.param
    return current
@pytest.fixture(params=get_series_na(), ids=lambda ser: ser.dtype.name)
def series_of_dtype_all_na(request):
    """
    Parametrized fixture handing out, one at a time, each all-NA Series
    produced by ``get_series_na``
    """
    current = request.param
    return current
@pytest.fixture
def dfs_for_indicator():
    """
    Provide the ``(df1, df2)`` pair of frames consumed by the indicator
    tests
    """
    left = DataFrame({"col1": [0, 1], "col_conflict": [1, 2], "col_left": ["a", "b"]})
    right_data = {
        "col1": [1, 2, 3, 4, 5],
        "col_conflict": [1, 2, 3, 4, 5],
        "col_right": [2, 2, 2, 2, 2],
    }
    return left, DataFrame(right_data)
class TestMerge:
def setup_method(self, method):
    """
    Create the frames shared by the tests: ``self.df``, ``self.df2``,
    ``self.left`` and ``self.right``.
    """
    # aggregate multiple columns
    full = DataFrame(
        {
            "key1": get_test_data(),
            "key2": get_test_data(),
            "data1": np.random.randn(N),
            "data2": np.random.randn(N),
        }
    )
    # exclude a couple keys for fun
    self.df = full[full["key2"] > 1]

    small_n = N // 5
    self.df2 = DataFrame(
        {
            "key1": get_test_data(n=small_n),
            "key2": get_test_data(ngroups=NGROUPS // 2, n=small_n),
            "value": np.random.randn(small_n),
        }
    )

    left_keys = ["a", "b", "c", "d", "e", "e", "a"]
    self.left = DataFrame({"key": left_keys, "v1": np.random.randn(7)})

    right_labels = ["d", "b", "c", "a"]
    self.right = DataFrame({"v2": np.random.randn(4)}, index=right_labels)
def test_merge_inner_join_empty(self):
# GH 15328
df_empty = DataFrame()
df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
result = merge(df_empty, df_a, left_index=True, right_index=True)
expected = DataFrame({"a": []}, index=[], dtype="int64")
tm.assert_frame_equal(result, expected)
def test_merge_common(self):
joined = merge(self.df, self.df2)
exp = merge(self.df, self.df2, on=["key1", "key2"])
tm.assert_frame_equal(joined, exp)
def test_merge_non_string_columns(self):
# https://github.com/pandas-dev/pandas/issues/17962
# Checks that method runs for non string column names
left = DataFrame(
{0: [1, 0, 1, 0], 1: [0, 1, 0, 0], 2: [0, 0, 2, 0], 3: [1, 0, 0, 3]}
)
right = left.astype(float)
expected = left
result = merge(left, right)
tm.assert_frame_equal(expected, result)
def test_merge_index_as_on_arg(self):
# GH14355
left = self.df.set_index("key1")
right = self.df2.set_index("key1")
result = merge(left, right, on="key1")
expected = merge(self.df, self.df2, on="key1").set_index("key1")
tm.assert_frame_equal(result, expected)
def test_merge_index_singlekey_right_vs_left(self):
left = DataFrame(
{"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
)
right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
merged1 = merge(
left, right, left_on="key", right_index=True, how="left", sort=False
)
merged2 = merge(
right, left, right_on="key", left_index=True, how="right", sort=False
)
tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
merged1 = merge(
left, right, left_on="key", right_index=True, how="left", sort=True
)
merged2 = merge(
right, left, right_on="key", left_index=True, how="right", sort=True
)
tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
def test_merge_index_singlekey_inner(self):
left = DataFrame(
{"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
)
right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
# inner join
result = merge(left, right, left_on="key", right_index=True, how="inner")
expected = left.join(right, on="key").loc[result.index]
tm.assert_frame_equal(result, expected)
result = merge(right, left, right_on="key", left_index=True, how="inner")
expected = left.join(right, on="key").loc[result.index]
tm.assert_frame_equal(result, expected.loc[:, result.columns])
def test_merge_misspecified(self):
    """Incomplete or contradictory key arguments raise descriptive errors."""
    merge_error_cases = [
        (
            "Must pass right_on or right_index=True",
            (self.left, self.right),
            {"left_index": True},
        ),
        (
            "Must pass left_on or left_index=True",
            (self.left, self.right),
            {"right_index": True},
        ),
        (
            'Can only pass argument "on" OR "left_on" and "right_on", not '
            "a combination of both",
            (self.left, self.left),
            {"left_on": "key", "on": "key"},
        ),
    ]
    for pattern, frames, kwargs in merge_error_cases:
        with pytest.raises(pd.errors.MergeError, match=pattern):
            merge(*frames, **kwargs)

    length_msg = r"len\(right_on\) must equal len\(left_on\)"
    with pytest.raises(ValueError, match=length_msg):
        merge(self.df, self.df2, left_on=["key1"], right_on=["key1", "key2"])
def test_index_and_on_parameters_confusion(self):
    """Passing a list as ``left_index``/``right_index`` raises ValueError."""
    keys = ["key1", "key2"]
    right_msg = "right_index parameter must be of type bool, not <class 'list'>"
    left_msg = "left_index parameter must be of type bool, not <class 'list'>"

    # (expected message, left_index, right_index); when both are lists the
    # left_index message is the one expected
    scenarios = [
        (right_msg, False, keys),
        (left_msg, keys, False),
        (left_msg, keys, keys),
    ]
    for pattern, left_index, right_index in scenarios:
        with pytest.raises(ValueError, match=pattern):
            merge(
                self.df,
                self.df2,
                how="left",
                left_index=left_index,
                right_index=right_index,
            )
def test_merge_overlap(self):
merged = merge(self.left, self.left, on="key")
exp_len = (self.left["key"].value_counts() ** 2).sum()
assert len(merged) == exp_len
assert "v1_x" in merged
assert "v1_y" in merged
def test_merge_different_column_key_names(self):
left = DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
right = DataFrame({"rkey": ["foo", "bar", "qux", "foo"], "value": [5, 6, 7, 8]})
merged = left.merge(
right, left_on="lkey", right_on="rkey", how="outer", sort=True
)
exp = Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey")
tm.assert_series_equal(merged["lkey"], exp)
exp = Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey")
tm.assert_series_equal(merged["rkey"], exp)
exp = Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x")
tm.assert_series_equal(merged["value_x"], exp)
exp = Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y")
tm.assert_series_equal(merged["value_y"], exp)
def test_merge_copy(self):
left = DataFrame({"a": 0, "b": 1}, index=range(10))
right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))
merged = merge(left, right, left_index=True, right_index=True, copy=True)
merged["a"] = 6
assert (left["a"] == 0).all()
merged["d"] = "peekaboo"
assert (right["d"] == "bar").all()
def test_merge_nocopy(self, using_array_manager):
left = DataFrame({"a": 0, "b": 1}, index=range(10))
right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))
merged = merge(left, right, left_index=True, right_index=True, copy=False)
assert np.shares_memory(merged["a"]._values, left["a"]._values)
assert np.shares_memory(merged["d"]._values, right["d"]._values)
def test_intelligently_handle_join_key(self):
# #733, be a bit more 1337 about not returning unconsolidated DataFrame
left = DataFrame(
{"key": [1, 1, 2, 2, 3], "value": list(range(5))}, columns=["value", "key"]
)
right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": list(range(6))})
joined = merge(left, right, on="key", how="outer")
expected = DataFrame(
{
"key": [1, 1, 1, 1, 2, 2, 3, 4, 5],
"value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]),
"rvalue": [0, 1, 0, 1, 2, 2, 3, 4, 5],
},
columns=["value", "key", "rvalue"],
)
tm.assert_frame_equal(joined, expected)
def test_merge_join_key_dtype_cast(self):
# #8596
df1 = DataFrame({"key": [1], "v1": [10]})
df2 = DataFrame({"key": [2], "v1": [20]})
df = merge(df1, df2, how="outer")
assert df["key"].dtype == "int64"
df1 = DataFrame({"key": [True], "v1": [1]})
df2 = DataFrame({"key": [False], "v1": [0]})
df = merge(df1, df2, how="outer")
# GH13169
# GH#40073
assert df["key"].dtype == "bool"
df1 = DataFrame({"val": [1]})
df2 = DataFrame({"val": [2]})
lkey = np.array([1])
rkey = np.array([2])
df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer")
assert df["key_0"].dtype == "int64"
def test_handle_join_key_pass_array(self):
left = DataFrame(
{"key": [1, 1, 2, 2, 3], "value": np.arange(5)},
columns=["value", "key"],
dtype="int64",
)
right = DataFrame({"rvalue": np.arange(6)}, dtype="int64")
key = np.array([1, 1, 2, 3, 4, 5], dtype="int64")
merged = merge(left, right, left_on="key", right_on=key, how="outer")
merged2 = merge(right, left, left_on=key, right_on="key", how="outer")
tm.assert_series_equal(merged["key"], merged2["key"])
assert merged["key"].notna().all()
assert merged2["key"].notna().all()
left = DataFrame({"value": np.arange(5)}, columns=["value"])
right = DataFrame({"rvalue": np.arange(6)})
lkey = np.array([1, 1, 2, 2, 3])
rkey = np.array([1, 1, 2, 3, 4, 5])
merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer")
tm.assert_series_equal(
merged["key_0"], Series([1, 1, 1, 1, 2, 2, 3, 4, 5], name="key_0")
)
left = DataFrame({"value": np.arange(3)})
right = DataFrame({"rvalue": np.arange(6)})
key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
merged = merge(left, right, left_index=True, right_on=key, how="outer")
tm.assert_series_equal(merged["key_0"], Series(key, name="key_0"))
def test_no_overlap_more_informative_error(self):
    """
    Merging two frames that share no column, without naming any keys,
    raises ``MergeError`` with a message listing the merge options used.
    """
    dt = datetime.now()
    df1 = DataFrame({"x": ["a"]}, index=[dt])
    df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt])

    msg = (
        "No common columns to perform merge on. "
        "Merge options: left_on=None, right_on=None, "
        "left_index=False, right_index=False"
    )
    # ``match`` is a regular expression: escape the message so that "." is
    # matched literally rather than as a wildcard.
    with pytest.raises(MergeError, match=re.escape(msg)):
        merge(df1, df2)
def test_merge_non_unique_indexes(self):
    """Run ``_check_merge`` on frames with repeated datetime index labels."""
    dt, dt2, dt3, dt4 = (datetime(2012, 5, day) for day in (1, 2, 3, 4))

    single = DataFrame({"x": ["a"]}, index=[dt])
    repeated = DataFrame({"y": ["b", "c"]}, index=[dt, dt])
    _check_merge(single, repeated)

    # Not monotonic
    unordered = DataFrame({"x": ["a", "b", "q"]}, index=[dt2, dt, dt4])
    unordered_repeated = DataFrame(
        {"y": ["c", "d", "e", "f", "g", "h"]}, index=[dt3, dt3, dt2, dt2, dt, dt]
    )
    _check_merge(unordered, unordered_repeated)

    same_left = DataFrame({"x": ["a", "b"]}, index=[dt, dt])
    same_right = DataFrame({"y": ["c", "d"]}, index=[dt, dt])
    _check_merge(same_left, same_right)
def test_merge_non_unique_index_many_to_many(self):
    """Run ``_check_merge`` when both frames repeat index labels."""
    dt, dt2, dt3 = (datetime(2012, 5, day) for day in (1, 2, 3))

    left_index = [dt2, dt2, dt, dt]
    right_index = [dt2, dt2, dt3, dt, dt]
    left = DataFrame({"x": ["a", "b", "c", "d"]}, index=left_index)
    right = DataFrame({"y": ["e", "f", "g", " h", "i"]}, index=right_index)
    _check_merge(left, right)
def test_left_merge_empty_dataframe(self):
left = DataFrame({"key": [1], "value": [2]})
right = DataFrame({"key": []})
result = merge(left, right, on="key", how="left")
tm.assert_frame_equal(result, left)
result = merge(right, left, on="key", how="right")
tm.assert_frame_equal(result, left)
@pytest.mark.parametrize(
    "kwarg",
    [
        {"left_index": True, "right_index": True},
        {"left_index": True, "right_on": "x"},
        {"left_on": "a", "right_index": True},
        {"left_on": "a", "right_on": "x"},
    ],
)
def test_merge_left_empty_right_empty(self, join_type, kwarg):
    """Merging two empty frames yields an empty object-dtype frame."""
    # GH 10824
    left_cols = ["a", "b", "c"]
    right_cols = ["x", "y", "z"]

    expected = DataFrame(
        columns=left_cols + right_cols,
        index=pd.Index([], dtype=object),
        dtype=object,
    )
    result = merge(
        DataFrame(columns=left_cols),
        DataFrame(columns=right_cols),
        how=join_type,
        **kwarg,
    )
    tm.assert_frame_equal(result, expected)
def test_merge_left_empty_right_notempty(self):
# GH 10824
left = DataFrame(columns=["a", "b", "c"])
right = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"])
exp_out = DataFrame(
{
"a": np.array([np.nan] * 3, dtype=object),
"b": np.array([np.nan] * 3, dtype=object),
"c": np.array([np.nan] * 3, dtype=object),
"x": [1, 4, 7],
"y": [2, 5, 8],
"z": [3, 6, 9],
},
columns=["a", "b", "c", "x", "y", "z"],
)
exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
# result will have object dtype
exp_in.index = exp_in.index.astype(object)
def check1(exp, kwarg):
result = merge(left, right, how="inner", **kwarg)
tm.assert_frame_equal(result, exp)
result = merge(left, right, how="left", **kwarg)
tm.assert_frame_equal(result, exp)
def check2(exp, kwarg):
result = merge(left, right, how="right", **kwarg)
tm.assert_frame_equal(result, exp)
result = merge(left, right, how="outer", **kwarg)
tm.assert_frame_equal(result, exp)
for kwarg in [
{"left_index": True, "right_index": True},
{"left_index": True, "right_on": "x"},
]:
check1(exp_in, kwarg)
check2(exp_out, kwarg)
kwarg = {"left_on": "a", "right_index": True}
check1(exp_in, kwarg)
exp_out["a"] = [0, 1, 2]
check2(exp_out, kwarg)
kwarg = {"left_on": "a", "right_on": "x"}
check1(exp_in, kwarg)
exp_out["a"] = np.array([np.nan] * 3, dtype=object)
check2(exp_out, kwarg)
def test_merge_left_notempty_right_empty(self):
    """
    Set up a populated left frame, an empty right frame and the expected
    merge results, together with two checking helpers.
    """
    # GH 10824
    left = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
    right = DataFrame(columns=["x", "y", "z"])
    exp_out = DataFrame(
        {
            "a": [1, 4, 7],
            "b": [2, 5, 8],
            "c": [3, 6, 9],
            "x": np.array([np.nan] * 3, dtype=object),
            "y": np.array([np.nan] * 3, dtype=object),
            "z": np.array([np.nan] * 3, dtype=object),
        },
        columns=["a", "b", "c", "x", "y", "z"],
    )
    exp_in = exp_out[0:0]  # make empty DataFrame keeping dtype
    # result will have object dtype
    exp_in.index = exp_in.index.astype(object)

    # Compare the "inner" and "right" merges with ``exp``.
    def check1(exp, kwarg):
        result = merge(left, right, how="inner", **kwarg)
        tm.assert_frame_equal(result, exp)
        result = merge(left, right, how="right", **kwarg)
        tm.assert_frame_equal(result, exp)

    # Compare the "left" and "outer" merges with ``exp``.
    def check2(exp, kwarg):
        result = merge(left, right, how="left", **kwarg)
        tm.assert_frame_equal(result, exp)
        result = merge(left, right, how="outer", **kwarg)
        tm.assert_frame_equal(result, exp)

        # NOTE(review): indentation was lost in this copy of the file. The
        # TODO below says the loop is indented and that un-indenting it
        # breaks the test, so it is placed inside ``check2`` here; with that
        # placement nothing outside ``check2`` calls either helper. Confirm
        # against the upstream file.
        # TODO: should the next loop be un-indented? doing so breaks this test
        for kwarg in [
            {"left_index": True, "right_index": True},
            {"left_index": True, "right_on": "x"},
            {"left_on": "a", "right_index": True},
            {"left_on": "a", "right_on": "x"},
        ]:
            check1(exp_in, kwarg)
            check2(exp_out, kwarg)
def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
# GH 25183
df = DataFrame(
{"key": series_of_dtype, "value": series_of_dtype2},
columns=["key", "value"],
)
df_empty = df[:0]
expected = DataFrame(
{
"value_x": Series(dtype=df.dtypes["value"]),
"key": Series(dtype=df.dtypes["key"]),
"value_y": Series(dtype=df.dtypes["value"]),
},
columns=["value_x", "key", "value_y"],
)
actual = df_empty.merge(df, on="key")
tm.assert_frame_equal(actual, expected)
def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na):
# GH 25183
df_left = DataFrame(
{"key": series_of_dtype, "value": series_of_dtype_all_na},
columns=["key", "value"],
)
df_right = DataFrame(
{"key": series_of_dtype, "value": series_of_dtype_all_na},
columns=["key", "value"],
)
expected = DataFrame(
{
"key": series_of_dtype,
"value_x": series_of_dtype_all_na,
"value_y": series_of_dtype_all_na,
},
columns=["key", "value_x", "value_y"],
)
actual = df_left.merge(df_right, on="key")
tm.assert_frame_equal(actual, expected)
def test_merge_nosort(self):
# GH#2098
d = {
"var1": np.random.randint(0, 10, size=10),
"var2": np.random.randint(0, 10, size=10),
"var3": [
datetime(2012, 1, 12),
datetime(2011, 2, 4),
datetime(2010, 2, 3),
datetime(2012, 1, 12),
datetime(2011, 2, 4),
datetime(2012, 4, 3),
datetime(2012, 3, 4),
datetime(2008, 5, 1),
datetime(2010, 2, 3),
datetime(2012, 2, 3),
],
}
df = DataFrame.from_dict(d)
var3 = df.var3.unique()
var3.sort()
new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)})
result = df.merge(new, on="var3", sort=False)
exp = merge(df, new, on="var3", sort=False)
tm.assert_frame_equal(result, exp)
assert (df.var3.unique() == result.var3.unique()).all()
@pytest.mark.parametrize(
    ("sort", "values"), [(False, [1, 1, 0, 1, 1]), (True, [0, 1, 1, 1, 1])]
)
@pytest.mark.parametrize("how", ["left", "right"])
def test_merge_same_order_left_right(self, sort, values, how):
    """Left and right self-merges produce the same row order."""
    # GH#35382
    frame = DataFrame({"a": [1, 0, 1]})
    expected = DataFrame(values, columns=["a"])
    result = frame.merge(frame, on="a", how=how, sort=sort)
    tm.assert_frame_equal(result, expected)
def test_merge_nan_right(self):
df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
df2 = DataFrame({"i1": [0], "i3": [0]})
result = df1.join(df2, on="i1", rsuffix="_")
expected = (
DataFrame(
{
"i1": {0: 0.0, 1: 1},
"i2": {0: 0, 1: 1},
"i1_": {0: 0, 1: np.nan},
"i3": {0: 0.0, 1: np.nan},
None: {0: 0, 1: 0},
}
)
.set_index(None)
.reset_index()[["i1", "i2", "i1_", "i3"]]
)
tm.assert_frame_equal(result, expected, check_dtype=False)
def test_merge_nan_right2(self):
df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
df2 = DataFrame({"i1": [0], "i3": [0.7]})
result = df1.join(df2, rsuffix="_", on="i1")
expected = DataFrame(
{
"i1": {0: 0, 1: 1},
"i1_": {0: 0.0, 1: np.nan},
"i2": {0: 0.5, 1: 1.5},
"i3": {0: 0.69999999999999996, 1: np.nan},
}
)[["i1", "i2", "i1_", "i3"]]
tm.assert_frame_equal(result, expected)
def test_merge_type(self):
class NotADataFrame(DataFrame):
@property
def _constructor(self):
return NotADataFrame
nad = NotADataFrame(self.df)
result = nad.merge(self.df2, on="key1")
assert isinstance(result, NotADataFrame)
def test_join_append_timedeltas(self, using_array_manager):
# timedelta64 issues with join/merge
# GH 5695
d = DataFrame.from_dict(
{"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]}
)
df = DataFrame(columns=list("dt"))
df = concat([df, d], ignore_index=True)
result = concat([df, d], ignore_index=True)
expected = DataFrame(
{
"d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
"t": [timedelta(0, 22500), timedelta(0, 22500)],
}
)
if using_array_manager:
# TODO(ArrayManager) decide on exact casting rules in concat
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)
def test_join_append_timedeltas2(self):
# timedelta64 issues with join/merge
# GH 5695
td = np.timedelta64(300000000)
lhs = DataFrame(Series([td, td], index=["A", "B"]))
rhs = DataFrame(Series([td], index=["A"]))
result = lhs.join(rhs, rsuffix="r", how="left")
expected = DataFrame(
{
"0": Series([td, td], index=list("AB")),
"0r": Series([td, pd.NaT], index=list("AB")),
}
)
tm.assert_frame_equal(result, expected)
def test_other_datetime_unit(self):
# GH 13389
df1 = DataFrame({"entity_id": [101, 102]})
s = Series([None, None], index=[101, 102], name="days")
for dtype in [
"datetime64[D]",
"datetime64[h]",
"datetime64[m]",
"datetime64[s]",
"datetime64[ms]",
"datetime64[us]",
"datetime64[ns]",
]:
df2 = s.astype(dtype).to_frame("days")
# coerces to datetime64[ns], thus should not be affected
assert df2["days"].dtype == "datetime64[ns]"
result = df1.merge(df2, left_on="entity_id", right_index=True)
exp = DataFrame(
{
"entity_id": [101, 102],
"days": np.array(["nat", "nat"], dtype="datetime64[ns]"),
},
columns=["entity_id", "days"],
)
tm.assert_frame_equal(result, exp)
@pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"])
def test_other_timedelta_unit(self, unit):
    """Merge against an all-NaT timedelta column built from ``unit``."""
    # GH 13389
    dtype = f"m8[{unit}]"
    entity_ids = [101, 102]
    df1 = DataFrame({"entity_id": entity_ids})
    missing = Series([None, None], index=entity_ids, name="days")

    df2 = missing.astype(dtype).to_frame("days")
    assert df2["days"].dtype == "m8[ns]"

    exp = DataFrame(
        {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)},
        columns=["entity_id", "days"],
    )
    result = df1.merge(df2, left_on="entity_id", right_index=True)
    tm.assert_frame_equal(result, exp)
def test_overlapping_columns_error_message(self):
    """
    Duplicate column labels merge fine when they do not collide across
    frames, and raise ``MergeError`` when they do.
    """
    source = {"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]}
    df = DataFrame(source)
    df2 = DataFrame(source)
    df.columns = ["key", "foo", "foo"]
    df2.columns = ["key", "bar", "bar"]

    expected = DataFrame(
        {
            "key": [1, 2, 3],
            "v1": [4, 5, 6],
            "v2": [7, 8, 9],
            "v3": [4, 5, 6],
            "v4": [7, 8, 9],
        }
    )
    expected.columns = ["key", "foo", "foo", "bar", "bar"]
    merged = merge(df, df2)
    tm.assert_frame_equal(merged, expected)

    # #2649, #10639
    df2.columns = ["key1", "foo", "foo"]
    pattern = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)"
    with pytest.raises(MergeError, match=pattern):
        merge(df, df2)
def test_merge_on_datetime64tz(self):
# GH11405
left = DataFrame(
{
"key": pd.date_range("20151010", periods=2, tz="US/Eastern"),
"value": [1, 2],
}
)
right = DataFrame(
{
"key": pd.date_range("20151011", periods=3, tz="US/Eastern"),
"value": [1, 2, 3],
}
)
expected = DataFrame(
{
"key": pd.date_range("20151010", periods=4, tz="US/Eastern"),
"value_x": [1, 2, np.nan, np.nan],
"value_y": [np.nan, 1, 2, 3],
}
)
result = merge(left, right, on="key", how="outer")
tm.assert_frame_equal(result, expected)
def test_merge_datetime64tz_values(self):
left = DataFrame(
{
"key": [1, 2],
"value": pd.date_range("20151010", periods=2, tz="US/Eastern"),
}
)
right = DataFrame(
{
"key": [2, 3],
"value": pd.date_range("20151011", periods=2, tz="US/Eastern"),
}
)
expected = DataFrame(
{
"key": [1, 2, 3],
"value_x": list(pd.date_range("20151010", periods=2, tz="US/Eastern"))
+ [pd.NaT],
"value_y": [pd.NaT]
+ list(pd.date_range("20151011", periods=2, tz="US/Eastern")),
}
)
result = merge(left, right, on="key", how="outer")
tm.assert_frame_equal(result, expected)
assert result["value_x"].dtype == "datetime64[ns, US/Eastern]"
assert result["value_y"].dtype == "datetime64[ns, US/Eastern]"
def test_merge_on_datetime64tz_empty(self):
# https://github.com/pandas-dev/pandas/issues/25014
dtz = pd.DatetimeTZDtype(tz="UTC")
right = DataFrame(
{
"date": [pd.Timestamp("2018", tz=dtz.tz)],
"value": [4.0],
"date2": [pd.Timestamp("2019", tz=dtz.tz)],
},
columns=["date", "value", "date2"],
)
left = right[:0]
result = left.merge(right, on="date")
expected = DataFrame(
{
"value_x": Series(dtype=float),
"date2_x": Series(dtype=dtz),
"date": Series(dtype=dtz),
"value_y": Series(dtype=float),
"date2_y": Series(dtype=dtz),
},
columns=["value_x", "date2_x", "date", "value_y", "date2_y"],
)
tm.assert_frame_equal(result, expected)
def test_merge_datetime64tz_with_dst_transition(self):
# GH 18885
df1 = DataFrame(
pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"),
columns=["date"],
)
df1["value"] = 1
df2 = DataFrame(
{
"date": pd.to_datetime(
[
"2017-10-29 03:00:00",
"2017-10-29 04:00:00",
"2017-10-29 05:00:00",
]
),
"value": 2,
}
)
df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid")
result = merge(df1, df2, how="outer", on="date")
expected = DataFrame(
{
"date": pd.date_range(
"2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid"
),
"value_x": [1] * 4 + [np.nan] * 3,
"value_y": [np.nan] * 4 + [2] * 3,
}
)
tm.assert_frame_equal(result, expected)
def test_merge_non_unique_period_index(self):
# GH #16871
index = pd.period_range("2016-01-01", periods=16, freq="M")
df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
df2 = concat([df, df])
result = df.merge(df2, left_index=True, right_index=True, how="inner")
expected = DataFrame(
np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
columns=["pnum_x", "pnum_y"],
index=df2.sort_index().index,
)
tm.assert_frame_equal(result, expected)
def test_merge_on_periods(self):
left = DataFrame(
{"key": pd.period_range("20151010", periods=2, freq="D"), "value": [1, 2]}
)
right = DataFrame(
{
"key": pd.period_range("20151011", periods=3, freq="D"),
"value": [1, 2, 3],
}
)
expected = DataFrame(
{
"key": pd.period_range("20151010", periods=4, freq="D"),
"value_x": [1, 2, np.nan, np.nan],
"value_y": [np.nan, 1, 2, 3],
}
)
result = merge(left, right, on="key", how="outer")
tm.assert_frame_equal(result, expected)
def test_merge_period_values(self):
left = DataFrame(
{"key": [1, 2], "value": pd.period_range("20151010", periods=2, freq="D")}
)
right = DataFrame(
{"key": [2, 3], "value": pd.period_range("20151011", periods=2, freq="D")}
)
exp_x = pd.period_range("20151010", periods=2, freq="D")
exp_y = pd.period_range("20151011", periods=2, freq="D")
expected = DataFrame(
{
"key": [1, 2, 3],
"value_x": list(exp_x) + [pd.NaT],
"value_y": [pd.NaT] + list(exp_y),
}
)
result = merge(left, right, on="key", how="outer")
tm.assert_frame_equal(result, expected)
assert result["value_x"].dtype == "Period[D]"
assert result["value_y"].dtype == "Period[D]"
def test_indicator(self, dfs_for_indicator):
# PR #10054. xref #7412 and closes #8790.
df1, df2 = dfs_for_indicator
df1_copy = df1.copy()
df2_copy = df2.copy()
df_result = DataFrame(
{
"col1": [0, 1, 2, 3, 4, 5],
"col_conflict_x": [1, 2, np.nan, np.nan, np.nan, np.nan],
"col_left": ["a", "b", np.nan, np.nan, np.nan, np.nan],
"col_conflict_y": [np.nan, 1, 2, 3, 4, 5],
"col_right": [np.nan, 2, 2, 2, 2, 2],
}
)
df_result["_merge"] = Categorical(
[
"left_only",
"both",
"right_only",
"right_only",
"right_only",
"right_only",
],
categories=["left_only", "right_only", "both"],
)
df_result = df_result[
[
"col1",
"col_conflict_x",
"col_left",
"col_conflict_y",
"col_right",
"_merge",
]
]
test = merge(df1, df2, on="col1", how="outer", indicator=True)
tm.assert_frame_equal(test, df_result)
test = df1.merge(df2, on="col1", how="outer", indicator=True)
tm.assert_frame_equal(test, df_result)
# No side effects
tm.assert_frame_equal(df1, df1_copy)
tm.assert_frame_equal(df2, df2_copy)
# Check with custom name
df_result_custom_name = df_result
df_result_custom_name = df_result_custom_name.rename(
columns={"_merge": "custom_name"}
)
test_custom_name = merge(
df1, df2, on="col1", how="outer", indicator="custom_name"
)
tm.assert_frame_equal(test_custom_name, df_result_custom_name)
test_custom_name = df1.merge(
df2, on="col1", how="outer", indicator="custom_name"
)
tm.assert_frame_equal(test_custom_name, df_result_custom_name)
def test_merge_indicator_arg_validation(self, dfs_for_indicator):
    """A non-bool, non-string ``indicator`` raises ValueError."""
    # Check only accepts strings and booleans
    left, right = dfs_for_indicator
    pattern = "indicator option can only accept boolean or string arguments"
    options = {"on": "col1", "how": "outer", "indicator": 5}

    with pytest.raises(ValueError, match=pattern):
        merge(left, right, **options)
    with pytest.raises(ValueError, match=pattern):
        left.merge(right, **options)
def test_merge_indicator_result_integrity(self, dfs_for_indicator):
# Check result integrity
df1, df2 = dfs_for_indicator
test2 = merge(df1, df2, on="col1", how="left", indicator=True)
assert (test2._merge != "right_only").all()
test2 = df1.merge(df2, on="col1", how="left", indicator=True)
assert (test2._merge != "right_only").all()
test3 = merge(df1, df2, on="col1", how="right", indicator=True)
assert (test3._merge != "left_only").all()
test3 = df1.merge(df2, on="col1", how="right", indicator=True)
assert (test3._merge != "left_only").all()
test4 = merge(df1, df2, on="col1", how="inner", indicator=True)
assert (test4._merge == "both").all()
test4 = df1.merge(df2, on="col1", how="inner", indicator=True)
assert (test4._merge == "both").all()
def test_merge_indicator_invalid(self, dfs_for_indicator):
    """An indicator name that clashes with an existing column raises."""
    # Check if working name in df
    left, _ = dfs_for_indicator

    for reserved in ["_right_indicator", "_left_indicator", "_merge"]:
        clashing = DataFrame({"col1": [1, 2], reserved: [2, 2]})
        # either of the two messages is accepted (regex alternation)
        pattern = (
            "Cannot use `indicator=True` option when data contains a "
            f"column named {reserved}|"
            "Cannot use name of an existing column for indicator column"
        )
        with pytest.raises(ValueError, match=pattern):
            merge(left, clashing, on="col1", how="outer", indicator=True)
        with pytest.raises(ValueError, match=pattern):
            left.merge(clashing, on="col1", how="outer", indicator=True)

    # Check for name conflict with custom name
    clashing = DataFrame({"col1": [1, 2], "custom_column_name": [2, 2]})
    pattern = "Cannot use name of an existing column for indicator column"
    options = {"on": "col1", "how": "outer", "indicator": "custom_column_name"}
    with pytest.raises(ValueError, match=pattern):
        merge(left, clashing, **options)
    with pytest.raises(ValueError, match=pattern):
        left.merge(clashing, **options)
def test_merge_indicator_multiple_columns(self):
# Merge on multiple columns
df3 = DataFrame({"col1": [0, 1], "col2": ["a", "b"]})
df4 = DataFrame({"col1": [1, 1, 3], "col2": ["b", "x", "y"]})
hand_coded_result = DataFrame(
{"col1": [0, 1, 1, 3], "col2": ["a", "b", "x", "y"]}
)
hand_coded_result["_merge"] = Categorical(
["left_only", "both", "right_only", "right_only"],
categories=["left_only", "right_only", "both"],
)
test5 = merge(df3, df4, on=["col1", "col2"], how="outer", indicator=True)
tm.assert_frame_equal(test5, hand_coded_result)
test5 = df3.merge(df4, on=["col1", "col2"], how="outer", indicator=True)
tm.assert_frame_equal(test5, hand_coded_result)
def test_validation(self):
    """
    Exercise the ``validate`` argument of ``merge``: accepted spellings,
    absence of side effects on the inputs, and the errors raised when the
    keys are duplicated on one or both sides.
    """
    letters = ["a", "b", "c", "d"]
    animals = ["cat", "dog", "weasel", "horse"]
    sounds = ["meow", "bark", "um... weasel noise?", "nay"]
    by_index = {"left_index": True, "right_index": True}

    left = DataFrame({"a": letters, "b": animals}, index=range(4))
    right = DataFrame(
        {"a": letters + ["e"], "c": sounds + ["chirp"]},
        index=range(5),
    )

    # Make sure no side effects.
    left_before = left.copy()
    right_before = right.copy()

    def assert_inputs_untouched():
        tm.assert_frame_equal(left, left_before)
        tm.assert_frame_equal(right, right_before)

    merge(left, right, validate="1:1", **by_index)
    assert_inputs_untouched()

    # make sure merge still correct
    expected_by_index = DataFrame(
        {"a_x": letters, "b": animals, "a_y": letters, "c": sounds},
        index=range(4),
        columns=["a_x", "b", "a_y", "c"],
    )
    result = merge(left, right, validate="one_to_one", **by_index)
    tm.assert_frame_equal(result, expected_by_index)

    expected_on_a = DataFrame(
        {"a": letters, "b": animals, "c": sounds},
        index=range(4),
    )
    result = merge(left, right, on="a", validate="1:1")
    assert_inputs_untouched()
    tm.assert_frame_equal(result, expected_on_a)

    result = merge(left, right, on="a", validate="one_to_one")
    tm.assert_frame_equal(result, expected_on_a)

    # One index, one column
    expected_mixed = DataFrame(
        {"b": animals, "a": letters, "c": sounds},
        columns=["b", "a", "c"],
        index=range(4),
    )
    left_indexed_by_a = left.set_index("a")
    result = merge(
        left_indexed_by_a,
        right,
        left_index=True,
        right_on="a",
        validate="one_to_one",
    )
    tm.assert_frame_equal(result, expected_mixed)

    # Dups on right
    right_w_dups = concat([right, DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])])
    merge(left, right_w_dups, validate="one_to_many", **by_index)

    msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=msg):
        merge(left, right_w_dups, validate="one_to_one", **by_index)
    with pytest.raises(MergeError, match=msg):
        merge(left, right_w_dups, on="a", validate="one_to_one")

    # Dups on left
    left_w_dups = concat(
        [left, DataFrame({"a": ["a"], "c": ["cow"]}, index=[3])], sort=True
    )
    merge(left_w_dups, right, validate="many_to_one", **by_index)

    msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=msg):
        merge(left_w_dups, right, validate="one_to_one", **by_index)
    with pytest.raises(MergeError, match=msg):
        merge(left_w_dups, right, on="a", validate="one_to_one")

    # Dups on both
    merge(left_w_dups, right_w_dups, on="a", validate="many_to_many")

    msg = "Merge keys are not unique in right dataset; not a many-to-one merge"
    with pytest.raises(MergeError, match=msg):
        merge(left_w_dups, right_w_dups, validate="many_to_one", **by_index)

    msg = "Merge keys are not unique in left dataset; not a one-to-many merge"
    with pytest.raises(MergeError, match=msg):
        merge(left_w_dups, right_w_dups, on="a", validate="one_to_many")

    # Check invalid arguments
    msg = "Not a valid argument for validate"
    with pytest.raises(ValueError, match=msg):
        merge(left, right, on="a", validate="jibberish")

    # Two column merge, dups in both, but jointly no dups.
    left_two_keys = DataFrame(
        {
            "a": ["a", "a", "b", "b"],
            "b": [0, 1, 0, 1],
            "c": ["cat", "dog", "weasel", "horse"],
        },
        index=range(4),
    )
    right_two_keys = DataFrame(
        {
            "a": ["a", "a", "b"],
            "b": [0, 1, 0],
            "d": ["meow", "bark", "um... weasel noise?"],
        },
        index=range(3),
    )
    expected_multi = DataFrame(
        {
            "a": ["a", "a", "b"],
            "b": [0, 1, 0],
            "c": ["cat", "dog", "weasel"],
            "d": ["meow", "bark", "um... weasel noise?"],
        },
        index=range(3),
    )

    msg = (
        "Merge keys are not unique in either left or right dataset; "
        "not a one-to-one merge"
    )
    with pytest.raises(MergeError, match=msg):
        merge(left_two_keys, right_two_keys, on="a", validate="1:1")

    result = merge(left_two_keys, right_two_keys, on=["a", "b"], validate="1:1")
    tm.assert_frame_equal(result, expected_multi)
def test_merge_two_empty_df_no_division_error(self):
# GH17776, PR #17846
a = DataFrame({"a": [], "b": [], "c": []})
with np.errstate(divide="raise"):
merge(a, a, on=("a", "b"))
@pytest.mark.parametrize("how", ["right", "outer"])
@pytest.mark.parametrize(
    "index,expected_index",
    [
        (
            CategoricalIndex([1, 2, 4]),
            CategoricalIndex([1, 2, 4] + [None] * 3),
        ),
        (
            DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]),
            DatetimeIndex(
                ["2001-01-01", "2002-02-02", "2003-03-03"] + [pd.NaT] * 3
            ),
        ),
        (Float64Index([1, 2, 3]), Float64Index([1, 2, 3] + [None] * 3)),
        (Int64Index([1, 2, 3]), Float64Index([1, 2, 3] + [None] * 3)),
        (
            IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
            IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)] + [np.nan] * 3),
        ),
        (
            PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"),
            PeriodIndex(
                ["2001-01-01", "2001-01-02", "2001-01-03"] + [pd.NaT] * 3,
                freq="D",
            ),
        ),
        (
            TimedeltaIndex(["1d", "2d", "3d"]),
            TimedeltaIndex(["1d", "2d", "3d"] + [pd.NaT] * 3),
        ),
    ],
)
def test_merge_on_index_with_more_values(self, how, index, expected_index):
    """Right/outer merge where the right frame has more rows than the left.

    The three unmatched right rows must get a missing value in the left
    index for every index type in the parametrization.
    """
    # GH 24212
    # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
    # -1 is interpreted as a missing value instead of the last element
    left_frame = DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
    right_frame = DataFrame({"b": [0, 1, 2, 3, 4, 5]})
    result = left_frame.merge(
        right_frame, left_on="key", right_index=True, how=how
    )

    # Rows 0-2 match on both sides; rows 3-5 exist only on the right.
    matched = [[i, i, i] for i in range(3)]
    right_only = [[np.nan, i, i] for i in range(3, 6)]
    expected = DataFrame(
        matched + right_only, columns=["a", "key", "b"]
    ).set_index(expected_index)
    tm.assert_frame_equal(result, expected)
def test_merge_right_index_right(self):
# Note: the expected output here is probably incorrect.
# See https://github.com/pandas-dev/pandas/issues/17257 for more.
# We include this as a regression test for GH-24897.
left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]})
right = DataFrame({"b": [1, 2, 3]})
expected = DataFrame(
{"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]},
columns=["a", "key", "b"],
index=[0, 1, 2, np.nan],
)
result = left.merge(right, left_on="key", right_index=True, how="right")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("how", ["left", "right"])
def test_merge_preserves_row_order(self, how):
    """A left/right merge returns rows in the order of the preserved side."""
    # GH 27453
    data_by_side = {
        "left": {"animal": ["dog", "pig"], "max_speed": [40, 11]},
        "right": {"animal": ["quetzal", "pig"], "max_speed": [80, 11]},
    }
    left_df = DataFrame(data_by_side["left"])
    right_df = DataFrame(data_by_side["right"])

    result = left_df.merge(right_df, on=["animal", "max_speed"], how=how)

    # The expected frame is simply the data of the side named by ``how``.
    expected = DataFrame(data_by_side[how])
    tm.assert_frame_equal(result, expected)
def test_merge_take_missing_values_from_index_of_other_dtype(self):
# GH 24212
left = DataFrame(
{
"a": [1, 2, 3],
"key": Categorical(["a", "a", "b"], categories=list("abc")),
}
)
right = DataFrame({"b": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"]))
result = left.merge(right, left_on="key", right_index=True, how="right")
expected = DataFrame(
{
"a": [1, 2, 3, None],
"key": Categorical(["a", "a", "b", "c"]),
"b": [1, 1, 2, 3],
},
index=[0, 1, 2, np.nan],
)
expected = expected.reindex(columns=["a", "key", "b"])
tm.assert_frame_equal(result, expected)
def test_merge_readonly(self):
# https://github.com/pandas-dev/pandas/issues/27943
data1 = DataFrame(
np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
)
data2 = DataFrame(
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
)
# make each underlying block array / column array read-only
for arr in data1._mgr.arrays:
arr.flags.writeable = False
data1.merge(data2) # no error
def _check_merge(x, y):
    """Assert that ``x.join(y)`` matches a sorted ``merge`` of the reset frames.

    The comparison is made for the "inner", "left" and "outer" joins, and
    index/column names are ignored.
    """
    # The flattened frames do not depend on the join type; build them once.
    x_flat = x.reset_index()
    y_flat = y.reset_index()
    for join_kind in ("inner", "left", "outer"):
        joined = x.join(y, how=join_kind)
        merged = merge(x_flat, y_flat, how=join_kind, sort=True).set_index("index")
        # TODO check_names on merge?
        tm.assert_frame_equal(joined, merged, check_names=False)
class TestMergeDtypes:
    """Tests for the dtypes of merge keys and of the merged result."""

    @pytest.mark.parametrize(
        "right_vals", [["foo", "bar"], Series(["foo", "bar"]).astype("category")]
    )
    def test_different(self, right_vals):
        """An object key merged with an object or categorical key is object."""
        foo_bar_categorical = Series(["foo", "bar"]).astype("category")
        left = DataFrame(
            {
                "A": ["foo", "bar"],
                "B": foo_bar_categorical,
                "C": [1, 2],
                "D": [1.0, 2.0],
                "E": Series([1, 2], dtype="uint64"),
                "F": Series([1, 2], dtype="int32"),
            }
        )
        right = DataFrame({"A": right_vals})

        # GH 9780
        # We allow merging on object and categorical cols and cast
        # categorical cols to object
        merged = merge(left, right, on="A")
        assert is_object_dtype(merged["A"].dtype)

    @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8])
    @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16])
    def test_join_multi_dtypes(self, d1, d2):
        """Join on two keys for each combination of key and value dtypes."""
        key_dtype = np.dtype(d1)
        value_dtype = np.dtype(d2)

        left = DataFrame(
            {
                "k1": np.array([0, 1, 2] * 8, dtype=key_dtype),
                "k2": ["foo", "bar"] * 12,
                "v": np.arange(24, dtype=np.int64),
            }
        )
        right = DataFrame(
            {"v2": np.array([5, 7], dtype=value_dtype)},
            index=MultiIndex.from_tuples([(2, "bar"), (1, "foo")]),
        )

        result = left.join(right, on=["k1", "k2"])

        # Integer values are expected as float64 in the joined column.
        expected_dtype = (
            np.dtype("float64") if value_dtype.kind == "i" else value_dtype
        )
        expected = left.copy()
        expected["v2"] = np.array(np.nan, dtype=expected_dtype)
        is_2_bar = (expected["k1"] == 2) & (expected["k2"] == "bar")
        expected.loc[is_2_bar, "v2"] = 5
        is_1_foo = (expected["k1"] == 1) & (expected["k2"] == "foo")
        expected.loc[is_1_foo, "v2"] = 7
        tm.assert_frame_equal(result, expected)

        result = left.join(right, on=["k1", "k2"], sort=True)
        expected = expected.sort_values(["k1", "k2"], kind="mergesort")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "int_vals, float_vals, exp_vals",
        [
            ([1, 2, 3], [1.0, 2.0, 3.0], {"X": [1, 2, 3], "Y": [1.0, 2.0, 3.0]}),
            ([1, 2, 3], [1.0, 3.0], {"X": [1, 3], "Y": [1.0, 3.0]}),
            ([1, 2], [1.0, 2.0, 3.0], {"X": [1, 2], "Y": [1.0, 2.0]}),
        ],
    )
    def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals):
        """Merging an int key with a float key keeps both numeric dtypes."""
        # GH 16572
        # Check that float column is not cast to object if
        # merging on float and int columns
        int_frame = DataFrame({"X": int_vals})
        float_frame = DataFrame({"Y": float_vals})
        expected = DataFrame(exp_vals)

        int_first = int_frame.merge(float_frame, left_on="X", right_on="Y")
        tm.assert_frame_equal(int_first, expected)

        float_first = float_frame.merge(int_frame, left_on="Y", right_on="X")
        tm.assert_frame_equal(float_first, expected[["Y", "X"]])

    def test_merge_key_dtype_cast(self):
        """Left merge of a float key with an int key keeps the float key."""
        # GH 17044
        column_order = ["key", "v1", "v2"]
        df1 = DataFrame({"key": [1.0, 2.0], "v1": [10, 20]}, columns=column_order[:2])
        df2 = DataFrame({"key": [2], "v2": [200]}, columns=["key", "v2"])

        result = df1.merge(df2, on="key", how="left")

        expected = DataFrame(
            {"key": [1.0, 2.0], "v1": [10, 20], "v2": [np.nan, 200.0]},
            columns=column_order,
        )
        tm.assert_frame_equal(result, expected)

    def test_merge_on_ints_floats_warning(self):
        """Warn when float keys are not exactly equal to their int values."""
        # GH 16572
        # merge will produce a warning when merging on int and
        # float columns where the float values are not exactly
        # equal to their int representation
        int_frame = DataFrame({"X": [1, 2, 3]})
        float_frame = DataFrame({"Y": [1.1, 2.5, 3.0]})
        expected = DataFrame({"X": [3], "Y": [3.0]})

        with tm.assert_produces_warning(UserWarning):
            result = int_frame.merge(float_frame, left_on="X", right_on="Y")
            tm.assert_frame_equal(result, expected)

        with tm.assert_produces_warning(UserWarning):
            result = float_frame.merge(int_frame, left_on="Y", right_on="X")
            tm.assert_frame_equal(result, expected[["Y", "X"]])

        # test no warning if float has NaNs
        float_frame = DataFrame({"Y": [np.nan, np.nan, 3.0]})

        with tm.assert_produces_warning(None):
            result = float_frame.merge(int_frame, left_on="Y", right_on="X")
            tm.assert_frame_equal(result, expected[["Y", "X"]])

    def test_merge_incompat_infer_boolean_object(self):
        """Object-dtype booleans merge with bool keys in either order."""
        # GH21119: bool + object bool merge OK
        object_bools = DataFrame({"key": Series([True, False], dtype=object)})
        plain_bools = DataFrame({"key": [True, False]})
        expected = DataFrame({"key": [True, False]}, dtype=object)

        for lhs, rhs in ((object_bools, plain_bools), (plain_bools, object_bools)):
            result = merge(lhs, rhs, on="key")
            tm.assert_frame_equal(result, expected)

    def test_merge_incompat_infer_boolean_object_with_missing(self):
        """Same as the object/bool merge, with a missing value on one side."""
        # GH21119: bool + object bool merge OK
        # with missing value
        object_bools = DataFrame(
            {"key": Series([True, False, np.nan], dtype=object)}
        )
        plain_bools = DataFrame({"key": [True, False]})
        expected = DataFrame({"key": [True, False]}, dtype=object)

        for lhs, rhs in ((object_bools, plain_bools), (plain_bools, object_bools)):
            result = merge(lhs, rhs, on="key")
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "df1_vals, df2_vals",
        [
            # merge on category coerces to object
            ([0, 1, 2], Series(["a", "b", "a"]).astype("category")),
            ([0.0, 1.0, 2.0], Series(["a", "b", "a"]).astype("category")),
            # no not infer
            ([0, 1], Series([False, True], dtype=object)),
            ([0, 1], Series([False, True], dtype=bool)),
        ],
    )
    def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
        """Allowed mismatched key dtypes merge and give an object key."""
        # these are explicitly allowed incompat merges, that pass thru
        # the result type is dependent on if the values on the rhs are
        # inferred, otherwise these will be coerced to object
        df1 = DataFrame({"A": df1_vals})
        df2 = DataFrame({"A": df2_vals})

        for lhs, rhs in ((df1, df2), (df2, df1)):
            merged = merge(lhs, rhs, on=["A"])
            assert is_object_dtype(merged["A"].dtype)

    @pytest.mark.parametrize(
        "df1_vals, df2_vals",
        [
            # do not infer to numeric
            (Series([1, 2], dtype="uint64"), ["a", "b", "c"]),
            (Series([1, 2], dtype="int32"), ["a", "b", "c"]),
            ([0, 1, 2], ["0", "1", "2"]),
            ([0.0, 1.0, 2.0], ["0", "1", "2"]),
            # NOTE(review): this case repeats the int/str case two entries above.
            ([0, 1, 2], ["0", "1", "2"]),
            (
                pd.date_range("1/1/2011", periods=2, freq="D"),
                ["2011-01-01", "2011-01-02"],
            ),
            (pd.date_range("1/1/2011", periods=2, freq="D"), [0, 1]),
            (pd.date_range("1/1/2011", periods=2, freq="D"), [0.0, 1.0]),
            (
                pd.date_range("20130101", periods=3),
                pd.date_range("20130101", periods=3, tz="US/Eastern"),
            ),
        ],
    )
    def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
        """Incompatible key dtypes raise ValueError in both merge orders."""
        # GH 9780, GH 15800
        # Raise a ValueError when a user tries to merge on
        # dtypes that are incompatible (e.g., obj and int/float)
        df1 = DataFrame({"A": df1_vals})
        df2 = DataFrame({"A": df2_vals})

        # Check that error still raised when swapping order of dataframes
        for lhs, rhs in ((df1, df2), (df2, df1)):
            msg = re.escape(
                f"You are trying to merge on {lhs['A'].dtype} and "
                f"{rhs['A'].dtype} columns. If you wish to proceed "
                "you should use pd.concat"
            )
            with pytest.raises(ValueError, match=msg):
                merge(lhs, rhs, on=["A"])

    @pytest.mark.parametrize(
        "expected_data, how",
        [
            ([1, 2], "outer"),
            ([], "inner"),
            ([2], "right"),
            ([1], "left"),
        ],
    )
    def test_merge_EA_dtype(self, any_numeric_ea_dtype, how, expected_data):
        """The numeric extension dtype of the key survives each join type."""
        # GH#40073
        frame_one = DataFrame([(1,)], columns=["id"], dtype=any_numeric_ea_dtype)
        frame_two = DataFrame([(2,)], columns=["id"], dtype=any_numeric_ea_dtype)

        result = merge(frame_one, frame_two, how=how)

        expected = DataFrame(
            expected_data, columns=["id"], dtype=any_numeric_ea_dtype
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "expected_data, how",
        [
            (["a", "b"], "outer"),
            ([], "inner"),
            (["b"], "right"),
            (["a"], "left"),
        ],
    )
    def test_merge_string_dtype(self, how, expected_data, any_string_dtype):
        """The string dtype of the key survives each join type."""
        # GH#40073
        frame_a = DataFrame([("a",)], columns=["id"], dtype=any_string_dtype)
        frame_b = DataFrame([("b",)], columns=["id"], dtype=any_string_dtype)

        result = merge(frame_a, frame_b, how=how)

        expected = DataFrame(expected_data, columns=["id"], dtype=any_string_dtype)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "how, expected_data",
        [
            ("inner", [[True, 1, 4], [False, 5, 3]]),
            ("outer", [[True, 1, 4], [False, 5, 3]]),
            ("left", [[True, 1, 4], [False, 5, 3]]),
            ("right", [[False, 5, 3], [True, 1, 4]]),
        ],
    )
    def test_merge_bool_dtype(self, how, expected_data):
        """Merge on a boolean key column for each join type."""
        # GH#40073
        left_frame = DataFrame({"A": [True, False], "B": [1, 5]})
        right_frame = DataFrame({"A": [False, True], "C": [3, 4]})

        result = merge(left_frame, right_frame, how=how)

        expected = DataFrame(expected_data, columns=["A", "B", "C"])
        tm.assert_frame_equal(result, expected)

    def test_merge_ea_with_string(self, join_type, string_dtype):
        """Merge a StringDtype frame with a ``string_dtype`` frame on a tuple key.

        Checks that the inputs are left untouched and pins the dtypes of the
        merged columns.
        """
        # GH 43734 Avoid the use of `assign` with multi-index
        key = ("lvl0", "lvl1-a")
        left_only = ("lvl0", "lvl1-b")
        right_only = ("lvl0", "lvl1-c")

        df1 = DataFrame(
            data={
                key: ["1", "2", "3", "4", None],
                left_only: ["4", "5", "6", "7", "8"],
            },
            dtype=pd.StringDtype(),
        )
        df1_before = df1.copy()
        df2 = DataFrame(
            data={
                key: ["1", "2", "3", pd.NA, "5"],
                right_only: ["7", "8", "9", pd.NA, "11"],
            },
            dtype=string_dtype,
        )
        df2_before = df2.copy()

        merged = merge(left=df1, right=df2, on=[key], how=join_type)

        # No change in df1 and df2
        tm.assert_frame_equal(df1, df1_before)
        tm.assert_frame_equal(df2, df2_before)

        # Check the expected types for the merged data frame
        expected_dtypes = Series(
            [np.dtype("O"), pd.StringDtype(), np.dtype("O")],
            index=MultiIndex.from_tuples([key, left_only, right_only]),
        )
        tm.assert_series_equal(merged.dtypes, expected_dtypes)
@pytest.fixture
def left():
    """Ten-row frame with a seeded categorical column X and object column Y."""
    np.random.seed(1234)
    # Draw X before Y so the seeded random stream is consumed in a fixed order.
    x_values = np.random.choice(["foo", "bar"], size=(10,))
    y_values = np.random.choice(["one", "two", "three"], size=(10,))
    x_dtype = CDT(["foo", "bar"])
    return DataFrame({"X": Series(x_values).astype(x_dtype), "Y": y_values})
@pytest.fixture
def right():
    """Two-row frame with categorical column X and integer column Z."""
    # NOTE(review): nothing below draws random numbers, so this seed looks
    # unnecessary; kept because it resets the global NumPy RNG state.
    np.random.seed(1234)
    x_column = Series(["foo", "bar"]).astype(CDT(["foo", "bar"]))
    return DataFrame({"X": x_column, "Z": [1, 2]})
class TestMergeCategorical:
    """Tests for merging frames that contain categorical columns."""

    def test_identical(self, left):
        """Self-merge on a categorical key keeps the categorical dtype."""
        # merging on the same, should preserve dtypes
        merged = merge(left, left, on="X")

        observed = merged.dtypes.sort_index()
        expected = Series(
            [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")],
            index=["X", "Y_x", "Y_y"],
        )
        tm.assert_series_equal(observed, expected)

    def test_basic(self, left, right):
        """Matching categorical key dtypes are preserved in the result."""
        # we have matching Categorical dtypes in X
        # so should preserve the merged column
        merged = merge(left, right, on="X")

        observed = merged.dtypes.sort_index()
        expected_dtypes = [
            CategoricalDtype(categories=["foo", "bar"]),
            np.dtype("O"),
            np.dtype("int64"),
        ]
        expected = Series(expected_dtypes, index=["X", "Y", "Z"])
        tm.assert_series_equal(observed, expected)

    def test_merge_categorical(self):
        """Left merge with no matching rows, for object/categorical columns."""
        # GH 9426
        merge_kwargs = {"how": "left", "left_on": "b", "right_on": "c"}

        right = DataFrame(
            {
                "c": dict(enumerate("abcde")),
                "d": dict.fromkeys(range(5), "null"),
            }
        )
        left = DataFrame(
            {
                "a": dict.fromkeys(range(5), "f"),
                "b": dict.fromkeys(range(5), "g"),
            }
        )
        df = merge(left, right, **merge_kwargs)

        # object-object
        expected = df.copy()

        # object-cat
        # note that we propagate the category
        # because we don't have any matching rows
        cright = right.assign(d=right["d"].astype("category"))
        result = merge(left, cright, **merge_kwargs)
        expected["d"] = expected["d"].astype(CategoricalDtype(["null"]))
        tm.assert_frame_equal(result, expected)

        # cat-object
        cleft = left.assign(b=left["b"].astype("category"))
        result = merge(cleft, cright, **merge_kwargs)
        tm.assert_frame_equal(result, expected)

        # cat-cat
        cright = right.assign(d=right["d"].astype("category"))
        cleft = left.assign(b=left["b"].astype("category"))
        result = merge(cleft, cright, **merge_kwargs)
        tm.assert_frame_equal(result, expected)

    def tests_merge_categorical_unordered_equal(self):
        """Unordered categorical keys with permuted categories merge as equal."""
        # GH-19551
        left_frame = DataFrame(
            {
                "Foo": Categorical(["A", "B", "C"], categories=["A", "B", "C"]),
                "Left": ["A0", "B0", "C0"],
            }
        )
        right_frame = DataFrame(
            {
                "Foo": Categorical(["C", "B", "A"], categories=["C", "B", "A"]),
                "Right": ["C1", "B1", "A1"],
            }
        )

        result = merge(left_frame, right_frame, on=["Foo"])

        expected = DataFrame(
            {
                "Foo": Categorical(["A", "B", "C"]),
                "Left": ["A0", "B0", "C0"],
                "Right": ["A1", "B1", "C1"],
            }
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("ordered", [True, False])
    def test_multiindex_merge_with_unordered_categoricalindex(self, ordered):
        """Left merge on a two-level index whose second level is categorical."""
        # GH 36973
        pcat = CategoricalDtype(categories=["P2", "P1"], ordered=ordered)
        index_cols = ["id", "p"]

        df1 = DataFrame(
            {
                "id": ["C", "C", "D"],
                "p": Categorical(["P2", "P1", "P2"], dtype=pcat),
                "a": [0, 1, 2],
            }
        ).set_index(index_cols)
        df2 = DataFrame(
            {
                "id": ["A", "C", "C"],
                "p": Categorical(["P2", "P2", "P1"], dtype=pcat),
                "d1": [10, 11, 12],
            }
        ).set_index(index_cols)

        result = merge(df1, df2, how="left", left_index=True, right_index=True)

        expected = DataFrame(
            {
                "id": ["C", "C", "D"],
                "p": Categorical(["P2", "P1", "P2"], dtype=pcat),
                "a": [0, 1, 2],
                "d1": [11.0, 12.0, np.nan],
            }
        ).set_index(index_cols)
        tm.assert_frame_equal(result, expected)

    def test_other_columns(self, left, right):
        """A categorical non-key column keeps its dtype and categories."""
        # non-merge columns should preserve if possible
        right = right.assign(Z=right["Z"].astype("category"))

        merged = merge(left, right, on="X")

        observed = merged.dtypes.sort_index()
        expected_dtypes = [
            CategoricalDtype(categories=["foo", "bar"]),
            np.dtype("O"),
            CategoricalDtype(categories=[1, 2]),
        ]
        expected = Series(expected_dtypes, index=["X", "Y", "Z"])
        tm.assert_series_equal(observed, expected)

        # categories are preserved
        left_x = left["X"].values
        right_z = right["Z"].values
        assert left_x._categories_match_up_to_permutation(merged["X"].values)
        assert right_z._categories_match_up_to_permutation(merged["Z"].values)

    @pytest.mark.parametrize(
        "change",
        [
            lambda x: x,
            lambda x: x.astype(CDT(["foo", "bar", "bah"])),
            lambda x: x.astype(CDT(ordered=True)),
        ],
    )
    def test_dtype_on_merged_different(self, change, join_type, left, right):
        """Key columns with differing dtypes give an object key column."""
        # our merging columns, X now has 2 different dtypes
        # so we must be object as a result
        right = right.assign(X=change(right["X"].astype("object")))
        assert is_categorical_dtype(left["X"].values.dtype)
        # assert not left.X.values._categories_match_up_to_permutation(right.X.values)

        merged = merge(left, right, on="X", how=join_type)

        observed = merged.dtypes.sort_index()
        expected = Series(
            [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"]
        )
        tm.assert_series_equal(observed, expected)

    def test_self_join_multiple_categories(self):
        """Self-merge on four categorical columns returns the original frame."""
        # GH 16767
        # non-duplicates should work with multiple categories
        m = 5
        singles = "abcdefghij"
        df = DataFrame(
            {
                "a": list(singles) * m,
                "b": ["t", "w", "x", "y", "z"] * 2 * m,
                # each letter repeated 2 * m times in a row
                "c": [letter for letter in "mnupo" for _ in range(2 * m)],
                # "aa", "bb", ... "jj", each repeated m times in a row
                "d": [ch * 2 for ch in singles for _ in range(m)],
            }
        )

        # change them all to categorical variables
        df = df.apply(lambda column: column.astype("category"))

        # self-join should equal ourselves
        result = merge(df, df, on=list(df.columns))

        tm.assert_frame_equal(result, df)

    def test_dtype_on_categorical_dates(self):
        """Categorical ``date`` keys stay dates in outer and inner merges."""
        # GH 16900
        # dates should not be coerced to ints
        day1, day2, day3 = date(2001, 1, 1), date(2001, 1, 2), date(2001, 1, 3)
        merged_columns = ["date", "num2", "num4"]

        df = DataFrame([[day1, 1.1], [day2, 1.3]], columns=["date", "num2"])
        df["date"] = df["date"].astype("category")

        df2 = DataFrame([[day1, 1.3], [day3, 1.4]], columns=["date", "num4"])
        df2["date"] = df2["date"].astype("category")

        expected_outer = DataFrame(
            [
                [day1, 1.1, 1.3],
                [day2, 1.3, np.nan],
                [day3, np.nan, 1.4],
            ],
            columns=merged_columns,
        )
        result_outer = merge(df, df2, how="outer", on=["date"])
        tm.assert_frame_equal(result_outer, expected_outer)

        expected_inner = DataFrame([[day1, 1.1, 1.3]], columns=merged_columns)
        result_inner = merge(df, df2, how="inner", on=["date"])
        tm.assert_frame_equal(result_inner, expected_inner)

    @pytest.mark.parametrize("ordered", [True, False])
    @pytest.mark.parametrize(
        "category_column,categories,expected_categories",
        [
            ([False, True, True, False], [True, False], [True, False]),
            ([2, 1, 1, 2], [1, 2], [1, 2]),
            (["False", "True", "True", "False"], ["True", "False"], ["True", "False"]),
        ],
    )
    def test_merging_with_bool_or_int_cateorical_column(
        self, category_column, categories, expected_categories, ordered
    ):
        """A bool/int/str categorical non-key column survives a default merge."""
        # GH 17187
        # merging with a boolean/int categorical column
        cat_dtype = CDT(categories, ordered=ordered)

        df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column})
        df1["cat"] = df1["cat"].astype(cat_dtype)
        df2 = DataFrame({"id": [2, 4], "num": [1, 9]})

        result = df1.merge(df2)

        expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]})
        expected["cat"] = expected["cat"].astype(cat_dtype)
        tm.assert_frame_equal(expected, result)

    def test_merge_on_int_array(self):
        """Self-merge on a nullable Int64 key that contains a missing value."""
        # GH 23020
        key = Series([1, 2, np.nan], dtype="Int64")
        df = DataFrame({"A": key, "B": 1})

        result = merge(df, df, on="A")

        expected = DataFrame(
            {"A": Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1}
        )
        tm.assert_frame_equal(result, expected)
@pytest.fixture
def left_df():
    """Left frame with column ``a`` on the descending index 2, 1, 0."""
    values = [20, 10, 0]
    labels = [2, 1, 0]
    return DataFrame({"a": values}, index=labels)
@pytest.fixture
def right_df():
    """Right frame with column ``b`` on the unsorted index 3, 1, 2."""
    values = [300, 100, 200]
    labels = [3, 1, 2]
    return DataFrame({"b": values}, index=labels)
class TestMergeOnIndexes:
    """Index-on-index merges for each ``how`` with and without ``sort``."""

    @pytest.mark.parametrize(
        "how, sort, expected",
        [
            ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
            ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
            (
                "left",
                False,
                DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
            ),
            (
                "left",
                True,
                DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
            ),
            (
                "right",
                False,
                DataFrame(
                    {"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]
                ),
            ),
            (
                "right",
                True,
                DataFrame(
                    {"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]
                ),
            ),
            (
                "outer",
                False,
                DataFrame(
                    {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                    index=[0, 1, 2, 3],
                ),
            ),
            (
                "outer",
                True,
                DataFrame(
                    {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                    index=[0, 1, 2, 3],
                ),
            ),
        ],
    )
    def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
        """Merge the two fixture frames on their indexes and compare."""
        options = {
            "left_index": True,
            "right_index": True,
            "how": how,
            "sort": sort,
        }
        merged = merge(left_df, right_df, **options)
        tm.assert_frame_equal(merged, expected)
@pytest.mark.parametrize(
    "index",
    [
        CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"),
        Float64Index([1.0, 2.0], name="index_col"),
        Int64Index([1, 2], name="index_col"),
        UInt64Index([1, 2], name="index_col"),
        RangeIndex(start=0, stop=2, name="index_col"),
        DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"),
    ],
    ids=lambda x: type(x).__name__,
)
def test_merge_index_types(index):
    """Merging on the index name keeps the index, for each index type."""
    # gh-20777
    # assert key access is consistent across index types
    left_values = [1, 2]
    right_values = [1.0, 2.0]
    left = DataFrame({"left_data": left_values}, index=index)
    right = DataFrame({"right_data": right_values}, index=index)

    result = left.merge(right, on=["index_col"])

    expected = DataFrame(
        {"left_data": left_values, "right_data": right_values}, index=index
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "on,left_on,right_on,left_index,right_index,nm",
    [
        (["outer", "inner"], None, None, False, False, "B"),
        (None, None, None, True, True, "B"),
        (None, ["outer", "inner"], None, False, True, "B"),
        (None, None, ["outer", "inner"], True, False, "B"),
        (["outer", "inner"], None, None, False, False, None),
        (None, None, None, True, True, None),
        (None, ["outer", "inner"], None, False, True, None),
        (None, None, ["outer", "inner"], True, False, None),
    ],
)
def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
    """Merge a DataFrame with a Series; an unnamed Series must raise."""
    # GH 21220
    level_names = ["outer", "inner"]
    a = DataFrame(
        {"A": [1, 2, 3, 4]},
        index=MultiIndex.from_product([["a", "b"], [0, 1]], names=level_names),
    )
    b = Series(
        [1, 2, 3, 4],
        index=MultiIndex.from_product([["a", "b"], [1, 2]], names=level_names),
        name=nm,
    )
    expected = DataFrame(
        {"A": [2, 4], "B": [1, 3]},
        index=MultiIndex.from_product([["a", "b"], [1]], names=level_names),
    )
    # Both branches issue the same call, so collect the keywords once.
    merge_kwargs = {
        "on": on,
        "left_on": left_on,
        "right_on": right_on,
        "left_index": left_index,
        "right_index": right_index,
    }

    if nm is None:
        msg = "Cannot merge a Series without a name"
        with pytest.raises(ValueError, match=msg):
            merge(a, b, **merge_kwargs)
        return

    result = merge(a, b, **merge_kwargs)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "col1, col2, kwargs, expected_cols",
    [
        (0, 0, {"suffixes": ("", "_dup")}, ["0", "0_dup"]),
        (0, 0, {"suffixes": (None, "_dup")}, [0, "0_dup"]),
        (0, 0, {"suffixes": ("_x", "_y")}, ["0_x", "0_y"]),
        (0, 0, {"suffixes": ["_x", "_y"]}, ["0_x", "0_y"]),
        ("a", 0, {"suffixes": (None, "_y")}, ["a", 0]),
        (0.0, 0.0, {"suffixes": ("_x", None)}, ["0.0_x", 0.0]),
        ("b", "b", {"suffixes": (None, "_y")}, ["b", "b_y"]),
        ("a", "a", {"suffixes": ("_x", None)}, ["a_x", "a"]),
        ("a", "b", {"suffixes": ("_x", None)}, ["a", "b"]),
        ("a", "a", {"suffixes": (None, "_x")}, ["a", "a_x"]),
        (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]),
        ("a", "a", {}, ["a_x", "a_y"]),
        (0, 0, {}, ["0_x", "0_y"]),
    ],
)
def test_merge_suffix(col1, col2, kwargs, expected_cols):
    """Column names after an index merge, for each ``suffixes`` setting."""
    # issue: 24782
    left_frame = DataFrame({col1: [1, 2, 3]})
    right_frame = DataFrame({col2: [4, 5, 6]})
    expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols)
    index_merge = {"left_index": True, "right_index": True}

    # Method form.
    via_method = left_frame.merge(right_frame, **index_merge, **kwargs)
    tm.assert_frame_equal(via_method, expected)

    # Function form.
    via_function = merge(left_frame, right_frame, **index_merge, **kwargs)
    tm.assert_frame_equal(via_function, expected)
@pytest.mark.parametrize(
    "how,expected",
    [
        (
            "right",
            DataFrame(
                {"A": [100, 200, 300], "B1": [60, 70, np.nan], "B2": [600, 700, 800]}
            ),
        ),
        (
            "outer",
            DataFrame(
                {
                    "A": [100, 200, 1, 300],
                    "B1": [60, 70, 80, np.nan],
                    "B2": [600, 700, np.nan, 800],
                }
            ),
        ),
    ],
)
def test_merge_duplicate_suffix(how, expected):
    """Identical left/right suffixes give duplicate column labels."""
    left_df = DataFrame({"A": [100, 200, 1], "B": [60, 70, 80]})
    right_df = DataFrame({"A": [100, 200, 300], "B": [600, 700, 800]})
    same_suffix = ("_x", "_x")

    result = merge(left_df, right_df, on="A", how=how, suffixes=same_suffix)

    # The parametrized frame uses placeholder names B1/B2 because a dict
    # cannot hold the duplicate label; relabel it here.
    expected.columns = ["A", "B_x", "B_x"]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "col1, col2, suffixes",
    [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))],
)
def test_merge_suffix_error(col1, col2, suffixes):
    """Overlapping columns with no usable suffix raise ValueError."""
    # issue: 24782
    left_frame = DataFrame({col1: [1, 2, 3]})
    right_frame = DataFrame({col2: [3, 4, 5]})

    # TODO: might reconsider current raise behaviour, see issue 24782
    with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
        merge(
            left_frame,
            right_frame,
            left_index=True,
            right_index=True,
            suffixes=suffixes,
        )
@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
def test_merge_suffix_warns(suffixes):
    """Passing ``suffixes`` as a set emits a FutureWarning."""
    # NOTE(review): the parametrized ``suffixes`` argument is not forwarded to
    # ``merge``; both cases pass the same set literal, so the dict case is
    # never exercised. Confirm the intent before forwarding it.
    left_frame = DataFrame({"a": [1, 2, 3]})
    right_frame = DataFrame({"b": [3, 4, 5]})

    with tm.assert_produces_warning(FutureWarning):
        merge(
            left_frame,
            right_frame,
            left_index=True,
            right_index=True,
            suffixes={"left", "right"},
        )
@pytest.mark.parametrize(
    "col1, col2, suffixes, msg",
    [
        ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"),
        ("a", "a", ("a",), r"not enough values to unpack \(expected 2, got 1\)"),
    ],
)
def test_merge_suffix_length_error(col1, col2, suffixes, msg):
    """A ``suffixes`` tuple that is not of length two raises ValueError."""
    left_frame = DataFrame({col1: [1, 2, 3]})
    right_frame = DataFrame({col2: [3, 4, 5]})

    with pytest.raises(ValueError, match=msg):
        merge(
            left_frame,
            right_frame,
            left_index=True,
            right_index=True,
            suffixes=suffixes,
        )
@pytest.mark.parametrize("cat_dtype", ["one", "two"])
@pytest.mark.parametrize("reverse", [True, False])
def test_merge_equal_cat_dtypes(cat_dtype, reverse):
    """Index merge with equal categorical dtypes, right rows optionally reversed."""
    # see gh-22501
    cat_dtypes = {
        "one": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
        "two": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
    }
    left_dtype = cat_dtypes["one"]

    df1 = DataFrame(
        {"foo": Series(["a", "b", "c"]).astype(left_dtype), "left": [1, 2, 3]}
    ).set_index("foo")

    # A step of -1 reverses both right-hand sequences together.
    step = -1 if reverse else 1
    data_foo = ["a", "b", "c"][::step]
    data_right = [1, 2, 3][::step]

    df2 = DataFrame(
        {"foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), "right": data_right}
    ).set_index("foo")

    result = df1.merge(df2, left_index=True, right_index=True)

    expected = DataFrame(
        {
            "left": [1, 2, 3],
            "right": [1, 2, 3],
            "foo": Series(["a", "b", "c"]).astype(left_dtype),
        }
    ).set_index("foo")
    tm.assert_frame_equal(result, expected)
def test_merge_equal_cat_dtypes2():
    """Index merge where the left frame uses only some of the categories."""
    # see gh-22501
    cat_dtype = CategoricalDtype(categories=["a", "b", "c"], ordered=False)

    def indexed_by_foo(keys, column, values):
        # Build a frame whose index is the categorical "foo" column.
        frame = DataFrame({"foo": Series(keys).astype(cat_dtype), column: values})
        return frame.set_index("foo")

    # Test Data
    df1 = indexed_by_foo(["a", "b"], "left", [1, 2])
    df2 = indexed_by_foo(["a", "b", "c"], "right", [3, 2, 1])

    result = df1.merge(df2, left_index=True, right_index=True)

    expected = DataFrame(
        {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)}
    ).set_index("foo")
    tm.assert_frame_equal(result, expected)
def test_merge_on_cat_and_ext_array():
    """Inner merge of a categorical key with the same key as interval dtype."""
    # GH 28668
    intervals = Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval")
    right = DataFrame({"a": intervals})
    left = right.copy()
    left["a"] = left["a"].astype("category")

    result = merge(left, right, how="inner", on="a")

    expected = right.copy()
    tm.assert_frame_equal(result, expected)
def test_merge_multiindex_columns():
    """Suffixes are applied to the outer labels of overlapping MultiIndex columns."""
    # Issue #28518
    # Verify that merging two dataframes give the expected labels
    # The original cause of this issue come from a bug lexsort_depth and is tested in
    # test_lexsort_depth
    letters = ["a", "b", "c", "d"]
    numbers = ["1", "2", "3"]
    level_names = ["outer", "inner"]
    l_suf, r_suf = "_x", "_y"

    index = MultiIndex.from_product((letters, numbers), names=level_names)

    frame_x = DataFrame(columns=index)
    frame_x["id"] = ""
    frame_y = DataFrame(columns=index)
    frame_y["id"] = ""

    result = frame_x.merge(frame_y, on="id", suffixes=(l_suf, r_suf))

    # Constructing the expected results
    expected_labels = [
        letter + suffix for suffix in (l_suf, r_suf) for letter in letters
    ]
    expected_index = MultiIndex.from_product(
        [expected_labels, numbers], names=level_names
    )
    expected = DataFrame(columns=expected_index)
    expected["id"] = ""

    tm.assert_frame_equal(result, expected)
def test_merge_datetime_upcast_dtype():
    """An unmatched row in a left merge gets NaT in the datetime column."""
    # https://github.com/pandas-dev/pandas/issues/31208
    left_keys = ["1", "2", "4"]
    df1 = DataFrame({"x": ["a", "b", "c"], "y": left_keys})
    df2 = DataFrame(
        {"y": ["1", "2", "3"], "z": pd.to_datetime(["2000", "2001", "2002"])}
    )

    result = merge(df1, df2, how="left", on="y")

    # Key "4" has no partner on the right, hence the trailing NaT.
    expected_z = pd.to_datetime(["2000", "2001", "NaT"])
    expected = DataFrame({"x": ["a", "b", "c"], "y": left_keys, "z": expected_z})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("n_categories", [5, 128])
def test_categorical_non_unique_monotonic(n_categories):
    """Left index merge on a non-unique, monotonic CategoricalIndex."""
    # GH 28189
    # With n_categories as 5, we test the int8 case is hit in libjoin,
    # with n_categories as 128 we test the int16 case.
    n_rows = n_categories + 1
    left_index = CategoricalIndex([0] + list(range(n_categories)))
    df1 = DataFrame(range(n_rows), columns=["value"], index=left_index)
    df2 = DataFrame(
        [[6]],
        columns=["value"],
        index=CategoricalIndex([0], categories=np.arange(n_categories)),
    )

    result = merge(df1, df2, how="left", left_index=True, right_index=True)

    # Only the first two left rows carry category 0, the single right key.
    expected_rows = [[i, 6.0 if i < 2 else np.nan] for i in range(n_rows)]
    expected = DataFrame(
        expected_rows, columns=["value_x", "value_y"], index=left_index
    )
    tm.assert_frame_equal(expected, result)
def test_merge_join_categorical_multiindex():
    """``join`` on a (categorical, int) MultiIndex matches the equivalent merge."""

    def check(categories, **categorical_kwargs):
        # Compare DataFrame.join against a left merge for one category setup.
        a = DataFrame(
            {
                "Cat1": Categorical(
                    ["a", "b", "a", "c", "a", "b"], categories, **categorical_kwargs
                ),
                "Int1": [0, 1, 0, 1, 0, 0],
            }
        )
        b_frame = DataFrame(
            {
                "Cat": Categorical(
                    ["a", "b", "c", "a", "b", "c"], categories, **categorical_kwargs
                ),
                "Int": [0, 0, 0, 1, 1, 1],
                "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6],
            }
        )
        b = b_frame.set_index(["Cat", "Int"])["Factor"]

        expected = merge(
            a,
            b.reset_index(),
            left_on=["Cat1", "Int1"],
            right_on=["Cat", "Int"],
            how="left",
        ).drop(["Cat", "Int"], axis=1)
        result = a.join(b, on=["Cat1", "Int1"])
        tm.assert_frame_equal(expected, result)

    # From issue 16627
    check(["a", "b", "c"])

    # Same test, but with ordered categorical
    check(["b", "a", "c"], ordered=True)
@pytest.mark.parametrize("func", ["merge", "merge_asof"])
@pytest.mark.parametrize(
    ("kwargs", "err_msg"),
    [
        ({"left_on": "a", "left_index": True}, ["left_on", "left_index"]),
        ({"right_on": "a", "right_index": True}, ["right_on", "right_index"]),
    ],
)
def test_merge_join_cols_error_reporting_duplicates(func, kwargs, err_msg):
    """Passing both ``*_on`` and ``*_index`` for one side raises MergeError."""
    # GH: 16228
    left = DataFrame({"a": [1, 2], "b": [3, 4]})
    right = DataFrame({"a": [1, 1], "c": [5, 6]})
    merge_func = getattr(pd, func)
    first, second = err_msg
    msg = rf'Can only pass argument "{first}" OR "{second}" not both\.'

    with pytest.raises(MergeError, match=msg):
        merge_func(left, right, **kwargs)
@pytest.mark.parametrize("func", ["merge", "merge_asof"])
@pytest.mark.parametrize(
    ("kwargs", "err_msg"),
    [
        ({"left_on": "a"}, ["right_on", "right_index"]),
        ({"right_on": "a"}, ["left_on", "left_index"]),
    ],
)
def test_merge_join_cols_error_reporting_missing(func, kwargs, err_msg):
    """
    Specifying a key for only one side must raise a MergeError whose message
    names the two options that could supply the other side's key.
    """
    # GH: 16228
    on_name, index_name = err_msg[0], err_msg[1]
    expected_msg = rf'Must pass "{on_name}" OR "{index_name}"\.'
    lhs = DataFrame({"a": [1, 2], "b": [3, 4]})
    rhs = DataFrame({"a": [1, 1], "c": [5, 6]})
    merge_func = getattr(pd, func)
    with pytest.raises(MergeError, match=expected_msg):
        merge_func(lhs, rhs, **kwargs)
@pytest.mark.parametrize("func", ["merge", "merge_asof"])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"right_index": True},
        {"left_index": True},
    ],
)
def test_merge_join_cols_error_reporting_on_and_index(func, kwargs):
    """
    Combining ``on`` with ``left_index`` or ``right_index`` must raise a
    MergeError explaining that the two ways of naming keys cannot be mixed.
    """
    # GH: 16228
    expected_msg = (
        r'Can only pass argument "on" OR '
        r'"left_index" and "right_index", '
        r"not a combination of both\."
    )
    lhs = DataFrame({"a": [1, 2], "b": [3, 4]})
    rhs = DataFrame({"a": [1, 1], "c": [5, 6]})
    merge_func = getattr(pd, func)
    with pytest.raises(MergeError, match=expected_msg):
        merge_func(lhs, rhs, on="a", **kwargs)
def test_merge_right_left_index():
    """
    Right merge of the left frame's index against the right frame's ``x``
    column: the result carries a plain ``x`` column plus the suffixed
    ``x``/``z`` columns of both inputs.
    """
    # GH#38616
    columns = {"x": [1, 1], "z": ["foo", "foo"]}
    lhs = DataFrame(columns)
    rhs = DataFrame(columns)

    expected_columns = {
        "x": [1, 1],
        "x_x": [1, 1],
        "z_x": ["foo", "foo"],
        "x_y": [1, 1],
        "z_y": ["foo", "foo"],
    }
    expected = DataFrame(expected_columns)

    result = merge(lhs, rhs, how="right", left_index=True, right_on="x")
    tm.assert_frame_equal(result, expected)
def test_merge_result_empty_index_and_on():
    """
    Merging an index level against another frame's index with no matching
    keys yields an empty int64 frame indexed by (``a``, ``b``), whichever
    frame is passed as the left operand.
    """
    # GH#33814
    two_level = DataFrame({"a": [1], "b": [2]}).set_index(["a", "b"])
    one_level = DataFrame({"b": [1]}).set_index(["b"])

    empty = DataFrame({"a": [], "b": []}, dtype=np.int64)
    expected = empty.set_index(["a", "b"])

    forward = merge(two_level, one_level, left_on=["b"], right_index=True)
    tm.assert_frame_equal(forward, expected)

    swapped = merge(one_level, two_level, left_index=True, right_on=["b"])
    tm.assert_frame_equal(swapped, expected)
def test_merge_suffixes_produce_dup_columns_warns():
# GH#22818
left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
right = DataFrame({"a": [1, 2, 3], "b": 2})
expected = DataFrame(
[[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"]
)
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
tm.assert_frame_equal(result, expected)
with tm.assert_produces_warning(FutureWarning):
merge(right, left, on="a", suffixes=("_y", "_x"))
tm.assert_frame_equal(result, expected)
def test_merge_duplicate_columns_with_suffix_no_warning():
    """
    Duplicate labels that already exist in an input frame must not trigger
    a warning when suffixes are applied, and the merged frame keeps both
    suffixed duplicates.
    """
    # GH#22818
    # Do not raise warning when duplicates are caused by duplicates in origin
    left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"])
    right = DataFrame({"a": [1, 3], "b": 2})
    # Explicitly assert that no warning is emitted; without this context
    # manager the "no warning" part of the contract was never checked.
    with tm.assert_produces_warning(None):
        result = merge(left, right, on="a")
    expected = DataFrame([[1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_y"])
    tm.assert_frame_equal(result, expected)
def test_merge_duplicate_columns_with_suffix_causing_another_duplicate():
# GH#22818
# This should raise warning because suffixes cause another collision
left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
right = DataFrame({"a": [1, 3], "b": 2})
with tm.assert_produces_warning(FutureWarning):
result = merge(left, right, on="a")
expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"])
tm.assert_frame_equal(result, expected)
def test_merge_string_float_column_result():
    """
    An index-on-index inner merge keeps mixed string/float column labels
    intact in the result.
    """
    # GH 13353
    mixed_labels = pd.Index(["a", 114.0])
    mixed = DataFrame([[1, 2], [3, 4]], columns=mixed_labels)
    plain = DataFrame([[9, 10], [11, 12]], columns=["x", "y"])

    expected_labels = pd.Index(["x", "y", "a", 114.0])
    expected = DataFrame([[9, 10, 1, 2], [11, 12, 3, 4]], columns=expected_labels)

    result = merge(plain, mixed, how="inner", left_index=True, right_index=True)
    tm.assert_frame_equal(result, expected)
def test_mergeerror_on_left_index_mismatched_dtypes():
    """
    Passing ``on`` together with ``left_index=True`` raises a MergeError
    whose message starts with "Can only pass argument".
    """
    # GH 22449
    first = DataFrame(data=["X"], columns=["C"], index=[22])
    second = DataFrame(data=["X"], columns=["C"], index=[999])
    expected_msg = "Can only pass argument"
    with pytest.raises(MergeError, match=expected_msg):
        merge(first, second, on=["C"], left_index=True)
@pytest.mark.parametrize("dtype", [None, "Int64"])
def test_merge_outer_with_NaN(dtype):
    """
    Outer merge where one operand's keys are all NaN: the expected frames
    list the left operand's rows first, followed by the right operand's,
    for both operand orders.
    """
    # GH#43550
    nan = np.nan
    keyed = DataFrame({"key": [1, 2], "col1": [1, 2]}, dtype=dtype)
    nan_keyed = DataFrame({"key": [nan, nan], "col2": [3, 4]}, dtype=dtype)

    expected = DataFrame(
        {
            "key": [1, 2, nan, nan],
            "col1": [1, 2, nan, nan],
            "col2": [nan, nan, 3, 4],
        },
        dtype=dtype,
    )
    result = merge(keyed, nan_keyed, on="key", how="outer")
    tm.assert_frame_equal(result, expected)

    # switch left and right
    expected_switched = DataFrame(
        {
            "key": [nan, nan, 1, 2],
            "col2": [3, 4, nan, nan],
            "col1": [nan, nan, 1, 2],
        },
        dtype=dtype,
    )
    result_switched = merge(nan_keyed, keyed, on="key", how="outer")
    tm.assert_frame_equal(result_switched, expected_switched)
def test_merge_different_index_names():
    """
    Merging on index names that differ between the two frames (``c`` on the
    left, ``d`` on the right) yields only the suffixed ``a`` columns.
    """
    # GH#45094
    lhs = DataFrame({"a": [1]}, index=pd.Index([1], name="c"))
    rhs = DataFrame({"a": [1]}, index=pd.Index([1], name="d"))

    expected = DataFrame({"a_x": [1], "a_y": 1})

    result = merge(lhs, rhs, left_on="c", right_on="d")
    tm.assert_frame_equal(result, expected)