A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/frame/methods/test_sample.py

365 lines
13 KiB

import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
import pandas.core.common as com
class TestSample:
@pytest.fixture(params=[Series, DataFrame])
def obj(self, request):
klass = request.param
if klass is Series:
arr = np.random.randn(10)
else:
arr = np.random.randn(10, 10)
return klass(arr, dtype=None)
@pytest.mark.parametrize("test", list(range(10)))
def test_sample(self, test, obj):
# Fixes issue: 2419
# Check behavior of random_state argument
# Check for stability when receives seed or random state -- run 10
# times.
seed = np.random.randint(0, 100)
tm.assert_equal(
obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed)
)
tm.assert_equal(
obj.sample(frac=0.7, random_state=seed),
obj.sample(frac=0.7, random_state=seed),
)
tm.assert_equal(
obj.sample(n=4, random_state=np.random.RandomState(test)),
obj.sample(n=4, random_state=np.random.RandomState(test)),
)
tm.assert_equal(
obj.sample(frac=0.7, random_state=np.random.RandomState(test)),
obj.sample(frac=0.7, random_state=np.random.RandomState(test)),
)
tm.assert_equal(
obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)),
obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)),
)
os1, os2 = [], []
for _ in range(2):
np.random.seed(test)
os1.append(obj.sample(n=4))
os2.append(obj.sample(frac=0.7))
tm.assert_equal(*os1)
tm.assert_equal(*os2)
def test_sample_lengths(self, obj):
# Check lengths are right
assert len(obj.sample(n=4) == 4)
assert len(obj.sample(frac=0.34) == 3)
assert len(obj.sample(frac=0.36) == 4)
def test_sample_invalid_random_state(self, obj):
# Check for error when random_state argument invalid.
msg = (
"random_state must be an integer, array-like, a BitGenerator, Generator, "
"a numpy RandomState, or None"
)
with pytest.raises(ValueError, match=msg):
obj.sample(random_state="a_string")
def test_sample_wont_accept_n_and_frac(self, obj):
# Giving both frac and N throws error
msg = "Please enter a value for `frac` OR `n`, not both"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, frac=0.3)
def test_sample_requires_positive_n_frac(self, obj):
with pytest.raises(
ValueError,
match="A negative number of rows requested. Please provide `n` >= 0",
):
obj.sample(n=-3)
with pytest.raises(
ValueError,
match="A negative number of rows requested. Please provide `frac` >= 0",
):
obj.sample(frac=-0.3)
def test_sample_requires_integer_n(self, obj):
# Make sure float values of `n` give error
with pytest.raises(ValueError, match="Only integers accepted as `n` values"):
obj.sample(n=3.2)
def test_sample_invalid_weight_lengths(self, obj):
# Weight length must be right
msg = "Weights and axis to be sampled must be of same length"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=[0, 1])
with pytest.raises(ValueError, match=msg):
bad_weights = [0.5] * 11
obj.sample(n=3, weights=bad_weights)
with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"):
bad_weight_series = Series([0, 0, 0.2])
obj.sample(n=4, weights=bad_weight_series)
def test_sample_negative_weights(self, obj):
# Check won't accept negative weights
bad_weights = [-0.1] * 10
msg = "weight vector many not include negative values"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=bad_weights)
def test_sample_inf_weights(self, obj):
# Check inf and -inf throw errors:
weights_with_inf = [0.1] * 10
weights_with_inf[0] = np.inf
msg = "weight vector may not include `inf` values"
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=weights_with_inf)
weights_with_ninf = [0.1] * 10
weights_with_ninf[0] = -np.inf
with pytest.raises(ValueError, match=msg):
obj.sample(n=3, weights=weights_with_ninf)
def test_sample_zero_weights(self, obj):
# All zeros raises errors
zero_weights = [0] * 10
with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
obj.sample(n=3, weights=zero_weights)
def test_sample_missing_weights(self, obj):
# All missing weights
nan_weights = [np.nan] * 10
with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
obj.sample(n=3, weights=nan_weights)
def test_sample_none_weights(self, obj):
# Check None are also replaced by zeros.
weights_with_None = [None] * 10
weights_with_None[5] = 0.5
tm.assert_equal(
obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6]
)
@pytest.mark.parametrize(
"func_str,arg",
[
("np.array", [2, 3, 1, 0]),
("np.random.MT19937", 3),
("np.random.PCG64", 11),
],
)
def test_sample_random_state(self, func_str, arg, frame_or_series):
# GH#32503
obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)})
obj = tm.get_obj(obj, frame_or_series)
result = obj.sample(n=3, random_state=eval(func_str)(arg))
expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
tm.assert_equal(result, expected)
def test_sample_generator(self, frame_or_series):
# GH#38100
obj = frame_or_series(np.arange(100))
rng = np.random.default_rng()
# Consecutive calls should advance the seed
result1 = obj.sample(n=50, random_state=rng)
result2 = obj.sample(n=50, random_state=rng)
assert not (result1.index.values == result2.index.values).all()
# Matching generator initialization must give same result
# Consecutive calls should advance the seed
result1 = obj.sample(n=50, random_state=np.random.default_rng(11))
result2 = obj.sample(n=50, random_state=np.random.default_rng(11))
tm.assert_equal(result1, result2)
def test_sample_upsampling_without_replacement(self, frame_or_series):
# GH#27451
obj = DataFrame({"A": list("abc")})
obj = tm.get_obj(obj, frame_or_series)
msg = (
"Replace has to be set to `True` when "
"upsampling the population `frac` > 1."
)
with pytest.raises(ValueError, match=msg):
obj.sample(frac=2, replace=False)
class TestSampleDataFrame:
# Tests which are relevant only for DataFrame, so these are
# as fully parametrized as they can get.
def test_sample(self):
# GH#2419
# additional specific object based tests
# A few dataframe test with degenerate weights.
easy_weight_list = [0] * 10
easy_weight_list[5] = 1
df = DataFrame(
{
"col1": range(10, 20),
"col2": range(20, 30),
"colString": ["a"] * 10,
"easyweights": easy_weight_list,
}
)
sample1 = df.sample(n=1, weights="easyweights")
tm.assert_frame_equal(sample1, df.iloc[5:6])
# Ensure proper error if string given as weight for Series or
# DataFrame with axis = 1.
ser = Series(range(10))
msg = "Strings cannot be passed as weights when sampling from a Series."
with pytest.raises(ValueError, match=msg):
ser.sample(n=3, weights="weight_column")
msg = (
"Strings can only be passed to weights when sampling from rows on a "
"DataFrame"
)
with pytest.raises(ValueError, match=msg):
df.sample(n=1, weights="weight_column", axis=1)
# Check weighting key error
with pytest.raises(
KeyError, match="'String passed to weights not a valid column'"
):
df.sample(n=3, weights="not_a_real_column_name")
# Check that re-normalizes weights that don't sum to one.
weights_less_than_1 = [0] * 10
weights_less_than_1[0] = 0.5
tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])
###
# Test axis argument
###
# Test axis argument
df = DataFrame({"col1": range(10), "col2": ["a"] * 10})
second_column_weight = [0, 1]
tm.assert_frame_equal(
df.sample(n=1, axis=1, weights=second_column_weight), df[["col2"]]
)
# Different axis arg types
tm.assert_frame_equal(
df.sample(n=1, axis="columns", weights=second_column_weight), df[["col2"]]
)
weight = [0] * 10
weight[5] = 0.5
tm.assert_frame_equal(df.sample(n=1, axis="rows", weights=weight), df.iloc[5:6])
tm.assert_frame_equal(
df.sample(n=1, axis="index", weights=weight), df.iloc[5:6]
)
# Check out of range axis values
msg = "No axis named 2 for object type DataFrame"
with pytest.raises(ValueError, match=msg):
df.sample(n=1, axis=2)
msg = "No axis named not_a_name for object type DataFrame"
with pytest.raises(ValueError, match=msg):
df.sample(n=1, axis="not_a_name")
ser = Series(range(10))
with pytest.raises(ValueError, match="No axis named 1 for object type Series"):
ser.sample(n=1, axis=1)
# Test weight length compared to correct axis
msg = "Weights and axis to be sampled must be of same length"
with pytest.raises(ValueError, match=msg):
df.sample(n=1, axis=1, weights=[0.5] * 10)
def test_sample_axis1(self):
# Check weights with axis = 1
easy_weight_list = [0] * 3
easy_weight_list[2] = 1
df = DataFrame(
{"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
)
sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
tm.assert_frame_equal(sample1, df[["colString"]])
# Test default axes
tm.assert_frame_equal(
df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42)
)
def test_sample_aligns_weights_with_frame(self):
# Test that function aligns weights with frame
df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3])
ser = Series([1, 0, 0], index=[3, 5, 9])
tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser))
# Weights have index values to be dropped because not in
# sampled DataFrame
ser2 = Series([0.001, 0, 10000], index=[3, 5, 10])
tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser2))
# Weights have empty values to be filed with zeros
ser3 = Series([0.01, 0], index=[3, 5])
tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser3))
# No overlap in weight and sampled DataFrame indices
ser4 = Series([1, 0], index=[1, 2])
with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
df.sample(1, weights=ser4)
def test_sample_is_copy(self):
# GH#27357, GH#30784: ensure the result of sample is an actual copy and
# doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
df2 = df.sample(3)
with tm.assert_produces_warning(None):
df2["d"] = 1
def test_sample_does_not_modify_weights(self):
# GH-42843
result = np.array([np.nan, 1, np.nan])
expected = result.copy()
ser = Series([1, 2, 3])
# Test numpy array weights won't be modified in place
ser.sample(weights=result)
tm.assert_numpy_array_equal(result, expected)
# Test DataFrame column won't be modified in place
df = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]})
expected = df["weights"].copy()
df.sample(frac=1.0, replace=True, weights="weights")
result = df["weights"]
tm.assert_series_equal(result, expected)
def test_sample_ignore_index(self):
# GH 38581
df = DataFrame(
{"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
)
result = df.sample(3, ignore_index=True)
expected_index = Index(range(3))
tm.assert_index_equal(result.index, expected_index, exact=True)