A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_array.py

1518 lines
52 KiB

import operator
import re
import warnings
import numpy as np
import pytest
from pandas._libs.sparse import IntIndex
import pandas.util._test_decorators as td
import pandas as pd
from pandas import isna
import pandas._testing as tm
from pandas.core.api import Int64Index
from pandas.core.arrays.sparse import (
SparseArray,
SparseDtype,
)
class TestSparseArray:
def setup_method(self, method):
self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
self.arr = SparseArray(self.arr_data)
self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
def test_constructor_dtype(self):
arr = SparseArray([np.nan, 1, 2, np.nan])
assert arr.dtype == SparseDtype(np.float64, np.nan)
assert arr.dtype.subtype == np.float64
assert np.isnan(arr.fill_value)
arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0)
assert arr.dtype == SparseDtype(np.float64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], dtype=np.float64)
assert arr.dtype == SparseDtype(np.float64, np.nan)
assert np.isnan(arr.fill_value)
arr = SparseArray([0, 1, 2, 4], dtype=np.int64)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], dtype=None)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
def test_constructor_dtype_str(self):
result = SparseArray([1, 2, 3], dtype="int")
expected = SparseArray([1, 2, 3], dtype=int)
tm.assert_sp_array_equal(result, expected)
def test_constructor_sparse_dtype(self):
result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1))
expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64)
tm.assert_sp_array_equal(result, expected)
assert result.sp_values.dtype == np.dtype("int64")
def test_constructor_sparse_dtype_str(self):
result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]")
expected = SparseArray([1, 0, 0, 1], dtype=np.int32)
tm.assert_sp_array_equal(result, expected)
assert result.sp_values.dtype == np.dtype("int32")
def test_constructor_object_dtype(self):
# GH 11856
arr = SparseArray(["A", "A", np.nan, "B"], dtype=object)
assert arr.dtype == SparseDtype(object)
assert np.isnan(arr.fill_value)
arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A")
assert arr.dtype == SparseDtype(object, "A")
assert arr.fill_value == "A"
# GH 17574
data = [False, 0, 100.0, 0.0]
arr = SparseArray(data, dtype=object, fill_value=False)
assert arr.dtype == SparseDtype(object, False)
assert arr.fill_value is False
arr_expected = np.array(data, dtype=object)
it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected))
assert np.fromiter(it, dtype=np.bool_).all()
@pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int])
def test_constructor_na_dtype(self, dtype):
with pytest.raises(ValueError, match="Cannot convert"):
SparseArray([0, 1, np.nan], dtype=dtype)
def test_constructor_warns_when_losing_timezone(self):
# GH#32501 warn when losing timezone information
dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]"))
with tm.assert_produces_warning(UserWarning):
result = SparseArray(dti)
tm.assert_sp_array_equal(result, expected)
with tm.assert_produces_warning(UserWarning):
result = SparseArray(pd.Series(dti))
tm.assert_sp_array_equal(result, expected)
def test_constructor_spindex_dtype(self):
arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]))
# XXX: Behavior change: specifying SparseIndex no longer changes the
# fill_value
expected = SparseArray([0, 1, 2, 0], kind="integer")
tm.assert_sp_array_equal(arr, expected)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2, 3],
sparse_index=IntIndex(4, [1, 2, 3]),
dtype=np.int64,
fill_value=0,
)
exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64
)
exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2, 3],
sparse_index=IntIndex(4, [1, 2, 3]),
dtype=None,
fill_value=0,
)
exp = SparseArray([0, 1, 2, 3], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
@pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])])
def test_constructor_spindex_dtype_scalar(self, sparse_index):
# scalar input
arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None)
exp = SparseArray([1], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None)
exp = SparseArray([1], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
def test_constructor_spindex_dtype_scalar_broadcasts(self):
arr = SparseArray(
data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None
)
exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
@pytest.mark.parametrize(
"data, fill_value",
[
(np.array([1, 2]), 0),
(np.array([1.0, 2.0]), np.nan),
([True, False], False),
([pd.Timestamp("2017-01-01")], pd.NaT),
],
)
def test_constructor_inferred_fill_value(self, data, fill_value):
result = SparseArray(data).fill_value
if isna(fill_value):
assert isna(result)
else:
assert result == fill_value
@pytest.mark.parametrize("format", ["coo", "csc", "csr"])
@pytest.mark.parametrize("size", [0, 10])
@td.skip_if_no_scipy
def test_from_spmatrix(self, size, format):
import scipy.sparse
mat = scipy.sparse.random(size, 1, density=0.5, format=format)
result = SparseArray.from_spmatrix(mat)
result = np.asarray(result)
expected = mat.toarray().ravel()
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("format", ["coo", "csc", "csr"])
@td.skip_if_no_scipy
def test_from_spmatrix_including_explicit_zero(self, format):
import scipy.sparse
mat = scipy.sparse.random(10, 1, density=0.5, format=format)
mat.data[0] = 0
result = SparseArray.from_spmatrix(mat)
result = np.asarray(result)
expected = mat.toarray().ravel()
tm.assert_numpy_array_equal(result, expected)
@td.skip_if_no_scipy
def test_from_spmatrix_raises(self):
import scipy.sparse
mat = scipy.sparse.eye(5, 4, format="csc")
with pytest.raises(ValueError, match="not '4'"):
SparseArray.from_spmatrix(mat)
@pytest.mark.parametrize(
"scalar,dtype",
[
(False, SparseDtype(bool, False)),
(0.0, SparseDtype("float64", 0)),
(1, SparseDtype("int64", 1)),
("z", SparseDtype("object", "z")),
],
)
def test_scalar_with_index_infer_dtype(self, scalar, dtype):
# GH 19163
with tm.assert_produces_warning(
FutureWarning, match="The index argument has been deprecated"
):
arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar)
exp = SparseArray([scalar, scalar, scalar], fill_value=scalar)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == dtype
assert exp.dtype == dtype
def test_getitem_bool_sparse_array(self):
# GH 23122
spar_bool = SparseArray([False, True] * 5, dtype=np.bool8, fill_value=True)
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
tm.assert_sp_array_equal(self.arr[spar_bool], exp)
spar_bool = ~spar_bool
res = self.arr[spar_bool]
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
tm.assert_sp_array_equal(res, exp)
spar_bool = SparseArray(
[False, True, np.nan] * 3, dtype=np.bool8, fill_value=np.nan
)
res = self.arr[spar_bool]
exp = SparseArray([np.nan, 3, 5])
tm.assert_sp_array_equal(res, exp)
def test_getitem_bool_sparse_array_as_comparison(self):
# GH 45110
arr = SparseArray([1, 2, 3, 4, np.nan, np.nan], fill_value=np.nan)
res = arr[arr > 2]
exp = SparseArray([3.0, 4.0], fill_value=np.nan)
tm.assert_sp_array_equal(res, exp)
def test_get_item(self):
assert np.isnan(self.arr[1])
assert self.arr[2] == 1
assert self.arr[7] == 5
assert self.zarr[0] == 0
assert self.zarr[2] == 1
assert self.zarr[7] == 5
errmsg = "must be an integer between -10 and 10"
with pytest.raises(IndexError, match=errmsg):
self.arr[11]
with pytest.raises(IndexError, match=errmsg):
self.arr[-11]
assert self.arr[-1] == self.arr[len(self.arr) - 1]
def test_take_scalar_raises(self):
msg = "'indices' must be an array, not a scalar '2'."
with pytest.raises(ValueError, match=msg):
self.arr.take(2)
def test_take(self):
exp = SparseArray(np.take(self.arr_data, [2, 3]))
tm.assert_sp_array_equal(self.arr.take([2, 3]), exp)
exp = SparseArray(np.take(self.arr_data, [0, 1, 2]))
tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp)
def test_take_all_empty(self):
a = pd.array([0, 0], dtype=SparseDtype("int64"))
result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
tm.assert_sp_array_equal(a, result)
def test_take_fill_value(self):
data = np.array([1, np.nan, 0, 3, 0])
sparse = SparseArray(data, fill_value=0)
exp = SparseArray(np.take(data, [0]), fill_value=0)
tm.assert_sp_array_equal(sparse.take([0]), exp)
exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0)
tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp)
def test_take_negative(self):
exp = SparseArray(np.take(self.arr_data, [-1]))
tm.assert_sp_array_equal(self.arr.take([-1]), exp)
exp = SparseArray(np.take(self.arr_data, [-4, -3, -2]))
tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp)
@pytest.mark.parametrize("fill_value", [0, None, np.nan])
def test_shift_fill_value(self, fill_value):
# GH #24128
sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0)
res = sparse.shift(1, fill_value=fill_value)
if isna(fill_value):
fill_value = res.dtype.na_value
exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0)
tm.assert_sp_array_equal(res, exp)
def test_bad_take(self):
with pytest.raises(IndexError, match="bounds"):
self.arr.take([11])
def test_take_filling(self):
# similar tests as GH 12631
sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4])
result = sparse.take(np.array([1, 0, -1]))
expected = SparseArray([np.nan, np.nan, 4])
tm.assert_sp_array_equal(result, expected)
# XXX: test change: fill_value=True -> allow_fill=True
result = sparse.take(np.array([1, 0, -1]), allow_fill=True)
expected = SparseArray([np.nan, np.nan, np.nan])
tm.assert_sp_array_equal(result, expected)
# allow_fill=False
result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
expected = SparseArray([np.nan, np.nan, 4])
tm.assert_sp_array_equal(result, expected)
msg = "Invalid value in 'indices'"
with pytest.raises(ValueError, match=msg):
sparse.take(np.array([1, 0, -2]), allow_fill=True)
with pytest.raises(ValueError, match=msg):
sparse.take(np.array([1, 0, -5]), allow_fill=True)
msg = "out of bounds value in 'indices'"
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, -6]))
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, 5]))
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, 5]), allow_fill=True)
def test_take_filling_fill_value(self):
# same tests as GH 12631
sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0)
result = sparse.take(np.array([1, 0, -1]))
expected = SparseArray([0, np.nan, 4], fill_value=0)
tm.assert_sp_array_equal(result, expected)
# fill_value
result = sparse.take(np.array([1, 0, -1]), allow_fill=True)
# XXX: behavior change.
# the old way of filling self.fill_value doesn't follow EA rules.
# It's supposed to be self.dtype.na_value (nan in this case)
expected = SparseArray([0, np.nan, np.nan], fill_value=0)
tm.assert_sp_array_equal(result, expected)
# allow_fill=False
result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
expected = SparseArray([0, np.nan, 4], fill_value=0)
tm.assert_sp_array_equal(result, expected)
msg = "Invalid value in 'indices'."
with pytest.raises(ValueError, match=msg):
sparse.take(np.array([1, 0, -2]), allow_fill=True)
with pytest.raises(ValueError, match=msg):
sparse.take(np.array([1, 0, -5]), allow_fill=True)
msg = "out of bounds value in 'indices'"
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, -6]))
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, 5]))
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, 5]), fill_value=True)
@pytest.mark.parametrize("kind", ["block", "integer"])
def test_take_filling_all_nan(self, kind):
sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind)
result = sparse.take(np.array([1, 0, -1]))
expected = SparseArray([np.nan, np.nan, np.nan], kind=kind)
tm.assert_sp_array_equal(result, expected)
result = sparse.take(np.array([1, 0, -1]), fill_value=True)
expected = SparseArray([np.nan, np.nan, np.nan], kind=kind)
tm.assert_sp_array_equal(result, expected)
msg = "out of bounds value in 'indices'"
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, -6]))
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, 5]))
with pytest.raises(IndexError, match=msg):
sparse.take(np.array([1, 5]), fill_value=True)
def test_set_item(self):
def setitem():
self.arr[5] = 3
def setslice():
self.arr[1:5] = 2
with pytest.raises(TypeError, match="assignment via setitem"):
setitem()
with pytest.raises(TypeError, match="assignment via setitem"):
setslice()
def test_constructor_from_too_large_array(self):
with pytest.raises(TypeError, match="expected dimension <= 1 data"):
SparseArray(np.arange(10).reshape((2, 5)))
def test_constructor_from_sparse(self):
res = SparseArray(self.zarr)
assert res.fill_value == 0
tm.assert_almost_equal(res.sp_values, self.zarr.sp_values)
def test_constructor_copy(self):
cp = SparseArray(self.arr, copy=True)
cp.sp_values[:3] = 0
assert not (self.arr.sp_values[:3] == 0).any()
not_copy = SparseArray(self.arr)
not_copy.sp_values[:3] = 0
assert (self.arr.sp_values[:3] == 0).all()
def test_constructor_bool(self):
# GH 10648
data = np.array([False, False, True, True, False, False])
arr = SparseArray(data, fill_value=False, dtype=bool)
assert arr.dtype == SparseDtype(bool)
tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True]))
# Behavior change: np.asarray densifies.
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32))
dense = arr.to_dense()
assert dense.dtype == bool
tm.assert_numpy_array_equal(dense, data)
def test_constructor_bool_fill_value(self):
arr = SparseArray([True, False, True], dtype=None)
assert arr.dtype == SparseDtype(np.bool_)
assert not arr.fill_value
arr = SparseArray([True, False, True], dtype=np.bool_)
assert arr.dtype == SparseDtype(np.bool_)
assert not arr.fill_value
arr = SparseArray([True, False, True], dtype=np.bool_, fill_value=True)
assert arr.dtype == SparseDtype(np.bool_, True)
assert arr.fill_value
def test_constructor_float32(self):
# GH 10648
data = np.array([1.0, np.nan, 3], dtype=np.float32)
arr = SparseArray(data, dtype=np.float32)
assert arr.dtype == SparseDtype(np.float32)
tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32))
# Behavior change: np.asarray densifies.
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
tm.assert_numpy_array_equal(
arr.sp_index.indices, np.array([0, 2], dtype=np.int32)
)
dense = arr.to_dense()
assert dense.dtype == np.float32
tm.assert_numpy_array_equal(dense, data)
def test_astype(self):
# float -> float
arr = SparseArray([None, None, 0, 2])
result = arr.astype("Sparse[float32]")
expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32"))
tm.assert_sp_array_equal(result, expected)
dtype = SparseDtype("float64", fill_value=0)
result = arr.astype(dtype)
expected = SparseArray._simple_new(
np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype
)
tm.assert_sp_array_equal(result, expected)
dtype = SparseDtype("int64", 0)
result = arr.astype(dtype)
expected = SparseArray._simple_new(
np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype
)
tm.assert_sp_array_equal(result, expected)
arr = SparseArray([0, np.nan, 0, 1], fill_value=0)
with pytest.raises(ValueError, match="NA"):
arr.astype("Sparse[i8]")
def test_astype_bool(self):
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
result = a.astype(bool)
expected = SparseArray(
[True, False, False, True], dtype=SparseDtype(bool, False)
)
tm.assert_sp_array_equal(result, expected)
# update fill value
result = a.astype(SparseDtype(bool, False))
expected = SparseArray(
[True, False, False, True], dtype=SparseDtype(bool, False)
)
tm.assert_sp_array_equal(result, expected)
def test_astype_all(self, any_real_numpy_dtype):
vals = np.array([1, 2, 3])
arr = SparseArray(vals, fill_value=1)
typ = np.dtype(any_real_numpy_dtype)
res = arr.astype(typ)
assert res.dtype == SparseDtype(typ, 1)
assert res.sp_values.dtype == typ
tm.assert_numpy_array_equal(np.asarray(res.to_dense()), vals.astype(typ))
@pytest.mark.parametrize(
"arr, dtype, expected",
[
(
SparseArray([0, 1]),
"float",
SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)),
),
(SparseArray([0, 1]), bool, SparseArray([False, True])),
(
SparseArray([0, 1], fill_value=1),
bool,
SparseArray([False, True], dtype=SparseDtype(bool, True)),
),
pytest.param(
SparseArray([0, 1]),
"datetime64[ns]",
SparseArray(
np.array([0, 1], dtype="datetime64[ns]"),
dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
),
marks=[pytest.mark.xfail(reason="NumPy-7619")],
),
(
SparseArray([0, 1, 10]),
str,
SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")),
),
(SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])),
(
SparseArray([0, 1, 0]),
object,
SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)),
),
],
)
def test_astype_more(self, arr, dtype, expected):
result = arr.astype(dtype)
tm.assert_sp_array_equal(result, expected)
def test_astype_nan_raises(self):
arr = SparseArray([1.0, np.nan])
with pytest.raises(ValueError, match="Cannot convert non-finite"):
arr.astype(int)
def test_astype_copy_false(self):
# GH#34456 bug caused by using .view instead of .astype in astype_nansafe
arr = SparseArray([1, 2, 3])
result = arr.astype(float, copy=False)
expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0)
tm.assert_sp_array_equal(result, expected)
def test_set_fill_value(self):
arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan)
arr.fill_value = 2
assert arr.fill_value == 2
arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
arr.fill_value = 2
assert arr.fill_value == 2
# XXX: this seems fine? You can construct an integer
# sparsearray with NaN fill value, why not update one?
# coerces to int
# msg = "unable to set fill_value 3\\.1 to int64 dtype"
# with pytest.raises(ValueError, match=msg):
arr.fill_value = 3.1
assert arr.fill_value == 3.1
# msg = "unable to set fill_value nan to int64 dtype"
# with pytest.raises(ValueError, match=msg):
arr.fill_value = np.nan
assert np.isnan(arr.fill_value)
arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
arr.fill_value = True
assert arr.fill_value
# coerces to bool
# XXX: we can construct an sparse array of bool
# type and use as fill_value any value
# msg = "fill_value must be True, False or nan"
# with pytest.raises(ValueError, match=msg):
# arr.fill_value = 0
# msg = "unable to set fill_value nan to bool dtype"
# with pytest.raises(ValueError, match=msg):
arr.fill_value = np.nan
assert np.isnan(arr.fill_value)
@pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)])
def test_set_fill_invalid_non_scalar(self, val):
arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
msg = "fill_value must be a scalar"
with pytest.raises(ValueError, match=msg):
arr.fill_value = val
def test_copy(self):
arr2 = self.arr.copy()
assert arr2.sp_values is not self.arr.sp_values
assert arr2.sp_index is self.arr.sp_index
def test_values_asarray(self):
tm.assert_almost_equal(self.arr.to_dense(), self.arr_data)
@pytest.mark.parametrize(
"data,shape,dtype",
[
([0, 0, 0, 0, 0], (5,), None),
([], (0,), None),
([0], (1,), None),
(["A", "A", np.nan, "B"], (4,), object),
],
)
def test_shape(self, data, shape, dtype):
# GH 21126
out = SparseArray(data, dtype=dtype)
assert out.shape == shape
@pytest.mark.parametrize(
"vals",
[
[np.nan, np.nan, np.nan, np.nan, np.nan],
[1, np.nan, np.nan, 3, np.nan],
[1, np.nan, 0, 3, 0],
],
)
@pytest.mark.parametrize("fill_value", [None, 0])
def test_dense_repr(self, vals, fill_value):
vals = np.array(vals)
arr = SparseArray(vals, fill_value=fill_value)
res = arr.to_dense()
tm.assert_numpy_array_equal(res, vals)
res2 = arr._internal_get_values()
tm.assert_numpy_array_equal(res2, vals)
def test_getitem(self):
def _checkit(i):
tm.assert_almost_equal(self.arr[i], self.arr.to_dense()[i])
for i in range(len(self.arr)):
_checkit(i)
_checkit(-i)
def test_getitem_arraylike_mask(self):
arr = SparseArray([0, 1, 2])
result = arr[[True, False, True]]
expected = SparseArray([0, 2])
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize(
"slc",
[
np.s_[:],
np.s_[1:10],
np.s_[1:100],
np.s_[10:1],
np.s_[:-3],
np.s_[-5:-4],
np.s_[:-12],
np.s_[-12:],
np.s_[2:],
np.s_[2::3],
np.s_[::2],
np.s_[::-1],
np.s_[::-2],
np.s_[1:6:2],
np.s_[:-6:-2],
],
)
@pytest.mark.parametrize(
"as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []]
)
def test_getslice(self, slc, as_dense):
as_dense = np.array(as_dense)
arr = SparseArray(as_dense)
result = arr[slc]
expected = SparseArray(as_dense[slc])
tm.assert_sp_array_equal(result, expected)
def test_getslice_tuple(self):
dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])
sparse = SparseArray(dense)
res = sparse[(slice(4, None),)]
exp = SparseArray(dense[4:])
tm.assert_sp_array_equal(res, exp)
sparse = SparseArray(dense, fill_value=0)
res = sparse[(slice(4, None),)]
exp = SparseArray(dense[4:], fill_value=0)
tm.assert_sp_array_equal(res, exp)
msg = "too many indices for array"
with pytest.raises(IndexError, match=msg):
sparse[4:, :]
with pytest.raises(IndexError, match=msg):
# check numpy compat
dense[4:, :]
def test_boolean_slice_empty(self):
arr = SparseArray([0, 1, 2])
res = arr[[False, False, False]]
assert res.dtype == arr.dtype
def test_neg_operator(self):
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
res = -arr
exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)
arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
res = -arr
exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)
def test_abs_operator(self):
arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
res = abs(arr)
exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)
arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
res = abs(arr)
exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
tm.assert_sp_array_equal(exp, res)
def test_invert_operator(self):
arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8)
res = ~arr
exp = SparseArray(
np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8
)
res = ~arr
tm.assert_sp_array_equal(exp, res)
arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
res = ~arr
exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
def test_binary_operators(self, op):
op = getattr(operator, op)
data1 = np.random.randn(20)
data2 = np.random.randn(20)
data1[::2] = np.nan
data2[::3] = np.nan
arr1 = SparseArray(data1)
arr2 = SparseArray(data2)
data1[::2] = 3
data2[::3] = 3
farr1 = SparseArray(data1, fill_value=3)
farr2 = SparseArray(data2, fill_value=3)
def _check_op(op, first, second):
res = op(first, second)
exp = SparseArray(
op(first.to_dense(), second.to_dense()), fill_value=first.fill_value
)
assert isinstance(res, SparseArray)
tm.assert_almost_equal(res.to_dense(), exp.to_dense())
res2 = op(first, second.to_dense())
assert isinstance(res2, SparseArray)
tm.assert_sp_array_equal(res, res2)
res3 = op(first.to_dense(), second)
assert isinstance(res3, SparseArray)
tm.assert_sp_array_equal(res, res3)
res4 = op(first, 4)
assert isinstance(res4, SparseArray)
# Ignore this if the actual op raises (e.g. pow).
try:
exp = op(first.to_dense(), 4)
exp_fv = op(first.fill_value, 4)
except ValueError:
pass
else:
tm.assert_almost_equal(res4.fill_value, exp_fv)
tm.assert_almost_equal(res4.to_dense(), exp)
with np.errstate(all="ignore"):
for first_arr, second_arr in [(arr1, arr2), (farr1, farr2)]:
_check_op(op, first_arr, second_arr)
def test_pickle(self):
def _check_roundtrip(obj):
unpickled = tm.round_trip_pickle(obj)
tm.assert_sp_array_equal(unpickled, obj)
_check_roundtrip(self.arr)
_check_roundtrip(self.zarr)
def test_generator_warnings(self):
sp_arr = SparseArray([1, 2, 3])
with warnings.catch_warnings(record=True) as w:
warnings.filterwarnings(action="always", category=DeprecationWarning)
warnings.filterwarnings(action="always", category=PendingDeprecationWarning)
for _ in sp_arr:
pass
assert len(w) == 0
def test_fillna(self):
s = SparseArray([1, np.nan, np.nan, 3, np.nan])
res = s.fillna(-1)
exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
res = s.fillna(-1)
exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([1, np.nan, 0, 3, 0])
res = s.fillna(-1)
exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0)
res = s.fillna(-1)
exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([np.nan, np.nan, np.nan, np.nan])
res = s.fillna(-1)
exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0)
res = s.fillna(-1)
exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
# float dtype's fill_value is np.nan, replaced by -1
s = SparseArray([0.0, 0.0, 0.0, 0.0])
res = s.fillna(-1)
exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1)
tm.assert_sp_array_equal(res, exp)
# int dtype shouldn't have missing. No changes.
s = SparseArray([0, 0, 0, 0])
assert s.dtype == SparseDtype(np.int64)
assert s.fill_value == 0
res = s.fillna(-1)
tm.assert_sp_array_equal(res, s)
s = SparseArray([0, 0, 0, 0], fill_value=0)
assert s.dtype == SparseDtype(np.int64)
assert s.fill_value == 0
res = s.fillna(-1)
exp = SparseArray([0, 0, 0, 0], fill_value=0)
tm.assert_sp_array_equal(res, exp)
# fill_value can be nan if there is no missing hole.
# only fill_value will be changed
s = SparseArray([0, 0, 0, 0], fill_value=np.nan)
assert s.dtype == SparseDtype(np.int64, fill_value=np.nan)
assert np.isnan(s.fill_value)
res = s.fillna(-1)
exp = SparseArray([0, 0, 0, 0], fill_value=-1)
tm.assert_sp_array_equal(res, exp)
def test_fillna_overlap(self):
s = SparseArray([1, np.nan, np.nan, 3, np.nan])
# filling with existing value doesn't replace existing value with
# fill_value, i.e. existing 3 remains in sp_values
res = s.fillna(3)
exp = np.array([1, 3, 3, 3, 3], dtype=np.float64)
tm.assert_numpy_array_equal(res.to_dense(), exp)
s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
res = s.fillna(3)
exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
def test_nonzero(self):
# Tests regression #21172.
sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
expected = np.array([2, 5, 9], dtype=np.int32)
(result,) = sa.nonzero()
tm.assert_numpy_array_equal(expected, result)
sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
(result,) = sa.nonzero()
tm.assert_numpy_array_equal(expected, result)
class TestSparseArrayAnalytics:
@pytest.mark.parametrize(
"data,pos,neg",
[
([True, True, True], True, False),
([1, 2, 1], 1, 0),
([1.0, 2.0, 1.0], 1.0, 0.0),
],
)
def test_all(self, data, pos, neg):
# GH 17570
out = SparseArray(data).all()
assert out
out = SparseArray(data, fill_value=pos).all()
assert out
data[1] = neg
out = SparseArray(data).all()
assert not out
out = SparseArray(data, fill_value=pos).all()
assert not out
@pytest.mark.parametrize(
"data,pos,neg",
[
([True, True, True], True, False),
([1, 2, 1], 1, 0),
([1.0, 2.0, 1.0], 1.0, 0.0),
],
)
def test_numpy_all(self, data, pos, neg):
# GH 17570
out = np.all(SparseArray(data))
assert out
out = np.all(SparseArray(data, fill_value=pos))
assert out
data[1] = neg
out = np.all(SparseArray(data))
assert not out
out = np.all(SparseArray(data, fill_value=pos))
assert not out
# raises with a different message on py2.
msg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.all(SparseArray(data), out=np.array([]))
@pytest.mark.parametrize(
"data,pos,neg",
[
([False, True, False], True, False),
([0, 2, 0], 2, 0),
([0.0, 2.0, 0.0], 2.0, 0.0),
],
)
def test_any(self, data, pos, neg):
# GH 17570
out = SparseArray(data).any()
assert out
out = SparseArray(data, fill_value=pos).any()
assert out
data[1] = neg
out = SparseArray(data).any()
assert not out
out = SparseArray(data, fill_value=pos).any()
assert not out
@pytest.mark.parametrize(
"data,pos,neg",
[
([False, True, False], True, False),
([0, 2, 0], 2, 0),
([0.0, 2.0, 0.0], 2.0, 0.0),
],
)
def test_numpy_any(self, data, pos, neg):
# GH 17570
out = np.any(SparseArray(data))
assert out
out = np.any(SparseArray(data, fill_value=pos))
assert out
data[1] = neg
out = np.any(SparseArray(data))
assert not out
out = np.any(SparseArray(data, fill_value=pos))
assert not out
msg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.any(SparseArray(data), out=out)
def test_sum(self):
data = np.arange(10).astype(float)
out = SparseArray(data).sum()
assert out == 45.0
data[5] = np.nan
out = SparseArray(data, fill_value=2).sum()
assert out == 40.0
out = SparseArray(data, fill_value=np.nan).sum()
assert out == 40.0
@pytest.mark.parametrize(
"arr",
[np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
)
@pytest.mark.parametrize("fill_value", [0, 1, np.nan])
@pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
def test_sum_min_count(self, arr, fill_value, min_count, expected):
# https://github.com/pandas-dev/pandas/issues/25777
sparray = SparseArray(arr, fill_value=fill_value)
result = sparray.sum(min_count=min_count)
if np.isnan(expected):
assert np.isnan(result)
else:
assert result == expected
def test_bool_sum_min_count(self):
spar_bool = pd.arrays.SparseArray(
[False, True] * 5, dtype=np.bool8, fill_value=True
)
res = spar_bool.sum(min_count=1)
assert res == 5
res = spar_bool.sum(min_count=11)
assert isna(res)
def test_numpy_sum(self):
data = np.arange(10).astype(float)
out = np.sum(SparseArray(data))
assert out == 45.0
data[5] = np.nan
out = np.sum(SparseArray(data, fill_value=2))
assert out == 40.0
out = np.sum(SparseArray(data, fill_value=np.nan))
assert out == 40.0
msg = "the 'dtype' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.sum(SparseArray(data), dtype=np.int64)
msg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.sum(SparseArray(data), out=out)
@pytest.mark.parametrize(
"data,expected",
[
(
np.array([1, 2, 3, 4, 5], dtype=float), # non-null data
SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])),
),
(
np.array([1, 2, np.nan, 4, 5], dtype=float), # null data
SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])),
),
],
)
@pytest.mark.parametrize("numpy", [True, False])
def test_cumsum(self, data, expected, numpy):
cumsum = np.cumsum if numpy else lambda s: s.cumsum()
out = cumsum(SparseArray(data))
tm.assert_sp_array_equal(out, expected)
out = cumsum(SparseArray(data, fill_value=np.nan))
tm.assert_sp_array_equal(out, expected)
out = cumsum(SparseArray(data, fill_value=2))
tm.assert_sp_array_equal(out, expected)
if numpy: # numpy compatibility checks.
msg = "the 'dtype' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.cumsum(SparseArray(data), dtype=np.int64)
msg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.cumsum(SparseArray(data), out=out)
else:
axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid.
msg = re.escape(f"axis(={axis}) out of bounds")
with pytest.raises(ValueError, match=msg):
SparseArray(data).cumsum(axis=axis)
def test_mean(self):
data = np.arange(10).astype(float)
out = SparseArray(data).mean()
assert out == 4.5
data[5] = np.nan
out = SparseArray(data).mean()
assert out == 40.0 / 9
def test_numpy_mean(self):
data = np.arange(10).astype(float)
out = np.mean(SparseArray(data))
assert out == 4.5
data[5] = np.nan
out = np.mean(SparseArray(data))
assert out == 40.0 / 9
msg = "the 'dtype' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.mean(SparseArray(data), dtype=np.int64)
msg = "the 'out' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.mean(SparseArray(data), out=out)
def test_ufunc(self):
# GH 13853 make sure ufunc is applied to fill_value
sparse = SparseArray([1, np.nan, 2, np.nan, -2])
result = SparseArray([1, np.nan, 2, np.nan, 2])
tm.assert_sp_array_equal(abs(sparse), result)
tm.assert_sp_array_equal(np.abs(sparse), result)
sparse = SparseArray([1, -1, 2, -2], fill_value=1)
result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
tm.assert_sp_array_equal(abs(sparse), result)
tm.assert_sp_array_equal(np.abs(sparse), result)
sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
exp = SparseArray([1, 1, 2, 2], fill_value=1)
tm.assert_sp_array_equal(abs(sparse), exp)
tm.assert_sp_array_equal(np.abs(sparse), exp)
sparse = SparseArray([1, np.nan, 2, np.nan, -2])
result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))
tm.assert_sp_array_equal(np.sin(sparse), result)
sparse = SparseArray([1, -1, 2, -2], fill_value=1)
result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1))
tm.assert_sp_array_equal(np.sin(sparse), result)
sparse = SparseArray([1, -1, 0, -2], fill_value=0)
result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0))
tm.assert_sp_array_equal(np.sin(sparse), result)
def test_ufunc_args(self):
# GH 13853 make sure ufunc is applied to fill_value, including its arg
sparse = SparseArray([1, np.nan, 2, np.nan, -2])
result = SparseArray([2, np.nan, 3, np.nan, -1])
tm.assert_sp_array_equal(np.add(sparse, 1), result)
sparse = SparseArray([1, -1, 2, -2], fill_value=1)
result = SparseArray([2, 0, 3, -1], fill_value=2)
tm.assert_sp_array_equal(np.add(sparse, 1), result)
sparse = SparseArray([1, -1, 0, -2], fill_value=0)
result = SparseArray([2, 0, 1, -1], fill_value=1)
tm.assert_sp_array_equal(np.add(sparse, 1), result)
@pytest.mark.parametrize("fill_value", [0.0, np.nan])
def test_modf(self, fill_value):
# https://github.com/pandas-dev/pandas/issues/26946
sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value)
r1, r2 = np.modf(sparse)
e1, e2 = np.modf(np.asarray(sparse))
tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value))
tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value))
def test_nbytes_integer(self):
arr = SparseArray([1, 0, 0, 0, 2], kind="integer")
result = arr.nbytes
# (2 * 8) + 2 * 4
assert result == 24
def test_nbytes_block(self):
arr = SparseArray([1, 2, 0, 0, 0], kind="block")
result = arr.nbytes
# (2 * 8) + 4 + 4
# sp_values, blocs, blengths
assert result == 24
def test_asarray_datetime64(self):
s = SparseArray(pd.to_datetime(["2012", None, None, "2013"]))
np.asarray(s)
def test_density(self):
arr = SparseArray([0, 1])
assert arr.density == 0.5
def test_npoints(self):
arr = SparseArray([0, 1])
assert arr.npoints == 1
class TestAccessor:
@pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"])
def test_get_attributes(self, attr):
arr = SparseArray([0, 1])
ser = pd.Series(arr)
result = getattr(ser.sparse, attr)
expected = getattr(arr, attr)
assert result == expected
@td.skip_if_no_scipy
def test_from_coo(self):
import scipy.sparse
row = [0, 3, 1, 0]
col = [0, 3, 1, 2]
data = [4, 5, 7, 9]
# TODO(scipy#13585): Remove dtype when scipy is fixed
# https://github.com/scipy/scipy/issues/13585
sp_array = scipy.sparse.coo_matrix((data, (row, col)), dtype="int")
result = pd.Series.sparse.from_coo(sp_array)
index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]])
expected = pd.Series([4, 9, 7, 5], index=index, dtype="Sparse[int]")
tm.assert_series_equal(result, expected)
@td.skip_if_no_scipy
@pytest.mark.parametrize(
"sort_labels, expected_rows, expected_cols, expected_values_pos",
[
(
False,
[("b", 2), ("a", 2), ("b", 1), ("a", 1)],
[("z", 1), ("z", 2), ("x", 2), ("z", 0)],
{1: (1, 0), 3: (3, 3)},
),
(
True,
[("a", 1), ("a", 2), ("b", 1), ("b", 2)],
[("x", 2), ("z", 0), ("z", 1), ("z", 2)],
{1: (1, 2), 3: (0, 1)},
),
],
)
def test_to_coo(
self, sort_labels, expected_rows, expected_cols, expected_values_pos
):
import scipy.sparse
values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0)
index = pd.MultiIndex.from_tuples(
[
("b", 2, "z", 1),
("a", 2, "z", 2),
("a", 2, "z", 1),
("a", 2, "x", 2),
("b", 1, "z", 1),
("a", 1, "z", 0),
]
)
ss = pd.Series(values, index=index)
expected_A = np.zeros((4, 4))
for value, (row, col) in expected_values_pos.items():
expected_A[row, col] = value
A, rows, cols = ss.sparse.to_coo(
row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels
)
assert isinstance(A, scipy.sparse.coo_matrix)
tm.assert_numpy_array_equal(A.toarray(), expected_A)
assert rows == expected_rows
assert cols == expected_cols
def test_non_sparse_raises(self):
ser = pd.Series([1, 2, 3])
with pytest.raises(AttributeError, match=".sparse"):
ser.sparse.density
def test_setting_fill_value_fillna_still_works():
# This is why letting users update fill_value / dtype is bad
# astype has the same problem.
arr = SparseArray([1.0, np.nan, 1.0], fill_value=0.0)
arr.fill_value = np.nan
result = arr.isna()
# Can't do direct comparison, since the sp_index will be different
# So let's convert to ndarray and check there.
result = np.asarray(result)
expected = np.array([False, True, False])
tm.assert_numpy_array_equal(result, expected)
def test_setting_fill_value_updates():
arr = SparseArray([0.0, np.nan], fill_value=0)
arr.fill_value = np.nan
# use private constructor to get the index right
# otherwise both nans would be un-stored.
expected = SparseArray._simple_new(
sparse_array=np.array([np.nan]),
sparse_index=IntIndex(2, [1]),
dtype=SparseDtype(float, np.nan),
)
tm.assert_sp_array_equal(arr, expected)
@pytest.mark.parametrize(
"arr, loc",
[
([None, 1, 2], 0),
([0, None, 2], 1),
([0, 1, None], 2),
([0, 1, 1, None, None], 3),
([1, 1, 1, 2], -1),
([], -1),
],
)
def test_first_fill_value_loc(arr, loc):
result = SparseArray(arr)._first_fill_value_loc()
assert result == loc
@pytest.mark.parametrize(
"arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]]
)
@pytest.mark.parametrize("fill_value", [np.nan, 0, 1])
def test_unique_na_fill(arr, fill_value):
a = SparseArray(arr, fill_value=fill_value).unique()
b = pd.Series(arr).unique()
assert isinstance(a, SparseArray)
a = np.asarray(a)
tm.assert_numpy_array_equal(a, b)
def test_unique_all_sparse():
# https://github.com/pandas-dev/pandas/issues/23168
arr = SparseArray([0, 0])
result = arr.unique()
expected = SparseArray([0])
tm.assert_sp_array_equal(result, expected)
def test_map():
arr = SparseArray([0, 1, 2])
expected = SparseArray([10, 11, 12], fill_value=10)
# dict
result = arr.map({0: 10, 1: 11, 2: 12})
tm.assert_sp_array_equal(result, expected)
# series
result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
tm.assert_sp_array_equal(result, expected)
# function
result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
expected = SparseArray([10, 11, 12], fill_value=10)
tm.assert_sp_array_equal(result, expected)
def test_map_missing():
arr = SparseArray([0, 1, 2])
expected = SparseArray([10, 11, None], fill_value=10)
result = arr.map({0: 10, 1: 11})
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("fill_value", [np.nan, 1])
def test_dropna(fill_value):
# GH-28287
arr = SparseArray([np.nan, 1], fill_value=fill_value)
exp = SparseArray([1.0], fill_value=fill_value)
tm.assert_sp_array_equal(arr.dropna(), exp)
df = pd.DataFrame({"a": [0, 1], "b": arr})
expected_df = pd.DataFrame({"a": [1], "b": exp}, index=Int64Index([1]))
tm.assert_equal(df.dropna(), expected_df)
def test_drop_duplicates_fill_value():
# GH 11726
df = pd.DataFrame(np.zeros((5, 5))).apply(lambda x: SparseArray(x, fill_value=0))
result = df.drop_duplicates()
expected = pd.DataFrame({i: SparseArray([0.0], fill_value=0) for i in range(5)})
tm.assert_frame_equal(result, expected)
class TestMinMax:
@pytest.mark.parametrize(
"raw_data,max_expected,min_expected",
[
(np.arange(5.0), [4], [0]),
(-np.arange(5.0), [0], [-4]),
(np.array([0, 1, 2, np.nan, 4]), [4], [0]),
(np.array([np.nan] * 5), [np.nan], [np.nan]),
(np.array([]), [np.nan], [np.nan]),
],
)
def test_nan_fill_value(self, raw_data, max_expected, min_expected):
arr = SparseArray(raw_data)
max_result = arr.max()
min_result = arr.min()
assert max_result in max_expected
assert min_result in min_expected
max_result = arr.max(skipna=False)
min_result = arr.min(skipna=False)
if np.isnan(raw_data).any():
assert np.isnan(max_result)
assert np.isnan(min_result)
else:
assert max_result in max_expected
assert min_result in min_expected
@pytest.mark.parametrize(
"fill_value,max_expected,min_expected",
[
(100, 100, 0),
(-100, 1, -100),
],
)
def test_fill_value(self, fill_value, max_expected, min_expected):
arr = SparseArray(
np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value)
)
max_result = arr.max()
assert max_result == max_expected
min_result = arr.min()
assert min_result == min_expected
def test_only_fill_value(self):
fv = 100
arr = SparseArray(np.array([fv, fv, fv]), dtype=SparseDtype("int", fv))
assert len(arr._valid_sp_values) == 0
assert arr.max() == fv
assert arr.min() == fv
assert arr.max(skipna=False) == fv
assert arr.min(skipna=False) == fv
@pytest.mark.parametrize("func", ["min", "max"])
@pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])])
@pytest.mark.parametrize(
"dtype,expected",
[
(SparseDtype(np.float64, np.nan), np.nan),
(SparseDtype(np.float64, 5.0), np.nan),
(SparseDtype("datetime64[ns]", pd.NaT), pd.NaT),
(SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT),
],
)
def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
arr = SparseArray(data, dtype=dtype)
result = getattr(arr, func)()
if expected is pd.NaT:
# TODO: pin down whether we wrap datetime64("NaT")
assert result is pd.NaT or np.isnat(result)
else:
assert np.isnan(result)