A PyQt GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be stored and compiles monthly and yearly summaries.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/tests/internals/test_internals.py

1410 lines
48 KiB

from datetime import (
date,
datetime,
)
import itertools
import re
import numpy as np
import pytest
from pandas._libs.internals import BlockPlacement
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_scalar
import pandas as pd
from pandas import (
Categorical,
DataFrame,
DatetimeIndex,
Index,
IntervalIndex,
Series,
Timedelta,
Timestamp,
period_range,
)
import pandas._testing as tm
import pandas.core.algorithms as algos
from pandas.core.arrays import (
DatetimeArray,
SparseArray,
TimedeltaArray,
)
from pandas.core.internals import (
BlockManager,
SingleBlockManager,
make_block,
)
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block,
)
# This file contains BlockManager-specific tests.
# TODO(ArrayManager) factor out interleave_dtype tests
# Module-level pytest mark: pytest applies it to every test collected from
# this module (presumably skipping them when the ArrayManager backend is
# active -- confirm against pandas.util._test_decorators).
pytestmark = td.skip_array_manager_invalid_test
@pytest.fixture(params=[new_block, make_block])
def block_maker(request):
    """
    Parametrized fixture that hands out each block constructor in turn:
    the internal ``new_block`` and the pseudo-public ``make_block``.
    """
    maker = request.param
    return maker
@pytest.fixture
def mgr():
    """
    BlockManager with twelve items (``a`` .. ``l``) mixing float, object,
    bool, int, complex, naive-datetime and tz-aware datetime columns,
    built from a ``create_mgr`` description string.
    """
    descr = (
        "a: f8; b: object; c: f8; d: object; e: f8;"
        "f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;"
        "k: M8[ns, US/Eastern]; l: M8[ns, CET];"
    )
    return create_mgr(descr)
def assert_block_equal(left, right):
    """
    Assert that two blocks match: same values, same dtype, and identical
    ``BlockPlacement`` locations.
    """
    tm.assert_numpy_array_equal(left.values, right.values)
    assert left.dtype == right.dtype

    # both placements must be real BlockPlacement objects before comparing
    for blk in (left, right):
        assert isinstance(blk.mgr_locs, BlockPlacement)

    left_locs = left.mgr_locs.as_array
    right_locs = right.mgr_locs.as_array
    tm.assert_numpy_array_equal(left_locs, right_locs)
def get_numeric_mat(shape):
    """
    Return a new integer array of the given ``shape`` in which every entry
    equals its index along the first axis (slice ``i`` is filled with ``i``).
    """
    n_rows = shape[0]
    trailing_ones = (1,) * (len(shape) - 1)
    column = np.arange(n_rows).reshape((n_rows,) + trailing_ones)
    # broadcast_to gives a read-only view; copy to get an independent,
    # writable array.
    return np.broadcast_to(column, shape).copy()
# Default length of the per-item axis used when building test blocks and
# managers; the hard-coded categorical and sparse values in create_block
# contain exactly this many entries.
N = 10
def create_block(typestr, placement, item_shape=None, num_offset=0, maker=new_block):
    """
    Build a test block of the kind named by ``typestr``.

    The block has one row per entry of ``placement`` and trailing dimensions
    ``item_shape`` (``(N,)`` when omitted).  ``num_offset`` shifts the
    generated numeric / object / sparse values, and ``maker`` is the
    constructor called as ``maker(values, placement=..., ndim=...)``.

    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    Raises ValueError for any other typestr.
    """
    placement = BlockPlacement(placement)
    n_items = len(placement)

    if item_shape is None:
        item_shape = (N,)
    shape = (n_items,) + item_shape
    ndim = len(shape)

    base = get_numeric_mat(shape)

    real_numeric = (
        "float",
        "f8",
        "f4",
        "f2",
        "int",
        "i8",
        "i4",
        "i2",
        "i1",
        "uint",
        "u8",
        "u4",
        "u2",
        "u1",
    )

    if typestr in real_numeric:
        values = base.astype(typestr) + num_offset
    elif typestr in ("complex", "c16", "c8"):
        values = 1.0j * (base.astype(typestr) + num_offset)
    elif typestr in ("object", "string", "O"):
        labels = [f"A{i:d}" for i in base.ravel() + num_offset]
        values = np.reshape(labels, shape)
    elif typestr in ("b", "bool"):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ("datetime", "dt", "M8[ns]"):
        values = (base * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz
        tz_match = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert tz_match is not None, f"incompatible typestr -> {typestr}"
        tz = tz_match.group(1)
        assert n_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)._data
        values = ensure_block_shape(values, ndim=ndim)
    elif typestr in ("timedelta", "td", "m8[ns]"):
        values = (base * 1).astype("m8[ns]")
    elif typestr == "category":
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr == "category2":
        values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"])
    elif typestr in ("sparse", "sparse_na"):
        if shape[-1] != 10:
            # We also are implicitly assuming this in the category cases above
            raise NotImplementedError
        assert all(s == 1 for s in shape[:-1])
        fill_value = np.nan if typestr.endswith("_na") else 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
            fill_value=fill_value,
        )
        # shift the stored (non-fill) values in place through a view
        sp_view = values.sp_values.view()
        sp_view += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return maker(values, placement=placement, ndim=ndim)
def create_single_mgr(typestr, num_rows=None):
    """
    Build a SingleBlockManager holding one ``typestr`` block of ``num_rows``
    entries (``N`` when omitted), indexed by ``0 .. num_rows - 1``.
    """
    n = N if num_rows is None else num_rows
    block = create_block(typestr, placement=slice(0, n), item_shape=())
    axis = Index(np.arange(n))
    return SingleBlockManager(block, axis)
def create_mgr(descr, item_shape=None):
    """
    Build a BlockManager from a compact string description.

    The syntax resembles the np.matrix initializer, for example::

        a,b,c: f8; d,e,f: i8

    Rules:

    * components are separated by semicolons
    * each component has the form ``NAME,NAME,NAME: DTYPE_ID``
    * whitespace around colons and semicolons is stripped
    * the dtypes understood are those listed in ``create_block``
    * components sharing a DTYPE_ID are merged into a single block
    * append ``-SUFFIX`` to keep same-dtype components in separate blocks::

        'a:f8-1; b:f8-2; c:f8-foobar'
    """
    if item_shape is None:
        item_shape = (N,)

    # First pass: collect item names in order and group their positions by
    # block identifier (dict insertion order is preserved).
    all_names = []
    locs_by_block = {}
    next_loc = 0
    for component in descr.split(";"):
        component = component.strip()
        if not component:
            continue
        raw_names, _, blockstr = component.partition(":")
        blockstr = blockstr.strip()
        names = raw_names.strip().split(",")
        all_names.extend(names)

        locs = list(np.arange(len(names)) + next_loc)
        locs_by_block.setdefault(blockstr, []).extend(locs)
        next_loc += len(names)

    items = Index(all_names)

    # Second pass: one block per identifier; everything before the first
    # "-" is the actual dtype string.
    blocks = []
    num_offset = 0
    for blockstr, locs in locs_by_block.items():
        typestr = blockstr.partition("-")[0]
        block = create_block(
            typestr, locs, item_shape=item_shape, num_offset=num_offset
        )
        blocks.append(block)
        num_offset += len(locs)

    ordered = sorted(blocks, key=lambda blk: blk.mgr_locs[0])
    axes = [items]
    axes.extend(Index(np.arange(n)) for n in item_shape)
    return BlockManager(tuple(ordered), axes)
class TestBlock:
    """Unit tests for single Block objects produced by ``create_block``."""

    def setup_method(self, method):
        # fresh blocks for every test, one per dtype family
        self.fblock = create_block("float", [0, 2, 4])
        self.cblock = create_block("complex", [7])
        self.oblock = create_block("object", [1, 3])
        self.bool_block = create_block("bool", [5])

    def test_constructor(self):
        # the "i4" typestr must give a 32-bit integer block
        assert create_block("i4", [0]).dtype == np.int32

    @pytest.mark.parametrize(
        "typ, data",
        [
            ("float", [0, 2, 4]),
            ("complex", [7]),
            ("object", [1, 3]),
            ("bool", [5]),
        ],
    )
    def test_pickle(self, typ, data):
        original = create_block(typ, data)
        restored = tm.round_trip_pickle(original)
        assert_block_equal(restored, original)

    def test_mgr_locs(self):
        locs = self.fblock.mgr_locs
        assert isinstance(locs, BlockPlacement)
        expected = np.array([0, 2, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(locs.as_array, expected)

    def test_attrs(self):
        blk = self.fblock
        vals = blk.values
        assert blk.shape == vals.shape
        assert blk.dtype == vals.dtype
        assert len(blk) == len(vals)

    def test_copy(self):
        duplicate = self.fblock.copy()
        assert duplicate is not self.fblock
        assert_block_equal(self.fblock, duplicate)

    def test_delete(self):
        # remove the first row
        trimmed = self.fblock.copy()
        trimmed.delete(0)
        assert isinstance(trimmed.mgr_locs, BlockPlacement)
        expected_locs = np.array([2, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(trimmed.mgr_locs.as_array, expected_locs)
        assert (trimmed.values[0] == 1).all()

        # remove the middle row
        trimmed = self.fblock.copy()
        trimmed.delete(1)
        assert isinstance(trimmed.mgr_locs, BlockPlacement)
        expected_locs = np.array([0, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(trimmed.mgr_locs.as_array, expected_locs)
        assert (trimmed.values[1] == 2).all()

        # remove the last row
        trimmed = self.fblock.copy()
        trimmed.delete(2)
        expected_locs = np.array([0, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(trimmed.mgr_locs.as_array, expected_locs)
        assert (trimmed.values[1] == 1).all()

        # out-of-bounds position
        trimmed = self.fblock.copy()
        with pytest.raises(IndexError, match=None):
            trimmed.delete(3)

    def test_delete_datetimelike(self):
        # dont use np.delete on values, as that will coerce from DTA/TDA to ndarray
        raw = np.arange(20, dtype="i8").reshape(5, 4).view("m8[ns]")

        td_frame = DataFrame(raw)
        td_blk = td_frame._mgr.blocks[0]
        assert isinstance(td_blk.values, TimedeltaArray)
        td_blk.delete(1)
        assert isinstance(td_blk.values, TimedeltaArray)

        dt_frame = DataFrame(raw.view("M8[ns]"))
        dt_blk = dt_frame._mgr.blocks[0]
        assert isinstance(dt_blk.values, DatetimeArray)
        dt_blk.delete([1, 3])
        assert isinstance(dt_blk.values, DatetimeArray)

    def test_split(self):
        # GH#37799
        values = np.random.randn(3, 4)
        locs = [3, 1, 6]
        blk = new_block(values, placement=locs, ndim=2)
        pieces = blk._split()

        # check that we get views, not copies
        values[:] = -9999
        assert (blk.values == -9999).all()

        assert len(pieces) == 3
        expected = [
            new_block(values[[row]], placement=[loc], ndim=2)
            for row, loc in enumerate(locs)
        ]
        for piece, exp in zip(pieces, expected):
            assert_block_equal(piece, exp)

    def test_is_categorical_deprecated(self):
        # GH#40571
        float_blk = self.fblock
        with tm.assert_produces_warning(DeprecationWarning):
            float_blk.is_categorical
class TestBlockManager:
    """
    Tests for BlockManager: construction, pickling, item get/set, copying,
    dtype interleaving via ``as_array``, astype/convert, consolidation,
    reindexing, numeric/bool selection and equality.  Managers are built
    from ``create_mgr`` description strings.
    """

    def test_attrs(self):
        """Two suffixed f8 groups give two blocks covering six items."""
        mgr = create_mgr("a,b,c: f8-1; d,e,f: f8-2")
        assert mgr.nblocks == 2
        assert len(mgr) == 6

    def test_duplicate_ref_loc_failure(self):
        """Overlapping block placements are rejected; disjoint ones work."""
        tmp_mgr = create_mgr("a:bool; a: f8")

        axes, blocks = tmp_mgr.axes, tmp_mgr.blocks

        blocks[0].mgr_locs = BlockPlacement(np.array([0]))
        blocks[1].mgr_locs = BlockPlacement(np.array([0]))

        # test trying to create block manager with overlapping ref locs
        msg = "Gaps in blk ref_locs"

        with pytest.raises(AssertionError, match=msg):
            mgr = BlockManager(blocks, axes)
            mgr._rebuild_blknos_and_blklocs()

        blocks[0].mgr_locs = BlockPlacement(np.array([0]))
        blocks[1].mgr_locs = BlockPlacement(np.array([1]))
        mgr = BlockManager(blocks, axes)
        mgr.iget(1)

    def test_pickle(self, mgr):
        """A pickled manager round-trips and resets consolidation flags."""
        mgr2 = tm.round_trip_pickle(mgr)
        tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))

        # GH2431
        assert hasattr(mgr2, "_is_consolidated")
        assert hasattr(mgr2, "_known_consolidated")

        # reset to False on load
        assert not mgr2._is_consolidated
        assert not mgr2._known_consolidated

    @pytest.mark.parametrize("mgr_string", ["a,a,a:f8", "a: f8; a: i8"])
    def test_non_unique_pickle(self, mgr_string):
        """Managers with duplicate item names survive a pickle round trip."""
        mgr = create_mgr(mgr_string)
        mgr2 = tm.round_trip_pickle(mgr)
        tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))

    def test_categorical_block_pickle(self):
        """Categorical blocks round-trip in both 2D and single managers."""
        mgr = create_mgr("a: category")
        mgr2 = tm.round_trip_pickle(mgr)
        tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))

        smgr = create_single_mgr("category")
        smgr2 = tm.round_trip_pickle(smgr)
        tm.assert_series_equal(Series(smgr), Series(smgr2))

    def test_iget(self):
        """``iget(i)`` returns the i-th row of the single block's values."""
        cols = Index(list("abc"))
        values = np.random.rand(3, 3)
        block = new_block(
            values=values.copy(),
            placement=np.arange(3, dtype=np.intp),
            ndim=values.ndim,
        )
        mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))])

        for loc in range(3):
            tm.assert_almost_equal(mgr.iget(loc).internal_values(), values[loc])

    def test_set(self):
        """``insert`` and ``iset`` with object values leave other items alone."""
        mgr = create_mgr("a,b,c: int", item_shape=(3,))

        mgr.insert(len(mgr.items), "d", np.array(["foo"] * 3))
        mgr.iset(1, np.array(["bar"] * 3))
        tm.assert_numpy_array_equal(mgr.iget(0).internal_values(), np.array([0] * 3))
        tm.assert_numpy_array_equal(
            mgr.iget(1).internal_values(), np.array(["bar"] * 3, dtype=np.object_)
        )
        tm.assert_numpy_array_equal(mgr.iget(2).internal_values(), np.array([2] * 3))
        tm.assert_numpy_array_equal(
            mgr.iget(3).internal_values(), np.array(["foo"] * 3, dtype=np.object_)
        )

    def test_set_change_dtype(self, mgr):
        """Setting values of another dtype changes the item's dtype."""
        mgr.insert(len(mgr.items), "baz", np.zeros(N, dtype=bool))

        mgr.iset(mgr.items.get_loc("baz"), np.repeat("foo", N))
        idx = mgr.items.get_loc("baz")
        assert mgr.iget(idx).dtype == np.object_

        mgr2 = mgr.consolidate()
        mgr2.iset(mgr2.items.get_loc("baz"), np.repeat("foo", N))
        idx = mgr2.items.get_loc("baz")
        assert mgr2.iget(idx).dtype == np.object_

        mgr2.insert(len(mgr2.items), "quux", np.random.randn(N).astype(int))
        idx = mgr2.items.get_loc("quux")
        assert mgr2.iget(idx).dtype == np.int_

        mgr2.iset(mgr2.items.get_loc("quux"), np.random.randn(N))
        # np.float64 rather than the np.float_ alias, which NumPy 2.0 removed
        assert mgr2.iget(idx).dtype == np.float64

    def test_copy(self, mgr):
        """Shallow copies share block data; deep copies do not."""
        cp = mgr.copy(deep=False)
        for blk, cp_blk in zip(mgr.blocks, cp.blocks):
            # view assertion
            tm.assert_equal(cp_blk.values, blk.values)
            if isinstance(blk.values, np.ndarray):
                assert cp_blk.values.base is blk.values.base
            else:
                # DatetimeTZBlock has DatetimeIndex values
                assert cp_blk.values._data.base is blk.values._data.base

        # copy(deep=True) consolidates, so the block-wise assertions will
        # fail if mgr is not consolidated
        mgr._consolidate_inplace()
        cp = mgr.copy(deep=True)
        for blk, cp_blk in zip(mgr.blocks, cp.blocks):
            bvals = blk.values
            cpvals = cp_blk.values

            tm.assert_equal(cpvals, bvals)

            if isinstance(cpvals, np.ndarray):
                lbase = cpvals.base
                rbase = bvals.base
            else:
                lbase = cpvals._ndarray.base
                rbase = bvals._ndarray.base

            # copy assertion we either have a None for a base or in case of
            # some blocks it is an array (e.g. datetimetz), but was copied
            if isinstance(cpvals, DatetimeArray):
                assert (lbase is None and rbase is None) or (lbase is not rbase)
            elif not isinstance(cpvals, np.ndarray):
                assert lbase is not rbase
            else:
                assert lbase is None and rbase is None

    def test_sparse(self):
        """Two sparse blocks interleave to float64."""
        mgr = create_mgr("a: sparse-1; b: sparse-2")
        assert mgr.as_array().dtype == np.float64

    def test_sparse_mixed(self):
        """Sparse and dense float blocks stay separate."""
        mgr = create_mgr("a: sparse-1; b: sparse-2; c: f8")
        assert len(mgr.blocks) == 3
        assert isinstance(mgr, BlockManager)

    @pytest.mark.parametrize(
        "mgr_string, dtype",
        [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)],
    )
    def test_as_array_float(self, mgr_string, dtype):
        """Mixed float widths interleave to the widest float present."""
        mgr = create_mgr(mgr_string)
        assert mgr.as_array().dtype == dtype

    @pytest.mark.parametrize(
        "mgr_string, dtype",
        [
            ("a: bool-1; b: bool-2", np.bool_),
            ("a: i8-1; b: i8-2; c: i4; d: i2; e: u1", np.int64),
            ("c: i4; d: i2; e: u1", np.int32),
        ],
    )
    def test_as_array_int_bool(self, mgr_string, dtype):
        """Bool-only and mixed-width integer managers interleave as listed."""
        mgr = create_mgr(mgr_string)
        assert mgr.as_array().dtype == dtype

    def test_as_array_datetime(self):
        """Naive datetime blocks interleave to M8[ns]."""
        mgr = create_mgr("h: datetime-1; g: datetime-2")
        assert mgr.as_array().dtype == "M8[ns]"

    def test_as_array_datetime_tz(self):
        """Differently-zoned datetime columns interleave to object."""
        mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]")
        assert mgr.iget(0).dtype == "datetime64[ns, US/Eastern]"
        assert mgr.iget(1).dtype == "datetime64[ns, CET]"
        assert mgr.as_array().dtype == "object"

    @pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"])
    def test_astype(self, t):
        """``astype`` casts every castable column and leaves the rest alone."""
        # normalise the parametrized name once; it is reused for both managers
        t = np.dtype(t)

        # coerce all
        mgr = create_mgr("c: f4; d: f2; e: f8")
        tmgr = mgr.astype(t)
        assert tmgr.iget(0).dtype.type == t
        assert tmgr.iget(1).dtype.type == t
        assert tmgr.iget(2).dtype.type == t

        # mixed
        mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8")
        tmgr = mgr.astype(t, errors="ignore")
        assert tmgr.iget(2).dtype.type == t
        assert tmgr.iget(4).dtype.type == t
        assert tmgr.iget(5).dtype.type == t
        assert tmgr.iget(6).dtype.type == t

        assert tmgr.iget(0).dtype.type == np.object_
        assert tmgr.iget(1).dtype.type == np.object_
        if t != np.int64:
            assert tmgr.iget(3).dtype.type == np.datetime64
        else:
            assert tmgr.iget(3).dtype.type == t

    def test_convert(self):
        """``convert`` is a no-op on numeric data and parses numeric strings."""

        def _compare(old_mgr, new_mgr):
            """Check every block of each manager has an equal-valued block
            in the other manager."""
            old_blocks = set(old_mgr.blocks)
            new_blocks = set(new_mgr.blocks)
            assert len(old_blocks) == len(new_blocks)

            for b in old_blocks:
                assert any((b.values == nb.values).all() for nb in new_blocks)

            for b in new_blocks:
                assert any((b.values == ob.values).all() for ob in old_blocks)

        # noops
        mgr = create_mgr("f: i8; g: f8")
        new_mgr = mgr.convert()
        _compare(mgr, new_mgr)

        # convert
        mgr = create_mgr("a,b,foo: object; f: i8; g: f8")
        mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
        mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
        mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
        new_mgr = mgr.convert(numeric=True)
        assert new_mgr.iget(0).dtype == np.int64
        assert new_mgr.iget(1).dtype == np.float64
        assert new_mgr.iget(2).dtype == np.object_
        assert new_mgr.iget(3).dtype == np.int64
        assert new_mgr.iget(4).dtype == np.float64

        mgr = create_mgr(
            "a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2"
        )
        mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
        mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
        mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
        new_mgr = mgr.convert(numeric=True)
        assert new_mgr.iget(0).dtype == np.int64
        assert new_mgr.iget(1).dtype == np.float64
        assert new_mgr.iget(2).dtype == np.object_
        assert new_mgr.iget(3).dtype == np.int32
        assert new_mgr.iget(4).dtype == np.bool_
        # This was previously ``assert <type>, np.datetime64``, i.e. an
        # always-true assertion with a message; compare explicitly instead.
        assert new_mgr.iget(5).dtype.type == np.datetime64
        assert new_mgr.iget(6).dtype == np.int64
        assert new_mgr.iget(7).dtype == np.float64
        assert new_mgr.iget(8).dtype == np.float16

    def test_invalid_ea_block(self):
        """One categorical block spanning two items is rejected."""
        with pytest.raises(ValueError, match="need to split"):
            create_mgr("a: category; b: category")

        with pytest.raises(ValueError, match="need to split"):
            create_mgr("a: category2; b: category2")

    def test_interleave(self):
        """A manager of a single dtype interleaves to that same dtype."""
        # self
        for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]:
            mgr = create_mgr(f"a: {dtype}")
            assert mgr.as_array().dtype == dtype
            mgr = create_mgr(f"a: {dtype}; b: {dtype}")
            assert mgr.as_array().dtype == dtype

    @pytest.mark.parametrize(
        "mgr_string, dtype",
        [
            ("a: category", "i8"),
            ("a: category; b: category", "i8"),
            ("a: category; b: category2", "object"),
            ("a: category2", "object"),
            ("a: category2; b: category2", "object"),
            ("a: f8", "f8"),
            ("a: f8; b: i8", "f8"),
            ("a: f4; b: i8", "f8"),
            ("a: f4; b: i8; d: object", "object"),
            ("a: bool; b: i8", "object"),
            ("a: complex", "complex"),
            ("a: f8; b: category", "object"),
            ("a: M8[ns]; b: category", "object"),
            ("a: M8[ns]; b: bool", "object"),
            ("a: M8[ns]; b: i8", "object"),
            ("a: m8[ns]; b: bool", "object"),
            ("a: m8[ns]; b: i8", "object"),
            ("a: M8[ns]; b: m8[ns]", "object"),
        ],
    )
    def test_interleave_dtype(self, mgr_string, dtype):
        """Check the interleaved dtype for a series of mixed managers."""
        # NOTE(review): ``mgr_string`` and ``dtype`` are never used, so the
        # same hard-coded checks run once per parameter set.  The body is not
        # switched to the parameters here because some rows disagree with it
        # ("a: f8; b: category" is listed as "object" but asserted as "f8"
        # below) and "a: category; b: category" raises in
        # test_invalid_ea_block.  Reconcile the table before wiring it in.

        # will be converted according the actual dtype of the underlying
        mgr = create_mgr("a: category")
        assert mgr.as_array().dtype == "i8"
        mgr = create_mgr("a: category; b: category2")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: category2")
        assert mgr.as_array().dtype == "object"

        # combinations
        mgr = create_mgr("a: f8")
        assert mgr.as_array().dtype == "f8"
        mgr = create_mgr("a: f8; b: i8")
        assert mgr.as_array().dtype == "f8"
        mgr = create_mgr("a: f4; b: i8")
        assert mgr.as_array().dtype == "f8"
        mgr = create_mgr("a: f4; b: i8; d: object")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: bool; b: i8")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: complex")
        assert mgr.as_array().dtype == "complex"
        mgr = create_mgr("a: f8; b: category")
        assert mgr.as_array().dtype == "f8"
        mgr = create_mgr("a: M8[ns]; b: category")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: M8[ns]; b: bool")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: M8[ns]; b: i8")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: m8[ns]; b: bool")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: m8[ns]; b: i8")
        assert mgr.as_array().dtype == "object"
        mgr = create_mgr("a: M8[ns]; b: m8[ns]")
        assert mgr.as_array().dtype == "object"

    def test_consolidate_ordering_issues(self, mgr):
        """Consolidation after overwriting several items with floats."""
        for name in ["f", "d", "b", "g", "h"]:
            mgr.iset(mgr.items.get_loc(name), np.random.randn(N))

        # we have datetime/tz blocks in mgr
        cons = mgr.consolidate()
        assert cons.nblocks == 4
        cons = mgr.consolidate().get_numeric_data()
        assert cons.nblocks == 1
        assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement)
        tm.assert_numpy_array_equal(
            cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.intp)
        )

    def test_reindex_items(self):
        """``reindex_axis`` on axis 0 reorders items and keeps their values."""
        # mgr is not consolidated, f8 & f8-2 blocks
        mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2")

        reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0)
        assert reindexed.nblocks == 2
        tm.assert_index_equal(reindexed.items, Index(["g", "c", "a", "d"]))

        # (position in mgr, position in reindexed) for g, c, a, d
        for old_loc, new_loc in [(6, 0), (2, 1), (0, 2), (3, 3)]:
            tm.assert_almost_equal(
                mgr.iget(old_loc).internal_values(),
                reindexed.iget(new_loc).internal_values(),
            )

    def test_get_numeric_data(self):
        """``get_numeric_data`` selects numeric items, as a view or a copy."""
        mgr = create_mgr(
            "int: int; float: float; complex: complex;"
            "str: object; bool: bool; obj: object; dt: datetime",
            item_shape=(3,),
        )
        mgr.iset(5, np.array([1, 2, 3], dtype=np.object_))

        numeric_items = Index(["int", "float", "complex", "bool"])

        numeric = mgr.get_numeric_data()
        tm.assert_index_equal(numeric.items, numeric_items)
        tm.assert_almost_equal(
            mgr.iget(mgr.items.get_loc("float")).internal_values(),
            numeric.iget(numeric.items.get_loc("float")).internal_values(),
        )

        # Check sharing
        numeric.iset(
            numeric.items.get_loc("float"),
            np.array([100.0, 200.0, 300.0]),
            inplace=True,
        )
        tm.assert_almost_equal(
            mgr.iget(mgr.items.get_loc("float")).internal_values(),
            np.array([100.0, 200.0, 300.0]),
        )

        # Check that copy=True does not share data with mgr
        numeric2 = mgr.get_numeric_data(copy=True)
        # verify the copy's items (the first result was re-checked here before)
        tm.assert_index_equal(numeric2.items, numeric_items)
        numeric2.iset(
            numeric2.items.get_loc("float"),
            np.array([1000.0, 2000.0, 3000.0]),
            inplace=True,
        )
        tm.assert_almost_equal(
            mgr.iget(mgr.items.get_loc("float")).internal_values(),
            np.array([100.0, 200.0, 300.0]),
        )

    def test_get_bool_data(self):
        """``get_bool_data`` selects boolean items, as a view or a copy."""
        mgr = create_mgr(
            "int: int; float: float; complex: complex;"
            "str: object; bool: bool; obj: object; dt: datetime",
            item_shape=(3,),
        )
        mgr.iset(6, np.array([True, False, True], dtype=np.object_))

        bools = mgr.get_bool_data()
        tm.assert_index_equal(bools.items, Index(["bool", "dt"]))
        tm.assert_almost_equal(
            mgr.iget(mgr.items.get_loc("bool")).internal_values(),
            bools.iget(bools.items.get_loc("bool")).internal_values(),
        )

        # Check sharing
        bools.iset(0, np.array([True, False, True]), inplace=True)
        tm.assert_numpy_array_equal(
            mgr.iget(mgr.items.get_loc("bool")).internal_values(),
            np.array([True, False, True]),
        )

        # Check that copy=True does not share data with mgr
        bools2 = mgr.get_bool_data(copy=True)
        bools2.iset(0, np.array([False, True, False]))
        tm.assert_numpy_array_equal(
            mgr.iget(mgr.items.get_loc("bool")).internal_values(),
            np.array([True, False, True]),
        )

    def test_unicode_repr_doesnt_raise(self):
        """``repr`` copes with a non-ASCII item name."""
        repr(create_mgr("b,\u05d0: object"))

    @pytest.mark.parametrize(
        "mgr_string", ["a,b,c: i8-1; d,e,f: i8-2", "a,a,a: i8-1; b,b,b: i8-2"]
    )
    def test_equals(self, mgr_string):
        """Equality ignores block order (unique and duplicated item names)."""
        bm1 = create_mgr(mgr_string)
        bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
        assert bm1.equals(bm2)

    @pytest.mark.parametrize(
        "mgr_string",
        [
            "a:i8;b:f8",  # basic case
            "a:i8;b:f8;c:c8;d:b",  # many types
            "a:i8;e:dt;f:td;g:string",  # more types
            "a:i8;b:category;c:category2",  # categories
            "c:sparse;d:sparse_na;b:f8",  # sparse
        ],
    )
    def test_equals_block_order_different_dtypes(self, mgr_string):
        """Equality holds for every permutation of the blocks."""
        # GH 9330
        bm = create_mgr(mgr_string)
        for bm_perm in itertools.permutations(bm.blocks):
            bm_this = BlockManager(tuple(bm_perm), bm.axes)
            assert bm.equals(bm_this)
            assert bm_this.equals(bm)

    def test_single_mgr_ctor(self):
        """A single manager exposes its values through ``external_values``."""
        mgr = create_single_mgr("f8", num_rows=5)
        assert mgr.external_values().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0]

    @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
    def test_validate_bool_args(self, value):
        """A non-bool ``inplace`` argument raises ValueError."""
        bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2")

        msg = (
            'For argument "inplace" expected type bool, '
            f"received type {type(value).__name__}."
        )
        with pytest.raises(ValueError, match=msg):
            bm1.replace_list([1], [2], inplace=value)
def _as_array(mgr):
    """
    Return the manager's data as an array: ``external_values()`` for a
    1-dimensional manager, otherwise the transposed ``as_array()`` result.
    """
    is_single = mgr.ndim == 1
    return mgr.external_values() if is_single else mgr.as_array().T
class TestIndexing:
    """
    Data-driven (nosetests-style) indexing tests.

    Each test applies an indexing routine to the block managers in
    ``MANAGERS`` and compares the outcome with the same operation performed
    on the equivalent np.ndarray.
    """

    # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests
    #       and are disabled.

    MANAGERS = [
        create_single_mgr("f8", N),
        create_single_mgr("i8", N),
        # 2-dim
        create_mgr("a,b,c,d,e,f: f8", item_shape=(N,)),
        create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)),
        create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)),
        create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)),
    ]

    @pytest.mark.parametrize("mgr", MANAGERS)
    def test_get_slice(self, mgr):
        def assert_slice_ok(mgr, axis, slobj):
            dense = _as_array(mgr)

            # we maybe using an ndarray to test slicing and
            # might not be the full length of the axis
            if isinstance(slobj, np.ndarray):
                labels = mgr.axes[axis]
                if len(labels) and len(slobj) and len(slobj) != len(labels):
                    padding = np.zeros(len(labels) - len(slobj), dtype=bool)
                    slobj = np.concatenate([slobj, padding])

            if isinstance(slobj, slice):
                sliced = mgr.get_slice(slobj, axis=axis)
            elif mgr.ndim == 1 and axis == 0:
                sliced = mgr.getitem_mgr(slobj)
            else:
                # BlockManager doesn't support non-slice, SingleBlockManager
                # doesn't support axis > 0
                return

            selector = (slice(None),) * axis + (slobj,)
            tm.assert_numpy_array_equal(
                dense[selector], _as_array(sliced), check_dtype=False
            )
            tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis])

        assert mgr.ndim <= 2, mgr.ndim
        for ax in range(mgr.ndim):
            # slice
            plain_slices = (
                slice(None),
                slice(3),
                slice(100),
                slice(1, 4),
                slice(3, 0, -2),
            )
            for slc in plain_slices:
                assert_slice_ok(mgr, ax, slc)

            if mgr.ndim < 2:
                # 2D only support slice objects
                length = mgr.shape[ax]

                # boolean mask
                assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
                assert_slice_ok(mgr, ax, np.ones(length, dtype=np.bool_))
                assert_slice_ok(mgr, ax, np.zeros(length, dtype=np.bool_))

                if length >= 3:
                    assert_slice_ok(mgr, ax, np.arange(length) % 3 == 0)
                    assert_slice_ok(
                        mgr, ax, np.array([True, True, False], dtype=np.bool_)
                    )

                # fancy indexer
                assert_slice_ok(mgr, ax, [])
                assert_slice_ok(mgr, ax, list(range(length)))

                if length >= 3:
                    assert_slice_ok(mgr, ax, [0, 1, 2])
                    assert_slice_ok(mgr, ax, [-1, -2, -3])

    @pytest.mark.parametrize("mgr", MANAGERS)
    def test_take(self, mgr):
        def assert_take_ok(mgr, axis, indexer):
            dense = _as_array(mgr)
            taken = mgr.take(indexer, axis)
            expected = np.take(dense, indexer, axis)
            tm.assert_numpy_array_equal(expected, _as_array(taken), check_dtype=False)
            tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis])

        for ax in range(mgr.ndim):
            length = mgr.shape[ax]

            # take/fancy indexer
            indexers = [[], [0, 0, 0], list(range(length))]
            if length >= 3:
                indexers.append([0, 1, 2])
                indexers.append([-1, -2, -3])

            for indexer in indexers:
                assert_take_ok(mgr, ax, indexer=indexer)

    @pytest.mark.parametrize("mgr", MANAGERS)
    @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
    def test_reindex_axis(self, fill_value, mgr):
        def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
            dense = _as_array(mgr)
            indexer = mgr.axes[axis].get_indexer_for(new_labels)

            reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value)
            expected = algos.take_nd(dense, indexer, axis, fill_value=fill_value)
            tm.assert_numpy_array_equal(
                expected, _as_array(reindexed), check_dtype=False
            )
            tm.assert_index_equal(reindexed.axes[axis], new_labels)

        for ax in range(mgr.ndim):
            labels = mgr.axes[ax]

            targets = [
                Index([]),
                labels,
                labels[[0, 0, 0]],
                Index(["foo", "bar", "baz"]),
                Index(["foo", labels[0], "baz"]),
            ]
            if mgr.shape[ax] >= 3:
                targets.append(labels[:-3])
                targets.append(labels[-3::-1])
                targets.append(labels[[0, 1, 2, 0, 1, 2]])

            for target in targets:
                assert_reindex_axis_is_ok(mgr, ax, target, fill_value)

    @pytest.mark.parametrize("mgr", MANAGERS)
    @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
    def test_reindex_indexer(self, fill_value, mgr):
        def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
            dense = _as_array(mgr)
            expected = algos.take_nd(dense, indexer, axis, fill_value=fill_value)
            reindexed = mgr.reindex_indexer(
                new_labels, indexer, axis, fill_value=fill_value
            )
            tm.assert_numpy_array_equal(
                expected, _as_array(reindexed), check_dtype=False
            )
            tm.assert_index_equal(reindexed.axes[axis], new_labels)

        for ax in range(mgr.ndim):
            labels = mgr.axes[ax]
            length = mgr.shape[ax]
            three_names = ["foo", "bar", "baz"]

            # (new labels, positional indexer) pairs to exercise
            cases = [
                (Index([]), np.array([], dtype=np.intp)),
                (labels, np.arange(length)),
                (Index(["foo"] * length), np.arange(length)),
                (labels[::-1], np.arange(length)),
                (labels, np.arange(length)[::-1]),
                (Index(three_names), np.array([0, 0, 0])),
                (Index(three_names), np.array([-1, 0, -1])),
                (Index(["foo", labels[0], "baz"]), np.array([-1, -1, -1])),
            ]
            if length >= 3:
                cases.append((Index(three_names), np.array([0, 1, 2])))

            for new_labels, indexer in cases:
                assert_reindex_indexer_is_ok(mgr, ax, new_labels, indexer, fill_value)
class TestBlockPlacement:
@pytest.mark.parametrize(
"slc, expected",
[
(slice(0, 4), 4),
(slice(0, 4, 2), 2),
(slice(0, 3, 2), 2),
(slice(0, 1, 2), 1),
(slice(1, 0, -1), 1),
],
)
def test_slice_len(self, slc, expected):
assert len(BlockPlacement(slc)) == expected
@pytest.mark.parametrize("slc", [slice(1, 1, 0), slice(1, 2, 0)])
def test_zero_step_raises(self, slc):
msg = "slice step cannot be zero"
with pytest.raises(ValueError, match=msg):
BlockPlacement(slc)
def test_slice_canonize_negative_stop(self):
# GH#37524 negative stop is OK with negative step and positive start
slc = slice(3, -1, -2)
bp = BlockPlacement(slc)
assert bp.indexer == slice(3, None, -2)
@pytest.mark.parametrize(
"slc",
[
slice(None, None),
slice(10, None),
slice(None, None, -1),
slice(None, 10, -1),
# These are "unbounded" because negative index will
# change depending on container shape.
slice(-1, None),
slice(None, -1),
slice(-1, -1),
slice(-1, None, -1),
slice(None, -1, -1),
slice(-1, -1, -1),
],
)
def test_unbounded_slice_raises(self, slc):
msg = "unbounded slice"
with pytest.raises(ValueError, match=msg):
BlockPlacement(slc)
@pytest.mark.parametrize(
"slc",
[
slice(0, 0),
slice(100, 0),
slice(100, 100),
slice(100, 100, -1),
slice(0, 100, -1),
],
)
def test_not_slice_like_slices(self, slc):
assert not BlockPlacement(slc).is_slice_like
@pytest.mark.parametrize(
"arr, slc",
[
([0], slice(0, 1, 1)),
([100], slice(100, 101, 1)),
([0, 1, 2], slice(0, 3, 1)),
([0, 5, 10], slice(0, 15, 5)),
([0, 100], slice(0, 200, 100)),
([2, 1], slice(2, 0, -1)),
],
)
def test_array_to_slice_conversion(self, arr, slc):
assert BlockPlacement(arr).as_slice == slc
@pytest.mark.parametrize(
"arr",
[
[],
[-1],
[-1, -2, -3],
[-10],
[-1],
[-1, 0, 1, 2],
[-2, 0, 2, 4],
[1, 0, -1],
[1, 1, 1],
],
)
def test_not_slice_like_arrays(self, arr):
assert not BlockPlacement(arr).is_slice_like
@pytest.mark.parametrize(
"slc, expected",
[(slice(0, 3), [0, 1, 2]), (slice(0, 0), []), (slice(3, 0), [])],
)
def test_slice_iter(self, slc, expected):
assert list(BlockPlacement(slc)) == expected
@pytest.mark.parametrize(
"slc, arr",
[
(slice(0, 3), [0, 1, 2]),
(slice(0, 0), []),
(slice(3, 0), []),
(slice(3, 0, -1), [3, 2, 1]),
],
)
def test_slice_to_array_conversion(self, slc, arr):
tm.assert_numpy_array_equal(
BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.intp)
)
def test_blockplacement_add(self):
bpl = BlockPlacement(slice(0, 5))
assert bpl.add(1).as_slice == slice(1, 6, 1)
assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2)
assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5]
@pytest.mark.parametrize(
    "val, inc, expected",
    [
        # increment of 0
        (slice(0, 0), 0, []),
        (slice(1, 4), 0, [1, 2, 3]),
        (slice(3, 0, -1), 0, [3, 2, 1]),
        ([1, 2, 4], 0, [1, 2, 4]),
        # increment of 10
        (slice(0, 0), 10, []),
        (slice(1, 4), 10, [11, 12, 13]),
        (slice(3, 0, -1), 10, [13, 12, 11]),
        ([1, 2, 4], 10, [11, 12, 14]),
        # increment of -1
        (slice(0, 0), -1, []),
        (slice(1, 4), -1, [0, 1, 2]),
        ([1, 2, 4], -1, [0, 1, 3]),
    ],
)
def test_blockplacement_add_int(self, val, inc, expected):
    """
    Adding the integer ``inc`` to a BlockPlacement built from ``val`` must
    produce a placement that iterates as ``expected``.
    """
    shifted = BlockPlacement(val).add(inc)
    assert list(shifted) == expected
@pytest.mark.parametrize("val", [slice(1, 4), [1, 2, 4]])
def test_blockplacement_add_int_raises(self, val):
    """
    Adding -10 to a BlockPlacement built from ``val`` must raise a
    ValueError whose message matches "iadd causes length change".
    """
    with pytest.raises(ValueError, match="iadd causes length change"):
        BlockPlacement(val).add(-10)
class TestCanHoldElement:
    """
    Tests for ``_can_hold_element`` on blocks.

    The helper methods at the bottom of the class cross-check the answer of
    ``_can_hold_element`` against what Series/DataFrame ``__setitem__``
    actually does (set inplace vs. end up with object dtype).
    """

    @pytest.fixture(
        params=[
            lambda x: x,
            lambda x: x.to_series(),
            lambda x: x._data,
            lambda x: list(x),
            lambda x: x.astype(object),
            lambda x: np.asarray(x),
            lambda x: x[0],
            lambda x: x[:0],
        ]
    )
    def element(self, request):
        """
        Functions that take an Index and return an element that should have
        blk._can_hold_element(element) for a Block with this index's dtype.
        """
        return request.param

    def test_datetime_block_can_hold_element(self):
        """
        Check ``_can_hold_element`` on a datetime block against what
        ``__setitem__`` accepts on an array built from the block's values.
        """
        block = create_block("datetime", [0])

        assert block._can_hold_element([])

        # We will check that block._can_hold_element iff arr.__setitem__ works
        arr = pd.array(block.values.ravel())

        # coerce None
        assert block._can_hold_element(None)
        arr[0] = None
        assert arr[0] is pd.NaT

        # coerce different types of datetime objects
        vals = [np.datetime64("2010-10-10"), datetime(2010, 10, 10)]
        for val in vals:
            assert block._can_hold_element(val)
            arr[0] = val

        # a plain ``date`` is rejected both by the block and by __setitem__
        val = date(2010, 10, 10)
        assert not block._can_hold_element(val)

        msg = (
            "value should be a 'Timestamp', 'NaT', "
            "or array of those. Got 'date' instead."
        )
        with pytest.raises(TypeError, match=msg):
            arr[0] = val

    @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
    def test_interval_can_hold_element_emptylist(self, dtype):
        """
        An interval block must be able to hold an empty list.

        The ``element`` fixture is deliberately not requested: it is not used
        here, and requesting it would only repeat the identical check once
        per fixture param.
        """
        arr = np.array([1, 3, 4], dtype=dtype)
        ii = IntervalIndex.from_breaks(arr)
        blk = new_block(ii._data, [1], ndim=2)

        assert blk._can_hold_element([])
        # TODO: check this holds for all blocks

    @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
    def test_interval_can_hold_element(self, dtype, element):
        """
        An interval block holds elements derived from an index of the same
        dtype, and rejects elements derived from interval indexes that differ
        in ``closed`` or in the type of their breaks.
        """
        arr = np.array([1, 3, 4, 9], dtype=dtype)
        ii = IntervalIndex.from_breaks(arr)
        blk = new_block(ii._data, [1], ndim=2)

        elem = element(ii)
        self.check_series_setitem(elem, ii, True)
        assert blk._can_hold_element(elem)

        # Careful: to get the expected Series-inplace behavior we need
        # `elem` to not have the same length as `arr`
        ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither")
        elem = element(ii2)
        self.check_series_setitem(elem, ii, False)
        assert not blk._can_hold_element(elem)

        # Timestamp breaks
        ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)])
        elem = element(ii3)
        self.check_series_setitem(elem, ii, False)
        assert not blk._can_hold_element(elem)

        # Timedelta breaks
        ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)])
        elem = element(ii4)
        self.check_series_setitem(elem, ii, False)
        assert not blk._can_hold_element(elem)

    def test_period_can_hold_element_emptylist(self):
        """A period block must be able to hold an empty list."""
        pi = period_range("2016", periods=3, freq="A")
        blk = new_block(pi._data.reshape(1, 3), [1], ndim=2)

        assert blk._can_hold_element([])

    def test_period_can_hold_element(self, element):
        """
        Period data holds elements derived from the same PeriodIndex, and
        does not hold elements derived from a different-freq PeriodIndex or
        from the result of ``to_timestamp``.
        """
        pi = period_range("2016", periods=3, freq="A")

        elem = element(pi)
        self.check_series_setitem(elem, pi, True)

        # Careful: to get the expected Series-inplace behavior we need
        # `elem` to not have the same length as `arr`
        pi2 = pi.asfreq("D")[:-1]
        elem = element(pi2)
        self.check_series_setitem(elem, pi, False)

        dti = pi.to_timestamp("S")[:-1]
        elem = element(dti)
        self.check_series_setitem(elem, pi, False)

    def check_setting(self, elem, index: Index, inplace: bool):
        """Run both the Series and the DataFrame setitem checks."""
        self.check_series_setitem(elem, index, inplace)
        self.check_frame_setitem(elem, index, inplace)

    def check_can_hold_element(self, obj, elem, inplace: bool):
        """
        Assert that the first block of ``obj`` can hold ``elem`` exactly when
        ``inplace`` is True.
        """
        blk = obj._mgr.blocks[0]
        if inplace:
            assert blk._can_hold_element(elem)
        else:
            assert not blk._can_hold_element(elem)

    def check_series_setitem(self, elem, index: Index, inplace: bool):
        """
        Set ``elem`` into a Series built from a copy of ``index._data`` and
        check that the outcome agrees with ``inplace``: either the Series
        still wraps the same array object, or its dtype became object.
        """
        arr = index._data.copy()
        ser = Series(arr)

        self.check_can_hold_element(ser, elem, inplace)

        if is_scalar(elem):
            ser[0] = elem
        else:
            ser[: len(elem)] = elem

        if inplace:
            assert ser.array is arr  # i.e. setting was done inplace
        else:
            assert ser.dtype == object

    def check_frame_setitem(self, elem, index: Index, inplace: bool):
        """
        Set ``elem`` into the first column of a DataFrame built from a copy
        of ``index._data`` and check that the outcome agrees with
        ``inplace``: either the first block still holds the same array
        object, or the column's dtype became object.
        """
        arr = index._data.copy()
        df = DataFrame(arr)

        self.check_can_hold_element(df, elem, inplace)

        if is_scalar(elem):
            df.iloc[0, 0] = elem
        else:
            df.iloc[: len(elem), 0] = elem

        if inplace:
            # assertion here implies setting was done inplace

            # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no
            #  attribute "blocks"
            assert df._mgr.blocks[0].values is arr  # type:ignore[union-attr]
        else:
            assert df.dtypes[0] == object
class TestShouldStore:
    """Tests for the ``should_store`` method of a block."""

    def test_should_store_categorical(self):
        """
        A block taken from a DataFrame of Categorical data should store
        Categoricals of the same dtype, and nothing else.
        """
        categorical = Categorical(["A", "B", "C"])
        frame = DataFrame(categorical)
        block = frame._mgr.blocks[0]

        # matching dtype: the original and a shorter slice of it
        for accepted in (categorical, categorical[:-1]):
            assert block.should_store(accepted)

        # different dtype (ordered), and ndarray instead of Categorical
        for rejected in (categorical.as_ordered(), np.asarray(categorical)):
            assert not block.should_store(rejected)
def test_validate_ndim(block_maker):
    """
    Passing 1-dimensional values together with ``ndim=2`` to the block
    constructor must raise a ValueError describing the mismatch.
    """
    one_dim_values = np.array([1.0, 2.0])
    expected_msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"

    with pytest.raises(ValueError, match=expected_msg):
        block_maker(one_dim_values, slice(2), ndim=2)
def test_block_shape():
    """
    After reindexing to the same index, a numeric Series and a categorical
    Series must have first blocks whose ``mgr_locs.indexer`` compare equal.
    """
    target_index = Index([0, 1, 2, 3, 4])
    numeric = Series([1, 2, 3]).reindex(target_index)
    categorical = Series(Categorical([1, 2, 3])).reindex(target_index)

    numeric_indexer = numeric._mgr.blocks[0].mgr_locs.indexer
    categorical_indexer = categorical._mgr.blocks[0].mgr_locs.indexer
    assert numeric_indexer == categorical_indexer
def test_make_block_no_pandas_array(block_maker):
    """
    Blocks built from a PandasArray (or its PandasDtype) must end up with an
    integer dtype and, for ``make_block``, must not be extension blocks.
    """
    # https://github.com/pandas-dev/pandas/pull/24866
    wrapped = pd.arrays.PandasArray(np.array([1, 2]))
    integer_kinds = ["i", "u"]

    # PandasArray, no dtype
    blk = block_maker(wrapped, slice(len(wrapped)), ndim=wrapped.ndim)
    assert blk.dtype.kind in integer_kinds

    if block_maker is not make_block:
        # new_block requires the caller to unwrap PandasArray and no longer
        # takes the dtype keyword, so the remaining checks only apply to
        # make_block.
        return

    assert blk.is_extension is False

    # PandasArray, PandasDtype
    blk = block_maker(
        wrapped, slice(len(wrapped)), dtype=wrapped.dtype, ndim=wrapped.ndim
    )
    assert blk.dtype.kind in integer_kinds
    assert blk.is_extension is False

    # ndarray, PandasDtype
    blk = block_maker(
        wrapped.to_numpy(), slice(len(wrapped)), dtype=wrapped.dtype, ndim=wrapped.ndim
    )
    assert blk.dtype.kind in integer_kinds
    assert blk.is_extension is False
def test_single_block_manager_fastpath_deprecated():
    """
    Passing ``fastpath=True`` to the SingleBlockManager constructor must
    produce a FutureWarning.
    """
    # GH#33092
    ser = Series(range(3))
    # Reach the manager through ``_mgr``, as the other tests in this file do,
    # rather than through the legacy ``_data`` alias.
    blk = ser._mgr.blocks[0]
    with tm.assert_produces_warning(FutureWarning):
        SingleBlockManager(blk, ser.index, fastpath=True)