A PyQt GUI application for converting InfoLease report outputs into Excel files. It handles parsing and summarizing, learns where files are meant to be stored, and compiles monthly and yearly summaries.
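
As a rough illustration of the conversion flow described above, a minimal sketch using pandas might look like the following (the function name, the fixed-width parsing assumption, and the paths are hypothetical, not taken from the repository):

    import pandas as pd

    def report_to_excel(report_path: str, excel_path: str) -> None:
        # Hypothetical sketch: parse a fixed-width InfoLease report dump,
        # then write it out as an Excel workbook via pandas.
        df = pd.read_fwf(report_path)
        df.to_excel(excel_path, index=False)
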
InfoLeaseExtract/venv/Lib/site-packages/pandas/_libs/index.pyx

cimport cython
import numpy as np
cimport numpy as cnp
from numpy cimport (
    float32_t,
    float64_t,
    int8_t,
    int16_t,
    int32_t,
    int64_t,
    intp_t,
    ndarray,
    uint8_t,
    uint16_t,
    uint32_t,
    uint64_t,
)
cnp.import_array()
from pandas._libs cimport util
from pandas._libs.hashtable cimport HashTable
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
from pandas._libs.tslibs.period cimport is_period_object
from pandas._libs.tslibs.timedeltas cimport _Timedelta
from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs import (
    algos,
    hashtable as _hash,
)
from pandas._libs.lib cimport eq_NA_compat
from pandas._libs.missing cimport (
    C_NA as NA,
    checknull,
    is_matching_na,
)


cdef inline bint is_definitely_invalid_key(object val):
    try:
        hash(val)
    except TypeError:
        return True
    return False


cdef ndarray _get_bool_indexer(ndarray values, object val):
    """
    Return a ndarray[bool] of locations where val matches self.values.

    If val is not NA, this is equivalent to `self.values == val`
    """
    # Caller is responsible for ensuring _check_type has already been called
    cdef:
        ndarray[uint8_t, ndim=1, cast=True] indexer
        Py_ssize_t i
        object item

    if values.descr.type_num == cnp.NPY_OBJECT:
        # i.e. values.dtype == object
        if not checknull(val):
            indexer = eq_NA_compat(values, val)
        else:
            # We need to check for _matching_ NA values
            indexer = np.empty(len(values), dtype=np.uint8)
            for i in range(len(values)):
                item = values[i]
                indexer[i] = is_matching_na(item, val)
    else:
        if util.is_nan(val):
            indexer = np.isnan(values)
        else:
            indexer = values == val

    return indexer.view(bool)


# Don't populate hash tables in monotonic indexes larger than this
_SIZE_CUTOFF = 1_000_000


cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
    """
    Possibly unpack a boolean mask to a single indexer.
    """
    # Returns ndarray[bool] or int
    cdef:
        ndarray[intp_t, ndim=1] found
        int count

    found = np.where(indexer)[0]
    count = len(found)

    if count > 1:
        return indexer
    if count == 1:
        return int(found[0])

    raise KeyError(val)
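

# A minimal sketch of how _unpack_bool_indexer behaves (illustrative values,
# not part of the upstream file):
#
#   >>> _unpack_bool_indexer(np.array([False, True, False, True]), "x")
#   array([False,  True, False,  True])   # >1 match: mask returned as-is
#   >>> _unpack_bool_indexer(np.array([False, True, False, False]), "x")
#   1                                     # exactly one match: integer position
#   >>> _unpack_bool_indexer(np.zeros(4, dtype=bool), "x")
#   Traceback (most recent call last):
#       ...
#   KeyError: 'x'                         # no match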


@cython.freelist(32)
cdef class IndexEngine:

    cdef readonly:
        ndarray values
        HashTable mapping
        bint over_size_threshold

    cdef:
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check
        object _np_type

    def __init__(self, ndarray values):
        self.values = values

        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.clear_mapping()
        self._np_type = values.dtype.type

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        self._ensure_mapping_populated()
        return val in self.mapping

    cpdef get_loc(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(val)
            values = self.values

            loc = self._searchsorted_left(val)
            if loc >= len(values):
                raise KeyError(val)
            if values[loc] != val:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(val)

        try:
            return self.mapping.get_item(val)
        except OverflowError as err:
            # GH#41775 OverflowError e.g. if we are uint64 and val is -1
            #  or if we are int64 and value is np.iinfo(np.int64).max+1
            #  (the uint64 with -1 case should actually be excluded by _check_type)
            raise KeyError(val) from err

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        See ObjectEngine._searchsorted_left.__doc__.
        """
        # Caller is responsible for ensuring _check_type has already been called
        loc = self.values.searchsorted(self._np_type(val), side="left")
        return loc

    cdef inline _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff, left, right

        if self.is_monotonic_increasing:
            values = self.values
            try:
                left = values.searchsorted(val, side='left')
                right = values.searchsorted(val, side='right')
            except TypeError:
                # e.g. GH#29189 get_loc(None) with a Float64Index
                # 2021-09-29 Now only reached for object-dtype
                raise KeyError(val)

            diff = right - left
            if diff == 0:
                raise KeyError(val)
            elif diff == 1:
                return left
            else:
                return slice(left, right)

        return self._maybe_get_bool_indexer(val)

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = _get_bool_indexer(self.values, val)
        return _unpack_bool_indexer(indexer, val)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        if not self.is_mapping_populated:
            return 0
        return self.mapping.sizeof(deep=deep)

    def __sizeof__(self) -> int:
        return self.sizeof()

    @property
    def is_unique(self) -> bool:
        if self.need_unique_check:
            self._do_unique_check()

        return self.unique == 1

    cdef inline _do_unique_check(self):
        # this is de facto the same as populating the mapping
        self._ensure_mapping_populated()

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef inline _do_monotonic_check(self):
        cdef:
            bint is_unique
        try:
            values = self.values
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(values)
        except TypeError:
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=False)

    cdef _make_hash_table(self, Py_ssize_t n):
        raise NotImplementedError  # pragma: no cover

    cdef _check_type(self, object val):
        hash(val)

    @property
    def is_mapping_populated(self) -> bool:
        return self.mapping is not None

    cdef inline _ensure_mapping_populated(self):
        # this populates the mapping
        # if it's not already populated
        # also satisfies the need_unique_check
        if not self.is_mapping_populated:

            values = self.values
            self.mapping = self._make_hash_table(len(values))
            self.mapping.map_locations(values)

            if len(self.mapping) == len(values):
                self.unique = 1
                self.need_unique_check = 0

    def clear_mapping(self):
        self.mapping = None
        self.need_monotonic_check = 1
        self.need_unique_check = 1

        self.unique = 0
        self.monotonic_inc = 0
        self.monotonic_dec = 0

    def get_indexer(self, ndarray values) -> np.ndarray:
        self._ensure_mapping_populated()
        return self.mapping.lookup(values)

    def get_indexer_non_unique(self, ndarray targets):
        """
        Return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results)

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            ndarray values
            ndarray[intp_t] result, missing
            set stargets, remaining_stargets, found_nas
            dict d = {}
            object val
            Py_ssize_t count = 0, count_missing = 0
            Py_ssize_t i, j, n, n_t, n_alloc, start, end
            bint check_na_values = False

        values = self.values
        stargets = set(targets)

        n = len(values)
        n_t = len(targets)
        if n > 10_000:
            n_alloc = 10_000
        else:
            n_alloc = n

        result = np.empty(n_alloc, dtype=np.intp)
        missing = np.empty(n_t, dtype=np.intp)

        # map each starget to its position in the index
        if (
                stargets and
                len(stargets) < 5 and
                not any([checknull(t) for t in stargets]) and
                self.is_monotonic_increasing
        ):
            # if there are few enough stargets and the index is monotonically
            # increasing, then use binary search for each starget
            remaining_stargets = set()
            for starget in stargets:
                try:
                    start = values.searchsorted(starget, side='left')
                    end = values.searchsorted(starget, side='right')
                except TypeError:  # e.g. if we tried to search for string in int array
                    remaining_stargets.add(starget)
                else:
                    if start != end:
                        d[starget] = list(range(start, end))

            stargets = remaining_stargets

        if stargets:
            # otherwise, map by iterating through all items in the index

            # short-circuit na check
            if values.dtype == object:
                check_na_values = True
                # keep track of nas in values
                found_nas = set()

            for i in range(n):
                val = values[i]

                # GH#43870
                # handle lookup for nas
                # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
                if check_na_values and checknull(val):
                    match = [na for na in found_nas if is_matching_na(val, na)]

                    # matching na not found
                    if not len(match):
                        found_nas.add(val)

                        # add na to stargets to utilize `in` for stargets/d lookup
                        match_stargets = [
                            x for x in stargets if is_matching_na(val, x)
                        ]
                        if len(match_stargets):
                            # add our 'standardized' na
                            stargets.add(val)

                    # matching na found
                    else:
                        assert len(match) == 1
                        val = match[0]

                if val in stargets:
                    if val not in d:
                        d[val] = []
                    d[val].append(i)

        for i in range(n_t):
            val = targets[i]

            # ensure there are nas in values before looking for a matching na
            if check_na_values and checknull(val):
                match = [na for na in found_nas if is_matching_na(val, na)]
                if len(match):
                    assert len(match) == 1
                    val = match[0]

            # found
            if val in d:
                key = val
                for j in d[key]:

                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        result = np.resize(result, n_alloc)

                    result[count] = j
                    count += 1

            # value not found
            else:

                if count >= n_alloc:
                    n_alloc += 10_000
                    result = np.resize(result, n_alloc)
                result[count] = -1
                count += 1
                missing[count_missing] = i
                count_missing += 1

        return result[0:count], missing[0:count_missing]
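
    # Worked example (values invented for this note, not from the upstream
    # file): for an engine over object values [1, 2, 2, 3] and
    # targets == np.array([2, 4], dtype=object),
    #
    #   >>> engine.get_indexer_non_unique(targets)
    #   (array([ 1,  2, -1]), array([1]))
    #
    # 2 occurs at positions 1 and 2, 4 is absent (hence the -1), and
    # `missing` records the position of 4 within `targets`.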


cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
    #  (treats a tuple `val` as a sequence of keys instead of a single key),
    #  so we implement something similar.
    # This is equivalent to the stdlib's bisect.bisect_left
    cdef:
        Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
        object pval

    if hi == 0 or (hi > 0 and val > values[hi]):
        return len(values)

    while lo < hi:
        mid = (lo + hi) // 2
        pval = values[mid]
        if val < pval:
            hi = mid
        elif val > pval:
            lo = mid + 1
        else:
            while mid > 0 and val == values[mid - 1]:
                mid -= 1
            return mid

    if val <= values[mid]:
        return mid
    else:
        return mid + 1
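

# Illustration of the GH#1757 pitfall referenced above (arrays invented for
# this note, not from the upstream file). Given a 1-d object ndarray of
# tuples:
#
#   >>> arr = np.empty(3, dtype=object)
#   >>> arr[:] = [(1, 1), (1, 2), (2, 1)]
#
# `arr.searchsorted((1, 2))` treats the tuple as a sequence of two separate
# keys, whereas `_bin_search(arr, (1, 2))` treats it as a single key and
# returns 1, the position of (1, 2).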


cdef class ObjectEngine(IndexEngine):
    """
    Index Engine for use with object-dtype Index, namely the base class Index.
    """
    cdef _make_hash_table(self, Py_ssize_t n):
        return _hash.PyObjectHashTable(n)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        # using values.searchsorted here would treat a tuple `val` as a sequence
        #  instead of a single key, so we use a different implementation
        try:
            loc = _bin_search(self.values, val)
        except TypeError as err:
            raise KeyError(val) from err
        return loc


cdef class DatetimeEngine(Int64Engine):

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # NB: caller is responsible for ensuring tzawareness compat
        #  before we get here
        if not (isinstance(scalar, _Timestamp) or scalar is NaT):
            raise TypeError(scalar)
        return scalar.value

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        self._unbox_scalar(val)
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        #  with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        # Welcome to the spaghetti factory
        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(conv)
            values = self.values

            loc = values.searchsorted(conv, side='left')
            if loc == len(values) or values[loc] != conv:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(conv)

        try:
            return self.mapping.get_item(conv)
        except KeyError:
            raise KeyError(val)


cdef class TimedeltaEngine(DatetimeEngine):

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        if not (isinstance(scalar, _Timedelta) or scalar is NaT):
            raise TypeError(scalar)
        return scalar.value


cdef class PeriodEngine(Int64Engine):

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        if scalar is NaT:
            return scalar.value
        if is_period_object(scalar):
            # NB: we assume that we have the correct freq here.
            return scalar.ordinal
        raise TypeError(scalar)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        #  with either a Period or NaT
        cdef:
            int64_t conv

        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        return Int64Engine.get_loc(self, conv)

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)


cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.
    """

    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex.
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex.
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                               copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lab_ints)

    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
        raise NotImplementedError("Implemented by subclass")  # pragma: no cover

    def _extract_level_codes(self, target) -> np.ndarray:
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : MultiIndex

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        zt = [target._get_level_values(i) for i in range(target.nlevels)]
        level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes
                       in zip(self.levels, zt)]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, target: np.ndarray) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `self.values`, where -1 represents a value in `target` which does not
        appear in `self.values`

        Parameters
        ----------
        target : np.ndarray

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into
        `self.values`
        """
        return self._base.get_indexer(self, target)
def get_indexer_with_fill(self, ndarray target, ndarray values,
str method, object limit) -> np.ndarray:
"""
Returns an array giving the positions of each value of `target` in
`values`, where -1 represents a value in `target` which does not
appear in `values`
If `method` is "backfill" then the position for a value in `target`
which does not appear in `values` is that of the next greater value
in `values` (if one exists), and -1 if there is no such value.
Similarly, if the method is "pad" then the position for a value in
`target` which does not appear in `values` is that of the next smaller
value in `values` (if one exists), and -1 if there is no such value.
Parameters
----------
target: ndarray[object] of tuples
need not be sorted, but all must have the same length, which must be
the same as the length of all tuples in `values`
values : ndarray[object] of tuples
must be sorted and all have the same length. Should be the set of
the MultiIndex's values.
method: string
"backfill" or "pad"
limit: int or None
if provided, limit the number of fills to this value
Returns
-------
np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
filled with the `method` (and optionally `limit`) specified
"""
assert method in ("backfill", "pad")
cdef:
int64_t i, j, next_code
int64_t num_values, num_target_values
ndarray[int64_t, ndim=1] target_order
ndarray[object, ndim=1] target_values
ndarray[int64_t, ndim=1] new_codes, new_target_codes
ndarray[intp_t, ndim=1] sorted_indexer
target_order = np.argsort(target).astype('int64')
target_values = target[target_order]
num_values, num_target_values = len(values), len(target_values)
new_codes, new_target_codes = (
np.empty((num_values,)).astype('int64'),
np.empty((num_target_values,)).astype('int64'),
)
# `values` and `target_values` are both sorted, so we walk through them
# and memoize the (ordered) set of indices in the (implicit) merged-and
# sorted list of the two which belong to each of them
# the effect of this is to create a factorization for the (sorted)
# merger of the index values, where `new_codes` and `new_target_codes`
# are the subset of the factors which appear in `values` and `target`,
# respectively
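        #
        # Worked example (values invented for this note, not from the
        # upstream file): for values == [(1,), (3,), (5,)] and sorted
        # target_values == [(2,), (3,), (6,)], the merge below yields
        #   new_codes        == [0, 2, 3]
        #   new_target_codes == [1, 2, 4]
        # i.e. a single shared factorization of both sorted sequences, so
        # that algos.backfill / algos.pad can then work on plain int64 codes.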
        i, j, next_code = 0, 0, 0
        while i < num_values and j < num_target_values:
            val, target_val = values[i], target_values[j]
            if val <= target_val:
                new_codes[i] = next_code
                i += 1
            if target_val <= val:
                new_target_codes[j] = next_code
                j += 1
            next_code += 1

        # at this point, at least one should have reached the end
        # the remaining values of the other should be added to the end
        assert i == num_values or j == num_target_values
        while i < num_values:
            new_codes[i] = next_code
            i += 1
            next_code += 1
        while j < num_target_values:
            new_target_codes[j] = next_code
            j += 1
            next_code += 1

        # get the indexer, and undo the sorting of `target.values`
        algo = algos.backfill if method == "backfill" else algos.pad
        sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
        return sorted_indexer[np.argsort(target_order)]

    def get_loc(self, object key):
        if is_definitely_invalid_key(key):
            raise TypeError(f"'{key}' is an invalid key")
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            raise KeyError(key)

        # Transform indices into single integer:
        lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
        indexer = self._base.get_indexer_non_unique(self, target)

        return indexer

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        # Default __contains__ looks in the underlying mapping, which in this
        #  case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False


# Generated from template.
include "index_class_helper.pxi"