A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/_libs/hashtable_class_helper.pxi.in

1323 lines
43 KiB

"""
Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
{{py:
# name
complex_types = ['complex64',
'complex128']
}}
{{for name in complex_types}}
cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
cdef kh{{name}}_t res
res.real = val.real
res.imag = val.imag
return res
{{endfor}}
{{py:
# name
c_types = ['khcomplex128_t',
'khcomplex64_t',
'float64_t',
'float32_t',
'int64_t',
'int32_t',
'int16_t',
'int8_t',
'uint64_t',
'uint32_t',
'uint16_t',
'uint8_t']
}}
{{for c_type in c_types}}
cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
{{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
return val.real != val.real or val.imag != val.imag
{{elif c_type in {'float64_t', 'float32_t'} }}
return val != val
{{else}}
return False
{{endif}}
{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
# are_equivalent_{{c_type}} is cimported via khash.pxd
{{else}}
cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
return val1 == val2
{{endif}}
{{endfor}}
{{py:
# name
cimported_types = ['complex64',
'complex128',
'float32',
'float64',
'int8',
'int16',
'int32',
'int64',
'pymap',
'str',
'strbox',
'uint8',
'uint16',
'uint32',
'uint64']
}}
{{for name in cimported_types}}
from pandas._libs.khash cimport (
kh_destroy_{{name}},
kh_exist_{{name}},
kh_get_{{name}},
kh_init_{{name}},
kh_put_{{name}},
kh_resize_{{name}},
)
{{endfor}}
# ----------------------------------------------------------------------
# VectorData
# ----------------------------------------------------------------------
from pandas._libs.tslibs.util cimport get_c_string
from pandas._libs.missing cimport C_NA
{{py:
# name, dtype, c_type
# the generated StringVector is not actually used
# but is included for completeness (rather ObjectVector is used
# for uniques in hashtables)
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
('Complex64', 'complex64', 'khcomplex64_t'),
('Float64', 'float64', 'float64_t'),
('Float32', 'float32', 'float32_t'),
('Int64', 'int64', 'int64_t'),
('Int32', 'int32', 'int32_t'),
('Int16', 'int16', 'int16_t'),
('Int8', 'int8', 'int8_t'),
('String', 'string', 'char *'),
('UInt64', 'uint64', 'uint64_t'),
('UInt32', 'uint32', 'uint32_t'),
('UInt16', 'uint16', 'uint16_t'),
('UInt8', 'uint8', 'uint8_t')]
}}
{{for name, dtype, c_type in dtypes}}
{{if dtype != 'int64'}}
# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
# by IntervalTree
ctypedef struct {{name}}VectorData:
{{c_type}} *data
Py_ssize_t n, m
{{endif}}
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
{{c_type}} x) nogil:
data.data[data.n] = x
data.n += 1
{{endfor}}
ctypedef fused vector_data:
Int64VectorData
Int32VectorData
Int16VectorData
Int8VectorData
UInt64VectorData
UInt32VectorData
UInt16VectorData
UInt8VectorData
Float64VectorData
Float32VectorData
Complex128VectorData
Complex64VectorData
StringVectorData
cdef inline bint needs_resize(vector_data *data) nogil:
return data.n == data.m
# ----------------------------------------------------------------------
# Vector
# ----------------------------------------------------------------------
cdef class Vector:
# cdef readonly:
# bint external_view_exists
def __cinit__(self):
self.external_view_exists = False
{{py:
# name, dtype, c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
('Complex64', 'complex64', 'khcomplex64_t'),
('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t'),
('Float32', 'float32', 'float32_t'),
('UInt32', 'uint32', 'uint32_t'),
('Int32', 'int32', 'int32_t'),
('UInt16', 'uint16', 'uint16_t'),
('Int16', 'int16', 'int16_t'),
('UInt8', 'uint8', 'uint8_t'),
('Int8', 'int8', 'int8_t')]
}}
{{for name, dtype, c_type in dtypes}}
cdef class {{name}}Vector(Vector):
# For int64 we have to put this declaration in the .pxd file;
# Int64Vector is the only one we need exposed for other cython files.
{{if dtype != 'int64'}}
cdef:
{{name}}VectorData *data
ndarray ao
{{endif}}
def __cinit__(self):
self.data = <{{name}}VectorData *>PyMem_Malloc(
sizeof({{name}}VectorData))
if not self.data:
raise MemoryError()
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
self.data.data = <{{c_type}}*>self.ao.data
cdef resize(self):
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(self.data.m, refcheck=False)
self.data.data = <{{c_type}}*>self.ao.data
def __dealloc__(self):
if self.data is not NULL:
PyMem_Free(self.data)
self.data = NULL
def __len__(self) -> int:
return self.data.n
cpdef ndarray to_array(self):
if self.data.m != self.data.n:
if self.external_view_exists:
# should never happen
raise ValueError("should have raised on append()")
self.ao.resize(self.data.n, refcheck=False)
self.data.m = self.data.n
self.external_view_exists = True
return self.ao
cdef inline void append(self, {{c_type}} x):
if needs_resize(self.data):
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize()
append_data_{{dtype}}(self.data, x)
cdef extend(self, const {{c_type}}[:] x):
for i in range(len(x)):
self.append(x[i])
{{endfor}}
cdef class StringVector(Vector):
cdef:
StringVectorData *data
def __cinit__(self):
self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
if not self.data:
raise MemoryError()
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()
cdef resize(self):
cdef:
char **orig_data
Py_ssize_t i, m
m = self.data.m
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
orig_data = self.data.data
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()
for i in range(m):
self.data.data[i] = orig_data[i]
def __dealloc__(self):
if self.data is not NULL:
if self.data.data is not NULL:
free(self.data.data)
PyMem_Free(self.data)
self.data = NULL
def __len__(self) -> int:
return self.data.n
cpdef ndarray[object, ndim=1] to_array(self):
cdef:
ndarray ao
Py_ssize_t n
object val
ao = np.empty(self.data.n, dtype=object)
for i in range(self.data.n):
val = self.data.data[i]
ao[i] = val
self.external_view_exists = True
self.data.m = self.data.n
return ao
cdef inline void append(self, char *x):
if needs_resize(self.data):
self.resize()
append_data_string(self.data, x)
cdef extend(self, ndarray[object] x):
for i in range(len(x)):
self.append(x[i])
cdef class ObjectVector(Vector):
cdef:
PyObject **data
Py_ssize_t n, m
ndarray ao
def __cinit__(self):
self.n = 0
self.m = _INIT_VEC_CAP
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
self.data = <PyObject**>self.ao.data
def __len__(self) -> int:
return self.n
cdef inline append(self, object obj):
if self.n == self.m:
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.m = max(self.m * 2, _INIT_VEC_CAP)
self.ao.resize(self.m, refcheck=False)
self.data = <PyObject**>self.ao.data
Py_INCREF(obj)
self.data[self.n] = <PyObject*>obj
self.n += 1
cpdef ndarray[object, ndim=1] to_array(self):
if self.m != self.n:
if self.external_view_exists:
raise ValueError("should have raised on append()")
self.ao.resize(self.n, refcheck=False)
self.m = self.n
self.external_view_exists = True
return self.ao
cdef extend(self, ndarray[object] x):
for i in range(len(x)):
self.append(x[i])
# ----------------------------------------------------------------------
# HashTable
# ----------------------------------------------------------------------
cdef class HashTable:
pass
{{py:
# name, dtype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
('Float64', 'float64', 'float64_t', ''),
('UInt64', 'uint64', 'uint64_t', ''),
('Int64', 'int64', 'int64_t', ''),
('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
('Float32', 'float32', 'float32_t', ''),
('UInt32', 'uint32', 'uint32_t', ''),
('Int32', 'int32', 'int32_t', ''),
('UInt16', 'uint16', 'uint16_t', ''),
('Int16', 'int16', 'int16_t', ''),
('UInt8', 'uint8', 'uint8_t', ''),
('Int8', 'int8', 'int8_t', '')]
}}
{{for name, dtype, c_type, to_c_type in dtypes}}
cdef class {{name}}HashTable(HashTable):
def __cinit__(self, int64_t size_hint=1):
self.table = kh_init_{{dtype}}()
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_{{dtype}}(self.table, size_hint)
def __len__(self) -> int:
return self.table.size
def __dealloc__(self):
if self.table is not NULL:
kh_destroy_{{dtype}}(self.table)
self.table = NULL
def __contains__(self, object key) -> bool:
cdef:
khiter_t k
{{c_type}} ckey
ckey = {{to_c_type}}(key)
k = kh_get_{{dtype}}(self.table, ckey)
return k != self.table.n_buckets
def sizeof(self, deep: bool = False) -> int:
""" return the size of my table in bytes """
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs
def get_state(self) -> dict[str, int]:
""" returns infos about the state of the hashtable"""
return {
'n_buckets' : self.table.n_buckets,
'size' : self.table.size,
'n_occupied' : self.table.n_occupied,
'upper_bound' : self.table.upper_bound,
}
cpdef get_item(self, {{dtype}}_t val):
cdef:
khiter_t k
{{c_type}} cval
cval = {{to_c_type}}(val)
k = kh_get_{{dtype}}(self.table, cval)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
raise KeyError(val)
cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
cdef:
khiter_t k
int ret = 0
{{c_type}} ckey
ckey = {{to_c_type}}(key)
k = kh_put_{{dtype}}(self.table, ckey, &ret)
if kh_exist_{{dtype}}(self.table, k):
self.table.vals[k] = val
else:
raise KeyError(key)
@cython.boundscheck(False)
def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values) -> None:
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{c_type}} key
khiter_t k
with nogil:
for i in range(n):
key = {{to_c_type}}(keys[i])
k = kh_put_{{dtype}}(self.table, key, &ret)
self.table.vals[k] = <Py_ssize_t>values[i]
@cython.boundscheck(False)
def map_locations(self, const {{dtype}}_t[:] values) -> None:
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{c_type}} val
khiter_t k
with nogil:
for i in range(n):
val= {{to_c_type}}(values[i])
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = i
@cython.boundscheck(False)
def lookup(self, const {{dtype}}_t[:] values) -> ndarray:
# -> np.ndarray[np.intp]
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{c_type}} val
khiter_t k
intp_t[:] locs = np.empty(n, dtype=np.intp)
with nogil:
for i in range(n):
val = {{to_c_type}}(values[i])
k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
locs[i] = -1
return np.asarray(locs)
@cython.boundscheck(False)
@cython.wraparound(False)
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
object mask=None, bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
uniques : {{name}}Vector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
ignore_na : bool, default False
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
intp_t[:] labels
int ret = 0
{{c_type}} val, na_value2
khiter_t k
{{name}}VectorData *ud
bint use_na_value, use_mask
uint8_t[:] mask_values
if return_inverse:
labels = np.empty(n, dtype=np.intp)
ud = uniques.data
use_na_value = na_value is not None
use_mask = mask is not None
if use_mask:
mask_values = mask.view("uint8")
if use_na_value:
# We need this na_value2 because we want to allow users
# to *optionally* specify an NA sentinel *of the correct* type.
# We use None, to make it optional, which requires `object` type
# for the parameter. To please the compiler, we use na_value2,
# which is only used if it's *specified*.
na_value2 = {{to_c_type}}(na_value)
else:
na_value2 = {{to_c_type}}(0)
with nogil:
for i in range(n):
val = {{to_c_type}}(values[i])
if ignore_na and use_mask:
if mask_values[i]:
labels[i] = na_sentinel
continue
elif ignore_na and (
is_nan_{{c_type}}(val) or
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them,
# and replace the corresponding label with na_sentinel
labels[i] = na_sentinel
continue
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
append_data_{{dtype}}(ud, val)
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = {{name}}Vector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)
Missing values are not included in the "uniques" for this method.
The labels for any missing values will be set to "na_sentinel"
Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[intp_t]
The labels from values to uniques
"""
uniques_vector = {{name}}Vector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True, mask=mask,
return_inverse=True)
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
return labels
{{if dtype == 'int64'}}
@cython.boundscheck(False)
def get_labels_groupby(
self, const {{dtype}}_t[:] values
) -> tuple[ndarray, ndarray]:
# tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
cdef:
Py_ssize_t i, n = len(values)
intp_t[:] labels
Py_ssize_t idx, count = 0
int ret = 0
{{c_type}} val
khiter_t k
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud
labels = np.empty(n, dtype=np.intp)
ud = uniques.data
with nogil:
for i in range(n):
val = {{to_c_type}}(values[i])
# specific for groupby
if val < 0:
labels[i] = -1
continue
k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
idx = self.table.vals[k]
labels[i] = idx
else:
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = count
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
arr_uniques = uniques.to_array()
return np.asarray(labels), arr_uniques
{{endif}}
{{endfor}}
cdef class StringHashTable(HashTable):
# these by-definition *must* be strings
# or a sentinel np.nan / None missing value
na_string_sentinel = '__nan__'
def __init__(self, int64_t size_hint=1):
self.table = kh_init_str()
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_str(self.table, size_hint)
def __dealloc__(self):
if self.table is not NULL:
kh_destroy_str(self.table)
self.table = NULL
def sizeof(self, deep: bool = False) -> int:
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
for_pairs = self.table.n_buckets * (sizeof(char *) + # keys
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs
def get_state(self) -> dict[str, int]:
""" returns infos about the state of the hashtable"""
return {
'n_buckets' : self.table.n_buckets,
'size' : self.table.size,
'n_occupied' : self.table.n_occupied,
'upper_bound' : self.table.upper_bound,
}
cpdef get_item(self, str val):
cdef:
khiter_t k
const char *v
v = get_c_string(val)
k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
raise KeyError(val)
cpdef set_item(self, str key, Py_ssize_t val):
cdef:
khiter_t k
int ret = 0
const char *v
v = get_c_string(key)
k = kh_put_str(self.table, v, &ret)
if kh_exist_str(self.table, k):
self.table.vals[k] = val
else:
raise KeyError(key)
@cython.boundscheck(False)
def get_indexer(self, ndarray[object] values) -> ndarray:
# -> np.ndarray[np.intp]
cdef:
Py_ssize_t i, n = len(values)
ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
intp_t *resbuf = <intp_t*>labels.data
khiter_t k
kh_str_t *table = self.table
const char *v
const char **vecs
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
v = get_c_string(val)
vecs[i] = v
with nogil:
for i in range(n):
k = kh_get_str(table, vecs[i])
if k != table.n_buckets:
resbuf[i] = table.vals[k]
else:
resbuf[i] = -1
free(vecs)
return labels
@cython.boundscheck(False)
def lookup(self, ndarray[object] values) -> ndarray:
# -> np.ndarray[np.intp]
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
const char *v
khiter_t k
intp_t[:] locs = np.empty(n, dtype=np.intp)
# these by-definition *must* be strings
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
vecs[i] = v
with nogil:
for i in range(n):
v = vecs[i]
k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
locs[i] = -1
free(vecs)
return np.asarray(locs)
@cython.boundscheck(False)
def map_locations(self, ndarray[object] values) -> None:
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
const char *v
const char **vecs
khiter_t k
# these by-definition *must* be strings
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
vecs[i] = v
with nogil:
for i in range(n):
v = vecs[i]
k = kh_put_str(self.table, v, &ret)
self.table.vals[k] = i
free(vecs)
@cython.boundscheck(False)
@cython.wraparound(False)
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then any value
that is not a string is considered missing. If na_value is
not None, then _additionally_ any value "val" satisfying
val == na_value is considered missing.
ignore_na : bool, default False
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
intp_t[:] labels
int64_t[:] uindexer
int ret = 0
object val
const char *v
const char **vecs
khiter_t k
bint use_na_value
if return_inverse:
labels = np.zeros(n, dtype=np.intp)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None
# assign pointers and pre-filter out missing (if ignore_na)
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
if (ignore_na
and (not isinstance(val, str)
or (use_na_value and val == na_value))):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), we can skip the actual value, and
# replace the label with na_sentinel directly
labels[i] = na_sentinel
else:
# if ignore_na is False, we also stringify NaN/None/etc.
try:
v = get_c_string(<str>val)
except UnicodeEncodeError:
v = get_c_string(<str>repr(val))
vecs[i] = v
# compute
with nogil:
for i in range(n):
if ignore_na and labels[i] == na_sentinel:
# skip entries for ignored missing values (see above)
continue
v = vecs[i]
k = kh_get_str(self.table, v)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_str(self.table, v, &ret)
uindexer[count] = i
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx
free(vecs)
# uniques
for i in range(count):
uniques.append(values[uindexer[i]])
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()
def unique(self, ndarray[object] values, bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = ObjectVector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)
Missing values are not included in the "uniques" for this method.
The labels for any missing values will be set to "na_sentinel"
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then any value
that is not a string is considered missing. If na_value is
not None, then _additionally_ any value "val" satisfying
val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for StringHashTable.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
return labels
cdef class PyObjectHashTable(HashTable):
def __init__(self, int64_t size_hint=1):
self.table = kh_init_pymap()
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_pymap(self.table, size_hint)
def __dealloc__(self):
if self.table is not NULL:
kh_destroy_pymap(self.table)
self.table = NULL
def __len__(self) -> int:
return self.table.size
def __contains__(self, object key) -> bool:
cdef:
khiter_t k
hash(key)
k = kh_get_pymap(self.table, <PyObject*>key)
return k != self.table.n_buckets
def sizeof(self, deep: bool = False) -> int:
""" return the size of my table in bytes """
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs
def get_state(self) -> dict[str, int]:
"""
returns infos about the current state of the hashtable like size,
number of buckets and so on.
"""
return {
'n_buckets' : self.table.n_buckets,
'size' : self.table.size,
'n_occupied' : self.table.n_occupied,
'upper_bound' : self.table.upper_bound,
}
cpdef get_item(self, object val):
cdef:
khiter_t k
k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
raise KeyError(val)
cpdef set_item(self, object key, Py_ssize_t val):
cdef:
khiter_t k
int ret = 0
char* buf
hash(key)
k = kh_put_pymap(self.table, <PyObject*>key, &ret)
if kh_exist_pymap(self.table, k):
self.table.vals[k] = val
else:
raise KeyError(key)
def map_locations(self, ndarray[object] values) -> None:
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
khiter_t k
for i in range(n):
val = values[i]
hash(val)
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i
def lookup(self, ndarray[object] values) -> ndarray:
# -> np.ndarray[np.intp]
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
khiter_t k
intp_t[:] locs = np.empty(n, dtype=np.intp)
for i in range(n):
val = values[i]
hash(val)
k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
locs[i] = -1
return np.asarray(locs)
@cython.boundscheck(False)
@cython.wraparound(False)
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then None _plus_
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
ignore_na : bool, default False
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
intp_t[:] labels
int ret = 0
object val
khiter_t k
bint use_na_value
if return_inverse:
labels = np.empty(n, dtype=np.intp)
use_na_value = na_value is not None
for i in range(n):
val = values[i]
hash(val)
if ignore_na and (
checknull(val)
or (use_na_value and val == na_value)
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them, and
# replace the corresponding label with na_sentinel
labels[i] = na_sentinel
continue
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()
def unique(self, ndarray[object] values, bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = ObjectVector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)
Missing values are not included in the "uniques" for this method.
The labels for any missing values will be set to "na_sentinel"
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then None _plus_
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for PyObjectHashTable.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
return labels