A PyQt GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be stored and compiles monthly and yearly summaries.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/io/json/_table_schema.py

368 lines
10 KiB

"""
Table Schema builders
https://specs.frictionlessdata.io/json-table-schema/
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
cast,
)
import warnings
import pandas._libs.json as json
from pandas._typing import (
DtypeObj,
JSONSerializable,
)
from pandas.core.dtypes.base import _registry as registry
from pandas.core.dtypes.common import (
is_bool_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_extension_array_dtype,
is_integer_dtype,
is_numeric_dtype,
is_period_dtype,
is_string_dtype,
is_timedelta64_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas import DataFrame
import pandas.core.common as com
if TYPE_CHECKING:
from pandas import Series
from pandas.core.indexes.multi import MultiIndex
# Module-level alias for the decoder exposed by ``pandas._libs.json``.
loads = json.loads
# Version string written under the "pandas_version" key of generated schemas.
TABLE_SCHEMA_VERSION = "1.4.0"
def as_json_table_type(x: DtypeObj) -> str:
    """
    Map a NumPy / pandas dtype onto a Table Schema type name.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype

    Returns
    -------
    str
        The Table Schema type name chosen for ``x``.

    Notes
    -----
    The checks below run top to bottom and the first one that matches wins.

    ===============================  =================
    pandas type                      Table Schema type
    ===============================  =================
    integer                          integer
    bool                             boolean
    other numeric (e.g. float64)     number
    datetime64 / tz-aware / period   datetime
    timedelta64                      duration
    categorical                      any
    other extension dtype            any
    string-like (e.g. object)        string
    ===============================  =================

    Anything that matches none of the rows is reported as ``any``.
    """
    # Each entry pairs a group of predicates with the name returned when any
    # predicate in the group accepts ``x``.  Order is significant.
    ordered_checks = (
        ((is_integer_dtype,), "integer"),
        ((is_bool_dtype,), "boolean"),
        ((is_numeric_dtype,), "number"),
        (
            (is_datetime64_dtype, is_datetime64tz_dtype, is_period_dtype),
            "datetime",
        ),
        ((is_timedelta64_dtype,), "duration"),
        ((is_categorical_dtype, is_extension_array_dtype), "any"),
        ((is_string_dtype,), "string"),
    )
    for predicates, type_name in ordered_checks:
        if any(predicate(x) for predicate in predicates):
            return type_name
    return "any"
def set_default_names(data):
    """
    Give unnamed index levels a default name.

    A single unnamed index is named ``'index'``; missing names of a
    multi-level index are filled in by ``com.fill_missing_names``.

    Parameters
    ----------
    data : Series or DataFrame

    Returns
    -------
    Series or DataFrame
        ``data`` itself when every index level is already named, otherwise a
        copy whose index carries the default names.

    Warns
    -----
    UserWarning
        If the existing name is ``'index'`` (single level) or any existing
        string name starts with ``'level_'`` (multiple levels), because those
        names collide with the defaults and do not round-trip.
    """
    names = data.index.names
    if com.all_not_none(*names):
        if len(names) == 1 and data.index.name == "index":
            warnings.warn("Index name of 'index' is not round-trippable.")
        elif len(names) > 1 and any(
            # Level names may be any hashable (ints, tuples, ...); only
            # strings can collide with the generated 'level_N' names.
            isinstance(name, str) and name.startswith("level_")
            for name in names
        ):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable."
            )
        return data

    # Work on a copy so the caller's index names are left untouched.
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data
def convert_pandas_type_to_json_field(arr):
    """
    Build the Table Schema field descriptor for one array-like.

    Parameters
    ----------
    arr : object with ``dtype`` and ``name`` attributes
        Only those two attributes are read here.

    Returns
    -------
    dict
        Always contains ``"name"`` (``"values"`` when ``arr.name`` is None)
        and ``"type"``.  Depending on the dtype it additionally carries
        ``"constraints"``/``"ordered"`` (categorical), ``"freq"`` (period),
        ``"tz"`` (tz-aware datetime) or ``"extDtype"`` (other extension
        dtypes).
    """
    dtype = arr.dtype
    name = "values" if arr.name is None else arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if is_categorical_dtype(dtype):
        field["constraints"] = {"enum": list(dtype.categories)}
        field["ordered"] = dtype.ordered
    elif is_period_dtype(dtype):
        field["freq"] = dtype.freq.freqstr
    elif is_datetime64tz_dtype(dtype):
        tz = dtype.tz
        # ``.zone`` only exists on pytz timezones.  Other tzinfo
        # implementations would raise AttributeError, so fall back to the
        # zoneinfo ``.key`` and finally to the string form
        # (``str(datetime.timezone.utc)`` is "UTC").
        if hasattr(tz, "zone"):
            field["tz"] = tz.zone
        elif hasattr(tz, "key"):
            field["tz"] = tz.key
        else:
            # NOTE(review): for tzinfo classes without a name attribute the
            # string form may not be parseable on read - confirm if needed.
            field["tz"] = str(tz)
    elif is_extension_array_dtype(dtype):
        field["extDtype"] = dtype.name
    return field
def convert_json_field_to_pandas_type(field):
    """
    Translate a Table Schema field descriptor into a NumPy / pandas dtype.

    Parameters
    ----------
    field
        A JSON field descriptor; its ``"type"`` entry selects the dtype.

    Returns
    -------
    dtype
        A dtype string, a ``CategoricalDtype`` (for ``"any"`` fields that
        carry both ``"constraints"`` and ``"ordered"``), or whatever the
        extension registry returns for the field's ``"extDtype"``.

    Raises
    ------
    ValueError
        If the ``"type"`` entry is not one of the handled type names.

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]

    # Schema types that always map to one fixed dtype string.
    fixed_dtypes = (
        ("string", "object"),
        ("integer", "int64"),
        ("number", "float64"),
        ("boolean", "bool"),
        ("duration", "timedelta64"),
    )
    for schema_type, pandas_dtype in fixed_dtypes:
        if typ == schema_type:
            return pandas_dtype

    if typ == "datetime":
        # A non-empty "tz" entry selects the timezone-aware dtype.
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        return "datetime64[ns]"

    if typ == "any":
        if "constraints" in field and "ordered" in field:
            return CategoricalDtype(
                categories=field["constraints"]["enum"], ordered=field["ordered"]
            )
        if "extDtype" in field:
            return registry.find(field["extDtype"])
        return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")
def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.  Unnamed index
        levels receive default names before the fields are built.
    primary_key : bool or None, default None
        Value stored under ``'primaryKey'``.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is included and unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    schema : dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    schema: dict[str, Any] = {}
    fields = []

    # Use plain truthiness everywhere so that a truthy flag that is not the
    # ``True`` singleton still gets default index names before its fields
    # are emitted.
    if index:
        data = set_default_names(data)
        if data.index.nlevels > 1:
            # Narrow the type through a local name instead of re-assigning
            # ``data.index``; ``data`` may still be the caller's object.
            multi_index = cast("MultiIndex", data.index)
            for level, name in zip(multi_index.levels, multi_index.names):
                new_field = convert_pandas_type_to_json_field(level)
                # The level carries its own name; the index name is the one
                # that belongs in the schema.
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        fields.extend(
            convert_pandas_type_to_json_field(column) for _, column in data.items()
        )
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema
def parse_table_schema(json, precise_float):
    """
    Rebuild a DataFrame from a JSON Table Schema document.

    Parameters
    ----------
    json :
        A JSON table schema document holding ``"schema"`` and ``"data"``.
    precise_float : bool
        Forwarded to the JSON decoder; controls precision when decoding
        strings to double values, as dictated by ``read_json``.

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If any field of the schema maps to the ``timedelta64`` dtype.

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, the index name of the returned
    :class:`DataFrame` is reset to ``None`` when that string is encountered
    on a single-level index. For a :class:`MultiIndex`, every name beginning
    with 'level_' is reset the same way. Consequently an :class:`Index`
    named 'index' and :class:`MultiIndex` names starting with 'level_' are
    not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    schema_fields = table["schema"]["fields"]
    col_order = [field["name"] for field in schema_fields]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {}
    for field in schema_fields:
        dtypes[field["name"]] = convert_json_field_to_pandas_type(field)

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'table="orient" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    schema = table["schema"]
    if "primaryKey" in schema:
        df = df.set_index(schema["primaryKey"])
        index_names = df.index.names
        if len(index_names) != 1:
            # Names generated for unnamed MultiIndex levels are dropped.
            df.index.names = [
                None if x.startswith("level_") else x for x in index_names
            ]
        elif df.index.name == "index":
            # 'index' is the placeholder written for an unnamed Index.
            df.index.name = None
    return df