A PyQT GUI application for converting InfoLease report outputs into Excel files. Handles parsing and summarizing. Learns where files are meant to be store and compiles monthly and yearly summaries.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
InfoLeaseExtract/venv/Lib/site-packages/pandas/io/excel/_odfreader.py

233 lines
7.4 KiB

from __future__ import annotations
import numpy as np
from pandas._typing import (
FilePath,
ReadBuffer,
Scalar,
StorageOptions,
)
from pandas.compat._optional import import_optional_dependency
import pandas as pd
from pandas.io.excel._base import BaseExcelReader
class ODFReader(BaseExcelReader):
"""
Read tables out of OpenDocument formatted files.
Parameters
----------
filepath_or_buffer : str, path to be parsed or
an open readable stream.
storage_options : dict, optional
passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``)
"""
def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
):
import_optional_dependency("odf")
super().__init__(filepath_or_buffer, storage_options=storage_options)
@property
def _workbook_class(self):
from odf.opendocument import OpenDocument
return OpenDocument
def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
from odf.opendocument import load
return load(filepath_or_buffer)
@property
def empty_value(self) -> str:
"""Property for compat with other readers."""
return ""
@property
def sheet_names(self) -> list[str]:
"""Return a list of sheet names present in the document"""
from odf.table import Table
tables = self.book.getElementsByType(Table)
return [t.getAttribute("name") for t in tables]
def get_sheet_by_index(self, index: int):
from odf.table import Table
self.raise_if_bad_sheet_by_index(index)
tables = self.book.getElementsByType(Table)
return tables[index]
def get_sheet_by_name(self, name: str):
from odf.table import Table
self.raise_if_bad_sheet_by_name(name)
tables = self.book.getElementsByType(Table)
for table in tables:
if table.getAttribute("name") == name:
return table
self.close()
raise ValueError(f"sheet {name} not found")
def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
"""
Parse an ODF Table into a list of lists
"""
from odf.table import (
CoveredTableCell,
TableCell,
TableRow,
)
covered_cell_name = CoveredTableCell().qname
table_cell_name = TableCell().qname
cell_names = {covered_cell_name, table_cell_name}
sheet_rows = sheet.getElementsByType(TableRow)
empty_rows = 0
max_row_len = 0
table: list[list[Scalar]] = []
for sheet_row in sheet_rows:
sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
empty_cells = 0
table_row: list[Scalar] = []
for sheet_cell in sheet_cells:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell, convert_float)
else:
value = self.empty_value
column_repeat = self._get_column_repeat(sheet_cell)
# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)
if max_row_len < len(table_row):
max_row_len = len(table_row)
row_repeat = self._get_row_repeat(sheet_row)
if self._is_empty_row(sheet_row):
empty_rows += row_repeat
else:
# add blank rows to our table
table.extend([[self.empty_value]] * empty_rows)
empty_rows = 0
for _ in range(row_repeat):
table.append(table_row)
# Make our table square
for row in table:
if len(row) < max_row_len:
row.extend([self.empty_value] * (max_row_len - len(row)))
return table
def _get_row_repeat(self, row) -> int:
"""
Return number of times this row was repeated
Repeating an empty row appeared to be a common way
of representing sparse rows in the table.
"""
from odf.namespaces import TABLENS
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
def _get_column_repeat(self, cell) -> int:
from odf.namespaces import TABLENS
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
def _is_empty_row(self, row) -> bool:
"""
Helper function to find empty rows
"""
for column in row.childNodes:
if len(column.childNodes) > 0:
return False
return True
def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
from odf.namespaces import OFFICENS
if str(cell) == "#N/A":
return np.nan
cell_type = cell.attributes.get((OFFICENS, "value-type"))
if cell_type == "boolean":
if str(cell) == "TRUE":
return True
return False
if cell_type is None:
return self.empty_value
elif cell_type == "float":
# GH5394
cell_value = float(cell.attributes.get((OFFICENS, "value")))
if convert_float:
val = int(cell_value)
if val == cell_value:
return val
return cell_value
elif cell_type == "percentage":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "string":
return self._get_cell_string_value(cell)
elif cell_type == "currency":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "date":
cell_value = cell.attributes.get((OFFICENS, "date-value"))
return pd.to_datetime(cell_value)
elif cell_type == "time":
stamp = pd.to_datetime(str(cell))
# error: Item "str" of "Union[float, str, NaTType]" has no attribute "time"
return stamp.time() # type: ignore[union-attr]
else:
self.close()
raise ValueError(f"Unrecognized type {cell_type}")
def _get_cell_string_value(self, cell) -> str:
"""
Find and decode OpenDocument text:s tags that represent
a run length encoded sequence of space characters.
"""
from odf.element import Element
from odf.namespaces import TEXTNS
from odf.text import S
text_s = S().qname
value = []
for fragment in cell.childNodes:
if isinstance(fragment, Element):
if fragment.qname == text_s:
spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
value.append(" " * spaces)
else:
# recursive impl needed in case of nested fragments
# with multiple spaces
# https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
value.append(self._get_cell_string_value(fragment))
else:
value.append(str(fragment))
return "".join(value)