- Integrated new ReportConfig into program

- Added full test to check everything works as expected after
small changes
- A bit of project restructuring, with switch to absolute imports
dev
= 3 years ago
parent fa7f1516c8
commit 231f5ed4ce
Signed by untrusted user who does not match committer: gprog
GPG Key ID: 5BE9BB58D37713F8
  1. 2
      .gitignore
  2. 12
      src/config.py
  3. 2
      src/configs/config_logger.toml
  4. 8
      src/configs/reports_config.toml
  5. 31
      src/helpers.py
  6. 92
      src/hold_reconciler.py
  7. 32
      src/memory.py
  8. 48
      src/reports.py
  9. 0
      tests/__init__.py
  10. 5
      tests/context.py
  11. 7
      tests/test_config.py
  12. BIN
      tests/test_inputs/April Reconciled Holds.xlsx
  13. 6
      tests/test_inputs/TEST_reports_config.toml
  14. 0
      tests/test_inputs/TestSearch/April 2023 OB.xlsx
  15. 0
      tests/test_inputs/TestSearch/April GP.xlsx
  16. 78
      tests/test_report.py

2
.gitignore vendored

@ -12,4 +12,4 @@ ghlib/
*.txt
!version.txt
!tests/test_inputs/*
!tests/test_inputs/TestSearch/*

@ -56,6 +56,12 @@ class PathsConfig:
pass # will remain as *.xlsx
def get_most_recent(self, report_type: ReportSource = None) -> Path|None| tuple[Path|None, Path|None]:
"""
Gets the most recent hold reports for OnBase and Great Plains.
If no report type is specified both OnBase & GreatPlains are returned.
If no matching reports are found, None will be returned
"""
report_files = []
report_types = [ReportSource.OB, ReportSource.GP] if report_type is None else [report_type]
@ -102,6 +108,12 @@ class PathsConfig:
@dataclass
class ReportConfig:
"""
Allows easy interaction with program configuration.
- Paths to files, db
- Report/Excel column naming
- Regexes
"""
# Paths to work with
# - input/output

@ -18,5 +18,5 @@ formatter = "custom"
filename = "on_hold.log"
[root]
level = "DEBUG"
level = "ERROR"
handlers = ["console", "file"]

@ -1,11 +1,11 @@
#### Paths: using '' makes the string 'raw' to avoid escape characters
# Path to the directory to search for input report files
input_directory = '../Reports'
input_directory = 'Work/Reports'
# Regex used to discover newest files
input_glob_pattern = { GP = "*GP*.xlsx", OB = '*OB*.xlsx'}
# Path to the directory to save the reconciliation work report
output_directory = '../Output'
output_directory = 'Work/Output'
# Fallback to interactive?
interactive_inputs = false # NOT YET IMPLEMENTED
@ -16,7 +16,7 @@ interactive_inputs = false # NOT YET IMPLEMENTED
# NOT YET IMPLEMENTED!
use_mssql = false
# Path to the SQLite database used to view/save reconcilations
database_path = './onhold_reconciliation.db'
database_path = 'src/onhold_reconciliation.db'
### Finished rec details
@ -53,7 +53,7 @@ doc_num_filters = [
"rent",
"cma"
]
po_filter = ["^(?!.*cma(\\s|\\d)).*$"]
po_filter = ['(?i)^(?!.*cma(\s|\d)).*$']
# Columns that are featured & expected on both OB & GP
[[shared_columns]]

@ -38,7 +38,7 @@ def setup_logging():
Returns:
logging.Logger: The logger instance.
"""
with open("config_logger.toml", "rb") as f:
with open("src/configs/config_logger.toml", "rb") as f:
config_dict: dict = load(f)
try:
# Try to load logging configuration from the TOML file
@ -60,31 +60,4 @@ def drop_unnamed(df: DataFrame, inplace: bool = True) -> DataFrame|None:
(on the original dataframe, not a copy!)
"""
cols = [c for c in df.columns if "Unnamed" in c]
return df.drop(cols, axis=1, inplace=inplace)
def find_most_recent_file(folder_path: Path, file_pattern: Pattern) -> str:
    """
    Return the path of the most recently modified file in the given folder
    whose basename matches the given regular expression pattern.

    Args:
        folder_path (Path): A pathlib.Path object representing the folder to search
            (not recursive).
        file_pattern (Pattern): A regular expression pattern matched against each
            file's basename to filter the candidates.

    Returns:
        str: The path of the most recently modified file that matches the pattern.

    Raises:
        FileNotFoundError: If no file in the folder matches the pattern.
            (Previously this crashed with an opaque IndexError.)
    """
    # Find all files in the folder; filtering happens below on the basename
    files = glob.glob(f"{folder_path}/*")
    logger.debug(f"files: {files}")
    # Pair each matching file with its modification time so we can pick the newest
    file_times = [
        (os.path.getmtime(path), path)
        for path in files
        if re.match(file_pattern, basename(path))
    ]
    logger.debug(f"file times: {file_times}")
    if not file_times:
        # Fail loudly with a useful message instead of IndexError on [0]
        raise FileNotFoundError(
            f"No file matching {file_pattern!r} found in {folder_path}"
        )
    # max() is O(n); no need to sort the whole list just to take the newest entry
    return max(file_times)[1]
return df.drop(cols, axis=1, inplace=inplace)

@ -4,11 +4,13 @@ then utilizes the reconcile module to find the differences between them. The out
saved as an excel file with todays date.
"""
# Custom module for reconciliation
from helpers import setup_logging, find_most_recent_file
from reports import OnBaseReport, GreatPlainsReport, ReconciledReports
from src.helpers import setup_logging
from src.reports import OnBaseReport, GreatPlainsReport, ReconciledReports
from src.config import ReportConfig
from src import ReportSource
import pandas as pd
from pandas import DataFrame
from pandas import DataFrame, read_excel, ExcelFile
import re
from re import Pattern
import logging
@ -22,54 +24,26 @@ logger = logging.getLogger(__name__)
logger.info(f"Logger started with level: {logger.level}")
def get_reports(work_dir: str, report_config: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|None]:
"""
Given a dictionary of Excel configuration options, this function searches for the most recently modified GP and OB
Excel files in a "Work" folder and returns their corresponding dataframes.
Args:
excelConfig (dict): A dictionary containing configuration options for the GP and OB Excel files.
Returns:
tuple[pd.DataFrame|None, pd.DataFrame|None]: A tuple containing the OB and GP dataframes, respectively.
"""
# Define regular expression patterns to match the GP and OB Excel files
gp_regex: Pattern = re.compile(".*gp.*\.xlsx$", re.IGNORECASE)
ob_regex: Pattern = re.compile(".*ob.*\.xlsx$", re.IGNORECASE)
# Find the paths of the most recently modified GP and OB Excel files
gp_file_path = find_most_recent_file(work_dir, gp_regex)
logger.debug(f"gp_file_path: {gp_file_path}")
ob_file_path = find_most_recent_file(work_dir, ob_regex)
logger.debug(f"gp_file_path: {ob_file_path}")
def pull_report_sheet(report_path: Path, report_source: ReportSource, report_config: ReportConfig) -> DataFrame|None:
    """
    Find and return the sheet of an Excel report workbook that contains all the
    columns required for the given report source.

    Args:
        report_path (Path): Path to the Excel workbook to scan.
        report_source (ReportSource): Which report type (OB or GP) the workbook holds;
            its ``.value`` selects the source-specific column names from the config.
        report_config (ReportConfig): Program configuration; supplies the shared
            column definitions used to validate each sheet.

    Returns:
        DataFrame|None: The first sheet containing every required column,
        or None if no sheet qualifies.
    """
    xl_file = ExcelFile(report_path)
    # Get the columns required to be a valid report for the given report type
    req_cols = [col[report_source.value] for col in report_config.shared_columns]
    # Label the log line with the actual source (the old message said "GP" even for OB)
    logger.debug(f"{report_source.value} required columns: {req_cols}")
    # Sheets available in the excel file
    sheets = xl_file.sheet_names
    # Dictionary of dataframes keyed by their sheet name
    sheet_dataframes: dict[str, DataFrame] = read_excel(xl_file, sheet_name=sheets)
    # Check each sheet's columns; the first one with every required column wins
    for sheet in sheet_dataframes:
        sheet_columns: list[str] = list(sheet_dataframes[sheet].columns)
        logger.debug(f"{report_source.value} ({sheet}) : {sheet_columns}")
        logger.debug(f"Matches {[r in sheet_columns for r in req_cols]}")
        if all(r in sheet_columns for r in req_cols):
            logger.debug(f"FOUND: {sheet}")
            return sheet_dataframes[sheet]
    # No sheet had all the required columns — caller must handle None
    return None
def main() -> int:
@ -80,23 +54,25 @@ def main() -> int:
Returns:
int: 0 if the script executes successfully.
"""
# Read the configuration options from a TOML file
with open("config_reports.toml", "rb") as f:
reports_config: dict = load(f)
logger.debug(f"Reports Config: {reports_config}")
# Read the configuration options
report_config: ReportConfig = ReportConfig.from_file(Path("src/configs/reports_config.toml"))
# Get the GP and OB dataframes from the Excel files
ob_df, gp_df = get_reports("Work", reports_config)
ob_report, gp_report = report_config.paths.get_most_recent()
print(ob_report)
print(gp_report)
ob_df: DataFrame = pull_report_sheet(ob_report, ReportSource.OB, report_config)
gp_df: DataFrame = pull_report_sheet(gp_report, ReportSource.GP, report_config)
assert not ob_df.empty, "OB Data empty!"
assert not gp_df.empty, "GP Data empty!"
obr: OnBaseReport = OnBaseReport(ob_df, reports_config)
gpr: GreatPlainsReport = GreatPlainsReport(gp_df, reports_config)
obr: OnBaseReport = OnBaseReport(ob_df, report_config)
gpr: GreatPlainsReport = GreatPlainsReport(gp_df, report_config)
rec_output: ReconciledReports = obr.reconcile(gpr)
output_name: Path = Path(f"Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx")
output_base: Path = Path(reports_config["output_path"])
output_base: Path = report_config.paths.output_directory
output_path: Path = Path(output_base, output_name)
rec_output.save_reports(output_path)

@ -7,9 +7,11 @@ resolved holds.
*Last Updated: version 1.3
"""
from helpers import drop_unnamed, setup_logging
from ghlib.database.database_manager import SQLiteManager, select_fields_statement
from src.helpers import drop_unnamed, setup_logging
from src.config import ReportConfig, ReportSource
from src.ghlib.database.database_manager import SQLiteManager, select_fields_statement
from pathlib import Path
from pandas import DataFrame, Series, read_sql_query, read_excel, concat
from numpy import NaN
from logging import getLogger
@ -28,6 +30,15 @@ def hash_cols(row: Series, cols_to_hash: list[str]) -> col_hash:
return md5_hash.hexdigest()
def create_identifier(df: DataFrame) -> DataFrame:
"""
We want to create a unique and replicable ID to identify each payment pair.
Some transactions may have one blank ID, which can cause an indeterminate hash.
For this reason we must replace empty IDs with "x" so that the hash will have a
replicable value.
Then the two IDs are hashed together using md5, resulting in a unique 32-character
identifier that can be reproduced.
"""
for id in ["ID_OB","ID_GP"]:
df[id].fillna("x", inplace=True)
df["Indentifier"] = df.apply(lambda row:
@ -37,10 +48,10 @@ def create_identifier(df: DataFrame) -> DataFrame:
df[id].replace('x',NaN, inplace=True)
return df
def save_rec(resolved_dataframes: list[DataFrame]):
def save_rec(resolved_dataframes: list[DataFrame], report_config: ReportConfig):
"""
"""
sqlManager: SQLiteManager = SQLiteManager("OnHold.db")
sqlManager: SQLiteManager = SQLiteManager(report_config.paths.db_path)
with sqlManager.get_session() as session:
rdf: DataFrame
@ -66,14 +77,13 @@ def save_rec(resolved_dataframes: list[DataFrame]):
"Indentifier",
"ID_GP",
"ID_OB",
"HideNextMonth",
"Resolution"
]
rec_cols.extend(report_config.work_columns)
rdf = rdf[rec_cols]
rdf.set_index("Indentifier", inplace=True, drop=True)
rdf.drop_duplicates(inplace=True)
rdf = rdf.dropna(axis=0, how="all", subset=["HideNextMonth", "Resolution"])
rdf = rdf.dropna(axis=0, how="all", subset=report_config.work_columns)
logger.debug(f"Saving resolutions to db:\n{rdf}")
rdf.to_sql('Resolutions',
@ -83,7 +93,7 @@ def save_rec(resolved_dataframes: list[DataFrame]):
def get_prev_reconciled(identfiers: list[col_hash]) -> DataFrame|None:
def get_prev_reconciled(identfiers: list[col_hash], db_location: Path) -> DataFrame|None:
"""
Get a DataFrame of previously reconciled contracts from an SQLite database.
@ -94,7 +104,7 @@ def get_prev_reconciled(identfiers: list[col_hash]) -> DataFrame|None:
DataFrame: A DataFrame of previously reconciled contracts, or an empty DataFrame if none are found.
"""
# Create a DB manager
sqlManager: SQLiteManager = SQLiteManager("OnHold.db")
sqlManager: SQLiteManager = SQLiteManager(db_location)
# Create a temp table to hold this batches contract numbers
# this table will be cleared when sqlManager goes out of scope
@ -139,5 +149,7 @@ if __name__ == "__main__":
no_match: DataFrame = read_excel(args.input, sheet_name="No Match")
# Amount Mismatch
amt_mm: DataFrame = read_excel(args.input, sheet_name="Amount Mismatch")
report_config = ReportConfig(Path(r"configs\reports_config.toml"))
save_rec(resolved_dataframes=[no_match, amt_mm])
save_rec(report_config, resolved_dataframes=[no_match, amt_mm])

@ -3,13 +3,16 @@ from openpyxl import Workbook, load_workbook
from abc import ABC
from logging import getLogger
import re
from re import Pattern
import datetime
from copy import deepcopy
from dataclasses import dataclass
from helpers import CN_REGEX, drop_unnamed
from memory import get_prev_reconciled, hash_cols, col_hash, create_identifier
from pathlib import Path
from src.helpers import CN_REGEX, drop_unnamed
from src.memory import get_prev_reconciled, hash_cols, col_hash, create_identifier
from src.config import ReportConfig, ReportSource
logger = getLogger(__name__)
@dataclass
@ -54,19 +57,19 @@ class HoldReport(ABC):
source = ""
def __init__(self, dataframe: DataFrame, reports_config: dict) -> None:
def __init__(self, dataframe: DataFrame, reports_config: ReportConfig) -> None:
self.config = reports_config
drop_unnamed(dataframe)
self.df = dataframe
self.df = self._add_work_columns(self.df)
self.df = self._add_work_columns(self.df, reports_config.work_columns)
self._normalize()
def _normalize(self):
# Rename the columns to standardize the column names
self.df.rename( columns= { unique_cols[self.source] : common_col
for common_col, unique_cols in self.config["shared_columns"].items()
self.df.rename( columns= { sc_dict[self.source] : sc_dict["standardized_name"]
for sc_dict in self.config.shared_columns
}, inplace=True)
# Convert the on-hold amount column to float format and round to two decimal places
@ -87,7 +90,7 @@ class HoldReport(ABC):
@staticmethod
def _remove_prev_recs(contract_match, no_match) -> \
def _remove_prev_recs(contract_match, no_match, db_location: Path) -> \
tuple[DataFrame, DataFrame, DataFrame]:
"""
"""
@ -96,7 +99,7 @@ class HoldReport(ABC):
idents.extend(create_identifier(no_match)["Indentifier"].to_list())
logger.debug(f"{idents=}")
# Get previously reconciled rows
prev_recs: DataFrame|None = get_prev_reconciled(idents)
prev_recs: DataFrame|None = get_prev_reconciled(idents, db_location)
if prev_recs is None:
logger.info("No previously reconciled!")
@ -205,19 +208,20 @@ class HoldReport(ABC):
no_match = create_identifier(no_match)
logger.debug(f"_requires_rec | no_match:\n{no_match.columns} ({no_match.shape})")
self.prev_recs, contract_match, no_match = self._remove_prev_recs(contract_match, no_match)
self.prev_recs, contract_match, no_match = self._remove_prev_recs(contract_match,
no_match, self.config.paths.db_path
)
return contract_match, no_match
@staticmethod
def _add_work_columns(df: DataFrame, work_cols: list[str]) -> DataFrame:
    """
    Add empty work columns to the dataframe to facilitate working through
    the report.

    Args:
        df (DataFrame): Report dataframe to extend (mutated in place).
        work_cols (list[str]): Column names that must exist on the dataframe;
            any that are missing are added as empty-string columns.

    Returns:
        DataFrame: The same dataframe, with all work columns present.
    """
    logger.debug("Adding work columns!")
    existing_cols: list[str] = df.columns.to_list()
    for col in work_cols:
        if col not in existing_cols:
            df[col] = ''
    return df
@ -245,7 +249,7 @@ class HoldReport(ABC):
# Formatting
columns: list[str] = ["ID_GP", "ID_OB"]
columns.extend(self.config["output_columns"])
columns.extend(self.config.finished_columns)
nm_cols:list[str] = deepcopy(columns)
nm_cols.insert(3,"onhold_amount")
@ -265,8 +269,6 @@ class HoldReport(ABC):
logger.info(f"no_match: {no_match.shape[0]}")
logger.info(f"am_mm: {amount_mismatch.shape[0]}")
reconciled: ReconciledReports = ReconciledReports(
no_match=no_match,
amt_mismatch=amount_mismatch,
@ -281,7 +283,7 @@ class OnBaseReport(HoldReport):
source = "OB"
def __init__(self, dataframe: DataFrame, reports_config: dict) -> None:
def __init__(self, dataframe: DataFrame, reports_config: ReportConfig) -> None:
self.overdue = self._get_overdue(dataframe)
super().__init__(dataframe, reports_config)
@ -300,24 +302,24 @@ class GreatPlainsReport(HoldReport):
source = "GP"
def __init__(self, dataframe: DataFrame, report_config: dict) -> None:
def __init__(self, dataframe: DataFrame, report_config: ReportConfig) -> None:
self.filtered: DataFrame = self._filter(
gp_report_df= dataframe,
doc_num_filters= report_config["gp_filters"]["doc_num_filters"],
good_po_num_regex= report_config["gp_filters"]["po_filter"]
doc_num_filters= report_config.filters["doc_num_filters"],
good_po_num_regex= report_config.filters["po_filter"][0]
)
super().__init__(dataframe, report_config)
@staticmethod
def _filter(gp_report_df: DataFrame,
doc_num_filters: list[str], good_po_num_regex: str
doc_num_filters: list[Pattern], good_po_num_regex: Pattern
) -> DataFrame:
GOOD_PO_NUM = re.compile(good_po_num_regex, re.IGNORECASE)
GOOD_PO_NUM = good_po_num_regex
bad_doc_num = ''
rx : str
bad_doc_num = '(?i)'
rx : Pattern
for rx in doc_num_filters:
bad_doc_num += f"({rx})|"
bad_doc_num = re.compile(bad_doc_num[:-1], re.IGNORECASE)

@ -1,5 +0,0 @@
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import src

@ -1,7 +1,6 @@
import unittest
from pathlib import Path
from re import Pattern, compile
from .context import src
from src import config
from src import ReportSource
@ -15,12 +14,12 @@ class TestReportConfig(unittest.TestCase):
report_config = config.ReportConfig.from_file(config_file)
# Assert the values of the attributes in the created instance
self.assertEqual(report_config.paths.input_directory, Path(r"tests\test_inputs"))
self.assertEqual(report_config.paths.input_directory, Path(r"tests\test_inputs\TestSearch"))
self.assertEqual(report_config.paths.gp_glob, r'*GP*.xlsx')
self.assertEqual(report_config.paths.ob_glob, r"*OB*.xlsx")
self.assertEqual(report_config.paths.output_directory, Path(r"tests\test_outputs"))
self.assertEqual(report_config.use_mssql, False)
self.assertEqual(report_config.paths.db_path, Path("./onhold_reconciliation.db"))
self.assertEqual(report_config.paths.db_path, Path(r"tests\test_inputs\Static\test_static_OnHold.db"))
self.assertEqual(report_config.work_columns, ["HideNextMonth", "Resolution"])
self.assertEqual(report_config.finished_columns, [
"contract_number",
@ -42,7 +41,7 @@ class TestReportConfig(unittest.TestCase):
compile(r"rent",),
compile(r"cma",),
])
self.assertEqual(report_config.filters["po_filter"], [compile(r"^(?!.*cma(\s|\d)).*$")])
self.assertEqual(report_config.filters["po_filter"], [compile(r"(?i)^(?!.*cma(\s|\d)).*$")])
self.assertEqual(report_config.shared_columns[0]["standardized_name"], "contract_number")
self.assertEqual(report_config.shared_columns[0]["GP"], "Transaction Description")
self.assertEqual(report_config.shared_columns[0]["OB"], "Contract")

@ -1,7 +1,7 @@
#### Paths: using '' makes the string 'raw' to avoid escape characters
# Path to the directory to search for input report files
input_directory = 'tests\test_inputs'
input_directory = 'tests\test_inputs\TestSearch'
# Regex used to discover newest files
input_glob_pattern = { GP = "*GP*.xlsx", OB = '*OB*.xlsx'}
# Path to the directory to save the reconciliation work report
@ -16,7 +16,7 @@ interactive_inputs = false # NOT YET IMPLEMENTED
# NOT YET IMPLEMENTED!
use_mssql = false
# Path to the SQLite database used to view/save reconcilations
database_path = './onhold_reconciliation.db'
database_path = 'tests\test_inputs\Static\test_static_OnHold.db'
### Finished rec details
@ -53,7 +53,7 @@ doc_num_filters = [
"rent",
"cma"
]
po_filter = ['^(?!.*cma(\s|\d)).*$']
po_filter = ['(?i)^(?!.*cma(\s|\d)).*$']
# Columns that are featured & expected on both OB & GP
[[shared_columns]]

@ -0,0 +1,78 @@
from pandas import DataFrame, merge, to_datetime, NaT, concat, read_excel
from pathlib import Path
from re import Pattern
import pytest as pt
from src.config import ReportConfig, ReportSource
from src.reports import GreatPlainsReport, OnBaseReport, ReconciledReports
from src.hold_reconciler import pull_report_sheet
class TestReport:
    @pt.fixture(autouse=True)
    def setup(self):
        # Shared config for every test; points at the static test inputs
        self.report_config = ReportConfig.from_file(
            Path(r"./tests/test_inputs/TEST_reports_config.toml")
        )

    def test_full(self):
        """
        Full process test.

        These test inputs will need to be adjusted any time a change is made to
        the input/output report layouts, filtering, trimming, or normalization.
        Basically, this is just to make sure everything still works after making
        TINY changes that are not meant to affect the structure/logic of the program.
        """
        ob_df = pull_report_sheet(
            Path(r"./tests/test_inputs\Static\April 2023 OB.xlsx"),
            ReportSource.OB,
            self.report_config
        )
        gp_df = pull_report_sheet(
            Path(r"./tests/test_inputs\Static\April GP.xlsx"),
            ReportSource.GP,
            self.report_config
        )
        # pull_report_sheet returns None when no sheet matches; guard before .empty
        # so a failure reads "OB Data empty!" instead of an AttributeError on None
        assert ob_df is not None and not ob_df.empty, "OB Data empty!"
        assert gp_df is not None and not gp_df.empty, "GP Data empty!"
        obr: OnBaseReport = OnBaseReport(ob_df, self.report_config)
        gpr: GreatPlainsReport = GreatPlainsReport(gp_df, self.report_config)
        rec_output: ReconciledReports = obr.reconcile(gpr)
        output_path: Path = Path(
            self.report_config.paths.output_directory,
            "TEST_REPORT.xlsx"
        )
        rec_output.save_reports(output_path)
        SHEET_NAMES = [
            "No Match",
            "Amount Mismatch",
            "Overdue",
            "Previously Reconciled",
            "Filtered from GP",
        ]
        # Known-good output produced by a previous, manually verified run
        CONTROL: dict[str, DataFrame] = read_excel(
            Path(r"./tests/test_inputs/Static/Reconciled Holds [TEST_FIN].xlsx"),
            sheet_name=SHEET_NAMES
        )
        new: dict[str, DataFrame] = read_excel(
            output_path,
            sheet_name=SHEET_NAMES
        )
        # Compare sheet by sheet; the prints surface the diff when a sheet mismatches
        for sheet in SHEET_NAMES:
            print(sheet)
            print(new[sheet])
            print("Control: ")
            print(CONTROL[sheet])
            assert new[sheet].equals(CONTROL[sheet]), f"Sheet {sheet!r} differs from control"
Loading…
Cancel
Save