You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
OnHoldReconciler/reconcile_holds.py

190 lines
7.4 KiB

import pandas as pd
from pandas import DataFrame, Series
import re
from re import Pattern
import os
from os.path import basename
import glob
import logging
from pathlib import Path
from tomllib import load
import logging.config
from datetime import datetime as dt
"""
[ ] Pull in past reconciliations to check against
[ ] Record reconciled transaction (connect with VBA)
[ ] Check GP against the database
[ ] Check OB against the database
"""
# Custom module for reconciliation
from rec_lib import get_contract_match, get_no_match, \
get_not_full_match, get_overdue, filter_gp, create_transaction_df
def setup_logging():
"""
Sets up logging configuration from the TOML file. If the logging configuration fails to be loaded from the file,
a default logging configuration is used instead.
Returns:
logging.Logger: The logger instance.
"""
with open("config.toml", "rb") as f:
config_dict: dict = load(f)
try:
# Try to load logging configuration from the TOML file
logging.config.dictConfig(config_dict["logger"])
except Exception as e:
# If the logging configuration fails, use a default configuration and log the error
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.warning("Failed setting up logger!")
logger.exception(e)
logger.warning(f"Config:\n{config_dict}")
return logger
setup_logging()
logger = logging.getLogger(__name__)
logger.info(f"Logger started with level: {logger.level}")
def find_most_recent_file(folder_path: Path, file_pattern: Pattern) -> str:
"""
Given a folder path and a regular expression pattern, this function returns the path of the most recently modified
file in the folder that matches the pattern.
Args:
folder_path (Path): A pathlib.Path object representing the folder to search.
file_pattern (Pattern): A regular expression pattern used to filter the files in the folder.
Returns:
str: The path of the most recently modified file in the folder that matches the pattern.
"""
# Find all files in the folder that match the pattern
files = glob.glob(f"{folder_path}/*")
logger.debug(f"files: {files}")
# Get the modification time of each file and filter to only those that match the pattern
file_times = [(os.path.getmtime(path), path) for path in files if re.match(file_pattern, basename(path))]
# Sort the files by modification time (most recent first)
file_times.sort(reverse=True)
logger.debug(f"file times: {file_times}")
# Return the path of the most recent file
return file_times[0][1]
def check_sheet(df_cols: list[str], excel_col_config: dict) -> bool:
"""
Given a list of column names and a dictionary of column name configurations, this function checks if the required
columns are present in the list of column names.
Args:
df_cols (list[str]): A list of column names.
excel_col_config (dict): A dictionary of column name configurations.
Returns:
bool: True if all of the required columns are present in the list of column names, False otherwise.
"""
# Get the list of required columns from the column configuration dictionary
required_cols: list[str] = list(excel_col_config.values())
# Check if all of the required columns are present in the list of column names
return all([col in df_cols for col in required_cols])
def get_dataframes(work_dir: str, excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|None]:
"""
Given a dictionary of Excel configuration options, this function searches for the most recently modified GP and OB
Excel files in a "Work" folder and returns their corresponding dataframes.
Args:
excelConfig (dict): A dictionary containing configuration options for the GP and OB Excel files.
Returns:
tuple[pd.DataFrame|None, pd.DataFrame|None]: A tuple containing the OB and GP dataframes, respectively.
"""
# Define regular expression patterns to match the GP and OB Excel files
gp_regex: Pattern = re.compile(".*gp.*\.xlsx$", re.IGNORECASE)
ob_regex: Pattern = re.compile(".*ob.*\.xlsx$", re.IGNORECASE)
# Find the paths of the most recently modified GP and OB Excel files
gp_file_path = find_most_recent_file(work_dir, gp_regex)
logger.debug(f"gp_file_path: {gp_file_path}")
ob_file_path = find_most_recent_file(work_dir, ob_regex)
logger.debug(f"gp_file_path: {ob_file_path}")
# Read the GP and OB Excel files into dataframes and check that each dataframe has the required columns
gp_xl = pd.ExcelFile(gp_file_path)
gp_config = excelConfig["GP"]
gp_sheets = gp_xl.sheet_names
gp_dfs = pd.read_excel(gp_xl, sheet_name=gp_sheets)
for sheet in gp_dfs:
if check_sheet(gp_dfs[sheet].columns, gp_config):
gp_df = gp_dfs[sheet]
break
ob_xl = pd.ExcelFile(ob_file_path)
ob_config = excelConfig["OB"]
ob_sheets = ob_xl.sheet_names
ob_dfs = pd.read_excel(ob_xl, sheet_name=ob_sheets)
for sheet in ob_dfs:
if check_sheet(ob_dfs[sheet].columns, ob_config):
ob_df = ob_dfs[sheet]
break
return ob_df, gp_df
def main() -> int:
"""
This is the main function for the script. It reads configuration options from a TOML file, reads in the GP and OB
Excel files, performs data reconciliation and analysis, and writes the results to a new Excel file.
Returns:
int: 0 if the script executes successfully.
"""
# Read the configuration options from a TOML file
with open("config.toml", "rb") as f:
config_dict: dict = load(f)
logger.debug(f"Config: {config_dict}")
excelConfig: dict = config_dict["ExcelColumns"]
# Get the GP and OB dataframes from the Excel files
ob_df, gp_df = get_dataframes(config_dict["write_dir"] ,excelConfig)
assert not ob_df.empty, "OB Data empty!"
assert not gp_df.empty, "GP Data empty!"
# Filter the GP dataframe to include only relevant transactions
fgp_df: DataFrame = filter_gp(gp_df, config_dict)
# Get the overdue transactions from the OB dataframe
overdue: DataFrame = get_overdue(ob_df, excelConfig["OB"])
# Create transaction dataframes for the GP and OB dataframes
ob_transactions: DataFrame = create_transaction_df(ob_df, 'OB', excelConfig)
gp_transactions: DataFrame = create_transaction_df(fgp_df, 'GP', excelConfig)
# Get the transactions that do not have matches in both the GP and OB dataframes
no_match: DataFrame = get_no_match(ob_transactions, gp_transactions)
# Get the transactions that have matches in both the GP and OB dataframes but have amount mismatches
full_match, not_full_match = get_not_full_match(ob_transactions, gp_transactions)
only_contracts_match: DataFrame = get_contract_match(not_full_match)
# Write the results to a new Excel file
with pd.ExcelWriter(f"{config_dict['write_dir']}/Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx", mode='w') as writer:
full_match.to_excel(writer,sheet_name="FULL", index=False)
no_match.to_excel(writer, sheet_name="No Match", index=False)
only_contracts_match.to_excel(writer, sheet_name="Amount Mismatch", index=False)
overdue.to_excel(writer, sheet_name="Overdue", index=False)
return 0
if __name__ == "__main__":
print("Starting")
main()
print("Completed")