OnHoldReconciler/reconcile_holds.py

import pandas as pd
from pandas import DataFrame, Series
import re
from re import Pattern
import os
from os.path import basename
import glob
import logging
from pathlib import Path
from tomllib import load
import logging.config
from datetime import datetime as dt

"""
[ ] Pull in past reconciliations to check against
[ ] Record reconciled transaction (connect with VBA)
[ ] Check GP against the database
[ ] Check OB against the database
"""

# Custom module for reconciliation
from rec_lib import get_contract_match, get_no_match, \
    get_not_full_match, get_overdue, filter_gp, create_transaction_df

def setup_logging():
    """
    Sets up logging configuration from the TOML file. If the logging configuration fails to be loaded from the file,
    a default logging configuration is used instead.

    Returns:
        logging.Logger: The logger instance.
    """
    with open("config.toml", "rb") as f:
        config_dict: dict = load(f)
        try:
            # Try to load logging configuration from the TOML file
            logging.config.dictConfig(config_dict["logger"])
        except Exception as e:
            # If the logging configuration fails, use a default configuration and log the error
            logger = logging.getLogger()
            logger.setLevel(logging.DEBUG)
            logger.warning("Failed setting up logger!")
            logger.exception(e)
            logger.warning(f"Config:\n{config_dict}")
            return logger


setup_logging()
logger = logging.getLogger(__name__)
logger.info(f"Logger started with level: {logger.level}")

def find_most_recent_file(folder_path: Path, file_pattern: Pattern) -> str:
    """
    Given a folder path and a regular expression pattern, this function returns the path of the most recently modified
    file in the folder that matches the pattern.

    Args:
        folder_path (Path): A pathlib.Path object representing the folder to search.
        file_pattern (Pattern): A regular expression pattern used to filter the files in the folder.

    Returns:
        str: The path of the most recently modified file in the folder that matches the pattern.
    """
    # Find all files in the folder that match the pattern
    files = glob.glob(f"{folder_path}/*")
    logger.debug(f"files: {files}")

    # Get the modification time of each file and filter to only those that match the pattern
    file_times = [(os.path.getmtime(path), path) for path in files if re.match(file_pattern, basename(path))]

    # Sort the files by modification time (most recent first)
    file_times.sort(reverse=True)
    logger.debug(f"file times: {file_times}")

    # Return the path of the most recent file
    return file_times[0][1]


def check_sheet(df_cols: list[str], excel_col_config: dict) -> bool:
    """
    Given a list of column names and a dictionary of column name configurations, this function checks if the required
    columns are present in the list of column names.

    Args:
        df_cols (list[str]): A list of column names.
        excel_col_config (dict): A dictionary of column name configurations.

    Returns:
        bool: True if all of the required columns are present in the list of column names, False otherwise.
    """
    # Get the list of required columns from the column configuration dictionary
    required_cols: list[str] = list(excel_col_config.values())
    # Check if all of the required columns are present in the list of column names
    return all([col in df_cols for col in required_cols])


def get_dataframes(work_dir: str, excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|None]:
    """
    Given a dictionary of Excel configuration options, this function searches for the most recently modified GP and OB
    Excel files in a "Work" folder and returns their corresponding dataframes.

    Args:
        excelConfig (dict): A dictionary containing configuration options for the GP and OB Excel files.

    Returns:
        tuple[pd.DataFrame|None, pd.DataFrame|None]: A tuple containing the OB and GP dataframes, respectively.
    """

    # Define regular expression patterns to match the GP and OB Excel files
    gp_regex: Pattern = re.compile(".*gp.*\.xlsx$", re.IGNORECASE)
    ob_regex: Pattern = re.compile(".*ob.*\.xlsx$", re.IGNORECASE)

    # Find the paths of the most recently modified GP and OB Excel files
    gp_file_path = find_most_recent_file(work_dir, gp_regex)
    logger.debug(f"gp_file_path: {gp_file_path}")
    ob_file_path = find_most_recent_file(work_dir, ob_regex)
    logger.debug(f"gp_file_path: {ob_file_path}")

    # Read the GP and OB Excel files into dataframes and check that each dataframe has the required columns
    gp_xl = pd.ExcelFile(gp_file_path)
    gp_config = excelConfig["GP"]
    gp_sheets = gp_xl.sheet_names
    gp_dfs = pd.read_excel(gp_xl, sheet_name=gp_sheets)
    for sheet in gp_dfs:
        if check_sheet(gp_dfs[sheet].columns, gp_config):
            gp_df = gp_dfs[sheet]
            break

    ob_xl = pd.ExcelFile(ob_file_path)
    ob_config = excelConfig["OB"]
    ob_sheets = ob_xl.sheet_names
    ob_dfs = pd.read_excel(ob_xl, sheet_name=ob_sheets)
    for sheet in ob_dfs:
        if check_sheet(ob_dfs[sheet].columns, ob_config):
            ob_df = ob_dfs[sheet]
            break

    return ob_df, gp_df


def main() -> int:
    """
    This is the main function for the script. It reads configuration options from a TOML file, reads in the GP and OB
    Excel files, performs data reconciliation and analysis, and writes the results to a new Excel file.

    Returns:
        int: 0 if the script executes successfully.
    """
    # Read the configuration options from a TOML file
    with open("config.toml", "rb") as f:
        config_dict: dict = load(f)
    logger.debug(f"Config: {config_dict}")

    excelConfig: dict = config_dict["ExcelColumns"]

    # Get the GP and OB dataframes from the Excel files
    ob_df, gp_df = get_dataframes(config_dict["write_dir"] ,excelConfig)
    assert not ob_df.empty, "OB Data empty!"
    assert not gp_df.empty, "GP Data empty!"

    # Filter the GP dataframe to include only relevant transactions
    fgp_df: DataFrame = filter_gp(gp_df, config_dict)
    # Get the overdue transactions from the OB dataframe
    overdue: DataFrame = get_overdue(ob_df, excelConfig["OB"])

    # Create transaction dataframes for the GP and OB dataframes
    ob_transactions: DataFrame = create_transaction_df(ob_df, 'OB', excelConfig)
    gp_transactions: DataFrame = create_transaction_df(fgp_df, 'GP', excelConfig)

    # Get the transactions that do not have matches in both the GP and OB dataframes
    no_match: DataFrame = get_no_match(ob_transactions, gp_transactions)

    # Get the transactions that have matches in both the GP and OB dataframes but have amount mismatches
    full_match, not_full_match = get_not_full_match(ob_transactions, gp_transactions)
    only_contracts_match: DataFrame = get_contract_match(not_full_match)

    # Write the results to a new Excel file
    with pd.ExcelWriter(f"{config_dict['write_dir']}/Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx", mode='w') as writer:
        full_match.to_excel(writer,sheet_name="FULL", index=False)
        no_match.to_excel(writer, sheet_name="No Match", index=False)
        only_contracts_match.to_excel(writer, sheet_name="Amount Mismatch", index=False)
        overdue.to_excel(writer, sheet_name="Overdue", index=False)

    return 0


if __name__ == "__main__":
    print("Starting")
    main()
    print("Completed")