"""Hold-reconciliation script.

Loads the most recent GP and OB Excel exports from the configured work
directory, reconciles their transactions via ``rec_lib``, and writes the
results to a dated Excel workbook.

TODO:
    [ ] Pull in past reconciliations to check against
    [ ] Record reconciled transaction (connect with VBA)
    [ ] Check GP against the database
    [ ] Check OB against the database
"""

import glob
import logging
import logging.config
import os
import re
from datetime import datetime as dt
from os.path import basename
from pathlib import Path
from re import Pattern
from tomllib import load

import pandas as pd
from pandas import DataFrame, Series

# Custom module for reconciliation
from rec_lib import (
    create_transaction_df,
    filter_gp,
    get_contract_match,
    get_no_match,
    get_not_full_match,
    get_overdue,
)


def setup_logging() -> logging.Logger:
    """Set up logging configuration from the ``config.toml`` file.

    If applying the file's ``[logger]`` section fails, a default DEBUG
    root-logger configuration is used instead and the error is logged.

    Returns:
        logging.Logger: The root logger instance.
    """
    with open("config.toml", "rb") as f:
        config_dict: dict = load(f)
    # BUG FIX: the original bound ``logger`` only inside the except branch,
    # so a *successful* dictConfig raised UnboundLocalError on return.
    logger = logging.getLogger()
    try:
        # Try to load logging configuration from the TOML file
        logging.config.dictConfig(config_dict["logger"])
    except Exception as e:
        # Configuration failed: fall back to a default setup and log why
        logger.setLevel(logging.DEBUG)
        logger.warning("Failed setting up logger!")
        logger.exception(e)
        logger.warning(f"Config:\n{config_dict}")
    return logger


setup_logging()
logger = logging.getLogger(__name__)
logger.info(f"Logger started with level: {logger.level}")


def find_most_recent_file(folder_path: Path, file_pattern: Pattern) -> str:
    """Return the most recently modified file matching *file_pattern*.

    Args:
        folder_path (Path): Folder to search (a plain str also works).
        file_pattern (Pattern): Compiled regex matched against each
            file's basename.

    Returns:
        str: Path of the most recently modified matching file.

    Raises:
        FileNotFoundError: If no file in the folder matches the pattern.
    """
    files = glob.glob(f"{folder_path}/*")
    logger.debug(f"files: {files}")
    # Pair each matching file with its modification time
    file_times = [
        (os.path.getmtime(path), path)
        for path in files
        if re.match(file_pattern, basename(path))
    ]
    if not file_times:
        # BUG FIX: the original raised an opaque IndexError here
        raise FileNotFoundError(
            f"No file in {folder_path} matches {file_pattern.pattern!r}"
        )
    # Sort by modification time, most recent first
    file_times.sort(reverse=True)
    logger.debug(f"file times: {file_times}")
    return file_times[0][1]


def check_sheet(df_cols: list[str], excel_col_config: dict) -> bool:
    """Check that all configured columns are present in *df_cols*.

    Args:
        df_cols (list[str]): Column names found on a worksheet.
        excel_col_config (dict): Column configuration; its *values* are
            the required column names.

    Returns:
        bool: True iff every required column is present.
    """
    required_cols: list[str] = list(excel_col_config.values())
    # Generator expression avoids building an intermediate list
    return all(col in df_cols for col in required_cols)


def _find_matching_sheet(file_path: str, col_config: dict) -> pd.DataFrame | None:
    """Return the first sheet of the workbook at *file_path* that contains
    all columns required by *col_config*, or None if no sheet qualifies."""
    xl = pd.ExcelFile(file_path)
    sheet_dfs = pd.read_excel(xl, sheet_name=xl.sheet_names)
    for _, df in sheet_dfs.items():
        if check_sheet(df.columns, col_config):
            return df
    return None


def get_dataframes(
    work_dir: str, excelConfig: dict
) -> tuple[pd.DataFrame | None, pd.DataFrame | None]:
    """Locate and load the most recent GP and OB workbooks in *work_dir*.

    Each workbook's sheets are scanned and the first sheet containing the
    configured columns is used.

    Args:
        work_dir (str): Directory containing the Excel files.
        excelConfig (dict): Configuration with "GP" and "OB" column
            sections.

    Returns:
        tuple[pd.DataFrame | None, pd.DataFrame | None]: The (OB, GP)
        dataframes; None for a workbook with no matching sheet.
    """
    # Raw strings: "\." in a non-raw string is an invalid escape sequence
    gp_regex: Pattern = re.compile(r".*gp.*\.xlsx$", re.IGNORECASE)
    ob_regex: Pattern = re.compile(r".*ob.*\.xlsx$", re.IGNORECASE)

    gp_file_path = find_most_recent_file(work_dir, gp_regex)
    logger.debug(f"gp_file_path: {gp_file_path}")
    ob_file_path = find_most_recent_file(work_dir, ob_regex)
    # BUG FIX: this log line was mislabelled "gp_file_path"
    logger.debug(f"ob_file_path: {ob_file_path}")

    # BUG FIX: the original left gp_df/ob_df unbound (NameError) when no
    # sheet matched; now None is returned as the annotation promises.
    gp_df = _find_matching_sheet(gp_file_path, excelConfig["GP"])
    ob_df = _find_matching_sheet(ob_file_path, excelConfig["OB"])
    return ob_df, gp_df


def main() -> int:
    """Run the reconciliation and write the results workbook.

    Reads options from ``config.toml``, loads the GP and OB workbooks,
    matches their transactions, and writes a dated Excel file to the
    configured ``write_dir``.

    Returns:
        int: 0 if the script executes successfully.

    Raises:
        ValueError: If either workbook yields no usable data.
    """
    # Read the configuration options from a TOML file
    with open("config.toml", "rb") as f:
        config_dict: dict = load(f)
    logger.debug(f"Config: {config_dict}")
    excelConfig: dict = config_dict["ExcelColumns"]

    # Get the GP and OB dataframes from the Excel files
    ob_df, gp_df = get_dataframes(config_dict["write_dir"], excelConfig)
    # BUG FIX: explicit raises instead of assert (asserts are stripped
    # under ``python -O``); also guards the None case from get_dataframes.
    if ob_df is None or ob_df.empty:
        raise ValueError("OB Data empty!")
    if gp_df is None or gp_df.empty:
        raise ValueError("GP Data empty!")

    # Filter the GP dataframe to only the relevant transactions
    fgp_df: DataFrame = filter_gp(gp_df, config_dict)
    # Overdue transactions from the OB data
    overdue: DataFrame = get_overdue(ob_df, excelConfig["OB"])

    # Normalized transaction dataframes for both systems
    ob_transactions: DataFrame = create_transaction_df(ob_df, 'OB', excelConfig)
    gp_transactions: DataFrame = create_transaction_df(fgp_df, 'GP', excelConfig)

    # Transactions with no counterpart in the other system
    no_match: DataFrame = get_no_match(ob_transactions, gp_transactions)
    # Matched transactions, split on whether the amounts also agree
    full_match, not_full_match = get_not_full_match(ob_transactions, gp_transactions)
    only_contracts_match: DataFrame = get_contract_match(not_full_match)

    # Write the results to a new, dated Excel file
    out_path = (
        f"{config_dict['write_dir']}/Reconciled Holds "
        f"[{dt.now().strftime('%m-%d-%Y')}].xlsx"
    )
    with pd.ExcelWriter(out_path, mode='w') as writer:
        full_match.to_excel(writer, sheet_name="FULL", index=False)
        no_match.to_excel(writer, sheet_name="No Match", index=False)
        only_contracts_match.to_excel(writer, sheet_name="Amount Mismatch", index=False)
        overdue.to_excel(writer, sheet_name="Overdue", index=False)
    return 0


if __name__ == "__main__":
    print("Starting")
    main()
    print("Completed")