import pandas as pd from pandas import DataFrame from datetime import datetime as dt import datetime import re from typing import Literal import logging logger = logging.getLogger(__name__) def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame: """ Given a DataFrame containing OnBase installation data and a dictionary containing the OnBase Excel configuration, this function returns a DataFrame containing the rows from `onbase_df` that have an installation date that is before the current date. Args: onbase_df (pd.DataFrame): A pandas DataFrame containing OnBase installation data. onbase_excel_config (dict): A dictionary containing the OnBase Excel configuration. Returns: pd.DataFrame: A pandas DataFrame containing the rows from `onbase_df` that have an installation date that is before the current date. """ id_col = onbase_excel_config["install_date"] onbase_df[id_col] = pd.to_datetime(onbase_df[id_col]) onbase_df[id_col].fillna(pd.NaT, inplace=True) return onbase_df[onbase_df[id_col].dt.date < datetime.date.today()] def filter_gp(gp_dataframe: pd.DataFrame, full_config: dict) -> pd.DataFrame: """ Given a pandas DataFrame containing GP data and a dictionary containing the GP configuration, this function filters out rows from the DataFrame that are not needed for further analysis based on certain criteria. Args: gp_dataframe (pd.DataFrame): A pandas DataFrame containing GP data. gp_config (dict): A dictionary containing the GP configuration. Returns: pd.DataFrame: A pandas DataFrame containing the filtered GP data. """ # Excludes anything that contains cma with a space or digit following it # CMA23532 would be excluded but 'John Locman' would be allowed GOOD_PO_NUM = re.compile(r"^(?!.*cma(\s|\d)).*$", re.IGNORECASE) gp_config: dict = full_config["ExcelColumns"]["GP"] doc_num_regexes: list[str] = full_config["DocNumFilter"] bad_doc_num = '' rx : str for rx in doc_num_regexes: bad_doc_num += f"({rx})|" bad_doc_num = re.compile(bad_doc_num[:-1], re.IGNORECASE) logger.debug(f"Doc # filter: {bad_doc_num}") # Create a filter/mask to use on the data mask = ( (gp_dataframe[gp_config['doc_type']] == "Invoice") & (gp_dataframe[gp_config['pur_order']].str.contains(GOOD_PO_NUM)) ) # Get the rows to drop based on the filter/mask rows_to_drop = gp_dataframe[~mask].index # Drop the rows and return the filtered DataFrame filtered_df = gp_dataframe.drop(rows_to_drop, inplace=False) mask = filtered_df[gp_config['doc_num']].str.contains(bad_doc_num) rows_to_drop = filtered_df[mask].index return filtered_df.drop(rows_to_drop, inplace=False) def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"], excelConfig: dict): """ Given a pandas DataFrame containing transaction data, the source of the data ("GP" or "OB"), and a dictionary containing the Excel configuration, this function creates a new DataFrame with columns for the contract number, the amount on hold, a unique transaction ID, and the source of the data. Args: dataframe (pd.DataFrame): A pandas DataFrame containing transaction data. source (Literal["GP", "OB"]): The source of the data ("GP" or "OB"). excelConfig (dict): A dictionary containing the Excel configuration. Returns: pd.DataFrame: A pandas DataFrame containing the contract number, amount on hold, transaction ID, and data source for each transaction in the original DataFrame. """ column_config: dict = excelConfig[source] logger.debug(f"column_config: {column_config}") # Create a new DataFrame with the contract number and on-hold amount columns transactions = dataframe[[column_config["contract_number"], column_config["onhold_amount"]]].copy() # Rename the columns to standardize the column names transactions.rename(columns={ column_config["contract_number"]: "contract_number", column_config["onhold_amount"]: "onhold_amount", }, inplace=True) # Convert the on-hold amount column to float format and round to two decimal places transactions["onhold_amount"] = transactions["onhold_amount"].astype(float).round(2) # Use regex to extract the contract number from the column values and create a new column with the standardized format CN_REGEX = re.compile(r"\d{7}(-\d{3})?") transactions["contract_number"] = transactions["contract_number"].apply( lambda cn: str(cn) if not re.search(CN_REGEX, str(cn)) else re.search(CN_REGEX, str(cn)).group(0) ) # Create a new column with a unique transaction ID transactions["ID"] = transactions["contract_number"] +'_'+\ transactions["onhold_amount"].astype(str) # Create a new column with the data source transactions["Source"] = source # Return the new DataFrame with the contract number, on-hold amount, transaction ID, and data source columns return transactions def get_no_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame): """ Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns a new DataFrame containing only the transactions that do not have a match in both the OBT and GPT DataFrames. Args: obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT. gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT. Returns: pd.DataFrame: A pandas DataFrame containing the transactions that do not have a match in both the OBT and GPT DataFrames. """ # Merge the two DataFrames using the contract number as the join key merged_df = pd.merge( obt_df, gpt_df, how="outer", on=["contract_number"], suffixes=("_ob", "_gp") ) # Filter the merged DataFrame to include only the transactions that do not have a match in both OBT and GPT no_match = merged_df.loc[ (merged_df["Source_ob"].isna()) | (merged_df["Source_gp"].isna()) ] # Fill in missing values and drop unnecessary columns no_match["Source"] = no_match["Source_ob"].fillna("GP") no_match["onhold_amount"] = no_match["onhold_amount_ob"].fillna(no_match["onhold_amount_gp"]) no_match.drop(columns=[ "ID_ob", "ID_gp", "onhold_amount_ob", "onhold_amount_gp", "Source_ob", "Source_gp" ], inplace=True) # Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns no_match = no_match[ [ "Source", "contract_number", "onhold_amount"] ] return no_match def get_not_full_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame): """ Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns two new DataFrames. The first DataFrame contains the transactions that have a full match on both the OBT and GPT DataFrames, and the second DataFrame contains the transactions that do not have a full match. Args: obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT. gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT. Returns: tuple(pd.DataFrame, pd.DataFrame): A tuple of two DataFrames. The first DataFrame contains the transactions that have a full match on both the OBT and GPT DataFrames, and the second DataFrame contains the transactions that do not have a full match. """ # Combine the two DataFrames using an outer join on the contract number and on-hold amount merged_df = pd.merge( obt_df, gpt_df, how="outer", on=["ID", "contract_number", "onhold_amount"], suffixes=("_ob", "_gp") ) # Filter the merged DataFrame to include only the transactions that have a full match in both OBT and GPT full_matched = merged_df.dropna(subset=["Source_ob", "Source_gp"]) full_matched.drop(columns=["Source_ob", "Source_gp"], inplace=True) # Create a boolean mask for the rows to drop in full_matched mask = merged_df["ID"].isin(full_matched["ID"]) # Use the mask to remove the selected rows and create a new DataFrame for not full match not_full_match = merged_df[~mask] # This includes items that DO match contracts, but not amounts # It can have multiple items from one source with the same contract number # Create a new column with the data source, using OBT as the default and GPT as backup if missing not_full_match["Source"] = not_full_match["Source_ob"].fillna(not_full_match["Source_gp"]) # Drop the redundant Source columns not_full_match.drop(columns=["Source_ob", "Source_gp"], inplace=True) # Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns not_full_match = not_full_match[ [ "Source", "contract_number", "onhold_amount"] ] # Return the two DataFrames return full_matched, not_full_match def get_contract_match(not_full_match: pd.DataFrame) -> pd.DataFrame: """ Given a pandas DataFrame containing transactions that do not have a full match between OBT and GPT, this function returns a new DataFrame containing only the transactions that have a matching contract number in both OBT and GPT. Args: not_full_match (pd.DataFrame): A pandas DataFrame containing transactions that do not have a full match between OBT and GPT. Returns: pd.DataFrame: A pandas DataFrame containing only the transactions that have a matching contract number in both OBT and GPT. """ # Filter the not_full_match DataFrame by source ob_df = not_full_match[not_full_match["Source"] == "OB"] gp_df = not_full_match[not_full_match["Source"] == "GP"] # Merge the two filtered DataFrames on the contract number contract_match = pd.merge( ob_df, gp_df, how="inner", on=["contract_number"], suffixes=("_ob", "_gp") ) # Fill in missing values in the Source column and drop the redundant columns contract_match.drop(columns=["Source_ob", "Source_gp"], inplace=True) # Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns contract_match = contract_match[ [ "contract_number", "onhold_amount_ob", "onhold_amount_gp"] ] return contract_match