OnHoldReconciler/rec_lib.py

import pandas as pd
from pandas import DataFrame
from datetime import datetime as dt
import datetime
import re
from typing import Literal
import logging


logger = logging.getLogger(__name__)


def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame:
    """
    Given a DataFrame containing OnBase installation data and a dictionary containing the OnBase Excel configuration,
    this function returns a DataFrame containing the rows from `onbase_df` that have an installation date that is before
    the current date.

    Args:
        onbase_df (pd.DataFrame): A pandas DataFrame containing OnBase installation data.
        onbase_excel_config (dict): A dictionary containing the OnBase Excel configuration.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the rows from `onbase_df` that have an installation date that is before
        the current date.
    """
    id_col = onbase_excel_config["install_date"]
    onbase_df[id_col] = pd.to_datetime(onbase_df[id_col])
    onbase_df[id_col].fillna(pd.NaT, inplace=True)
    return  onbase_df[onbase_df[id_col].dt.date < datetime.date.today()]


def filter_gp(gp_dataframe: pd.DataFrame, full_config: dict) -> pd.DataFrame:
    """
    Given a pandas DataFrame containing GP data and a dictionary containing the GP configuration, this function
    filters out rows from the DataFrame that are not needed for further analysis based on certain criteria.

    Args:
        gp_dataframe (pd.DataFrame): A pandas DataFrame containing GP data.
        gp_config (dict): A dictionary containing the GP configuration.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the filtered GP data.
    """

    # Excludes anything that contains cma with a space or digit following it
    # CMA23532 would be excluded but 'John Locman' would be allowed
    GOOD_PO_NUM = re.compile(r"^(?!.*cma(\s|\d)).*$", re.IGNORECASE)

    gp_config: dict = full_config["ExcelColumns"]["GP"]
    doc_num_regexes: list[str] = full_config["DocNumFilter"]

    bad_doc_num = ''
    rx : str
    for rx in doc_num_regexes:
        bad_doc_num += f"({rx})|"
    bad_doc_num = re.compile(bad_doc_num[:-1], re.IGNORECASE)
    logger.debug(f"Doc # filter: {bad_doc_num}")
    # Create a filter/mask to use on the data
    mask = (
        (gp_dataframe[gp_config['doc_type']] == "Invoice") &
        (gp_dataframe[gp_config['pur_order']].str.contains(GOOD_PO_NUM))
    )

    # Get the rows to drop based on the filter/mask
    rows_to_drop = gp_dataframe[~mask].index

    # Drop the rows and return the filtered DataFrame
    filtered_df = gp_dataframe.drop(rows_to_drop, inplace=False)

    mask = filtered_df[gp_config['doc_num']].str.contains(bad_doc_num)
    rows_to_drop = filtered_df[mask].index

    return filtered_df.drop(rows_to_drop, inplace=False)


def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"], excelConfig: dict):
    """
    Given a pandas DataFrame containing transaction data, the source of the data ("GP" or "OB"), and a dictionary
    containing the Excel configuration, this function creates a new DataFrame with columns for the contract number,
    the amount on hold, a unique transaction ID, and the source of the data.

    Args:
        dataframe (pd.DataFrame): A pandas DataFrame containing transaction data.
        source (Literal["GP", "OB"]): The source of the data ("GP" or "OB").
        excelConfig (dict): A dictionary containing the Excel configuration.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the contract number, amount on hold, transaction ID, and data source
        for each transaction in the original DataFrame.
    """
    column_config: dict = excelConfig[source]
    logger.debug(f"column_config: {column_config}")
    # Create a new DataFrame with the contract number and on-hold amount columns
    transactions = dataframe[[column_config["contract_number"], column_config["onhold_amount"]]].copy()

    # Rename the columns to standardize the column names
    transactions.rename(columns={
        column_config["contract_number"]: "contract_number",
        column_config["onhold_amount"]: "onhold_amount",
    }, inplace=True)

    # Convert the on-hold amount column to float format and round to two decimal places
    transactions["onhold_amount"] = transactions["onhold_amount"].astype(float).round(2)

    # Use regex to extract the contract number from the column values and create a new column with the standardized format
    CN_REGEX = re.compile(r"\d{7}(-\d{3})?")
    transactions["contract_number"] = transactions["contract_number"].apply(
        lambda cn: str(cn) if not re.search(CN_REGEX, str(cn))
        else re.search(CN_REGEX, str(cn)).group(0)
    )

    # Create a new column with a unique transaction ID
    transactions["ID"] = transactions["contract_number"] +'_'+\
        transactions["onhold_amount"].astype(str)

    # Create a new column with the data source
    transactions["Source"] = source

    # Return the new DataFrame with the contract number, on-hold amount, transaction ID, and data source columns
    return transactions


def get_no_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame):
    """
    Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns a new
    DataFrame containing only the transactions that do not have a match in both the OBT and GPT DataFrames.

    Args:
        obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT.
        gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the transactions that do not have a match in both the OBT and GPT
        DataFrames.
    """
    # Merge the two DataFrames using the contract number as the join key
    merged_df = pd.merge(
        obt_df, gpt_df,
        how="outer",
        on=["contract_number"],
        suffixes=("_ob", "_gp")
    )

    # Filter the merged DataFrame to include only the transactions that do not have a match in both OBT and GPT
    no_match = merged_df.loc[
        (merged_df["Source_ob"].isna()) |
        (merged_df["Source_gp"].isna())
    ]

    # Fill in missing values and drop unnecessary columns
    no_match["Source"] = no_match["Source_ob"].fillna("GP")
    no_match["onhold_amount"] = no_match["onhold_amount_ob"].fillna(no_match["onhold_amount_gp"])
    no_match.drop(columns=[
        "ID_ob", "ID_gp",
        "onhold_amount_ob", "onhold_amount_gp",
        "Source_ob", "Source_gp"
        ],
    inplace=True)

    # Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
    no_match = no_match[
        [ "Source", "contract_number", "onhold_amount"]
    ]

    return no_match


def get_not_full_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame):
    """
    Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns two new
    DataFrames. The first DataFrame contains the transactions that have a full match on both the OBT and GPT DataFrames,
    and the second DataFrame contains the transactions that do not have a full match.

    Args:
        obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT.
        gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT.

    Returns:
        tuple(pd.DataFrame, pd.DataFrame): A tuple of two DataFrames. The first DataFrame contains the transactions that
        have a full match on both the OBT and GPT DataFrames, and the second DataFrame contains the transactions that do
        not have a full match.
    """
    # Combine the two DataFrames using an outer join on the contract number and on-hold amount
    merged_df = pd.merge(
        obt_df, gpt_df,
        how="outer",
        on=["ID", "contract_number", "onhold_amount"],
        suffixes=("_ob", "_gp")
    )

    # Filter the merged DataFrame to include only the transactions that have a full match in both OBT and GPT
    full_matched = merged_df.dropna(subset=["Source_ob", "Source_gp"])
    full_matched.drop(columns=["Source_ob", "Source_gp"], inplace=True)

    # Create a boolean mask for the rows to drop in full_matched
    mask = merged_df["ID"].isin(full_matched["ID"])
    # Use the mask to remove the selected rows and create a new DataFrame for not full match
    not_full_match = merged_df[~mask]
    # This includes items that DO match contracts, but not amounts
    # It can have multiple items from one source with the same contract number

    # Create a new column with the data source, using OBT as the default and GPT as backup if missing
    not_full_match["Source"] = not_full_match["Source_ob"].fillna(not_full_match["Source_gp"])

    # Drop the redundant Source columns
    not_full_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)

    # Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
    not_full_match = not_full_match[
        [ "Source", "contract_number", "onhold_amount"]
    ]

    # Return the two DataFrames
    return full_matched, not_full_match


def get_contract_match(not_full_match: pd.DataFrame) -> pd.DataFrame:
    """
    Given a pandas DataFrame containing transactions that do not have a full match between OBT and GPT, this function
    returns a new DataFrame containing only the transactions that have a matching contract number in both OBT and GPT.

    Args:
        not_full_match (pd.DataFrame): A pandas DataFrame containing transactions that do not have a full match between
        OBT and GPT.

    Returns:
        pd.DataFrame: A pandas DataFrame containing only the transactions that have a matching contract number in both
        OBT and GPT.
    """
    # Filter the not_full_match DataFrame by source
    ob_df = not_full_match[not_full_match["Source"] == "OB"]
    gp_df = not_full_match[not_full_match["Source"] == "GP"]

    # Merge the two filtered DataFrames on the contract number
    contract_match = pd.merge(
        ob_df, gp_df,
        how="inner",
        on=["contract_number"],
        suffixes=("_ob", "_gp")
    )

    # Fill in missing values in the Source column and drop the redundant columns
    contract_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)

    # Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
    contract_match = contract_match[
        [ "contract_number", "onhold_amount_ob", "onhold_amount_gp"]
    ]

    return contract_match