You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
OnHoldReconciler/rec_lib.py

251 lines
10 KiB

import pandas as pd
from pandas import DataFrame
from datetime import datetime as dt
import datetime
import re
from typing import Literal
import logging
logger = logging.getLogger(__name__)
def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame:
"""
Given a DataFrame containing OnBase installation data and a dictionary containing the OnBase Excel configuration,
this function returns a DataFrame containing the rows from `onbase_df` that have an installation date that is before
the current date.
Args:
onbase_df (pd.DataFrame): A pandas DataFrame containing OnBase installation data.
onbase_excel_config (dict): A dictionary containing the OnBase Excel configuration.
Returns:
pd.DataFrame: A pandas DataFrame containing the rows from `onbase_df` that have an installation date that is before
the current date.
"""
id_col = onbase_excel_config["install_date"]
onbase_df[id_col] = pd.to_datetime(onbase_df[id_col])
onbase_df[id_col].fillna(pd.NaT, inplace=True)
return onbase_df[onbase_df[id_col].dt.date < datetime.date.today()]
def filter_gp(gp_dataframe: pd.DataFrame, full_config: dict) -> pd.DataFrame:
"""
Given a pandas DataFrame containing GP data and a dictionary containing the GP configuration, this function
filters out rows from the DataFrame that are not needed for further analysis based on certain criteria.
Args:
gp_dataframe (pd.DataFrame): A pandas DataFrame containing GP data.
gp_config (dict): A dictionary containing the GP configuration.
Returns:
pd.DataFrame: A pandas DataFrame containing the filtered GP data.
"""
# Excludes anything that contains cma with a space or digit following it
# CMA23532 would be excluded but 'John Locman' would be allowed
GOOD_PO_NUM = re.compile(r"^(?!.*cma(\s|\d)).*$", re.IGNORECASE)
gp_config: dict = full_config["ExcelColumns"]["GP"]
doc_num_regexes: list[str] = full_config["DocNumFilter"]
bad_doc_num = ''
rx : str
for rx in doc_num_regexes:
bad_doc_num += f"({rx})|"
bad_doc_num = re.compile(bad_doc_num[:-1], re.IGNORECASE)
logger.debug(f"Doc # filter: {bad_doc_num}")
# Create a filter/mask to use on the data
mask = (
(gp_dataframe[gp_config['doc_type']] == "Invoice") &
(gp_dataframe[gp_config['pur_order']].str.contains(GOOD_PO_NUM))
)
# Get the rows to drop based on the filter/mask
rows_to_drop = gp_dataframe[~mask].index
# Drop the rows and return the filtered DataFrame
filtered_df = gp_dataframe.drop(rows_to_drop, inplace=False)
mask = filtered_df[gp_config['doc_num']].str.contains(bad_doc_num)
rows_to_drop = filtered_df[mask].index
return filtered_df.drop(rows_to_drop, inplace=False)
def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"], excelConfig: dict):
"""
Given a pandas DataFrame containing transaction data, the source of the data ("GP" or "OB"), and a dictionary
containing the Excel configuration, this function creates a new DataFrame with columns for the contract number,
the amount on hold, a unique transaction ID, and the source of the data.
Args:
dataframe (pd.DataFrame): A pandas DataFrame containing transaction data.
source (Literal["GP", "OB"]): The source of the data ("GP" or "OB").
excelConfig (dict): A dictionary containing the Excel configuration.
Returns:
pd.DataFrame: A pandas DataFrame containing the contract number, amount on hold, transaction ID, and data source
for each transaction in the original DataFrame.
"""
column_config: dict = excelConfig[source]
logger.debug(f"column_config: {column_config}")
# Create a new DataFrame with the contract number and on-hold amount columns
transactions = dataframe[[column_config["contract_number"], column_config["onhold_amount"]]].copy()
# Rename the columns to standardize the column names
transactions.rename(columns={
column_config["contract_number"]: "contract_number",
column_config["onhold_amount"]: "onhold_amount",
}, inplace=True)
# Convert the on-hold amount column to float format and round to two decimal places
transactions["onhold_amount"] = transactions["onhold_amount"].astype(float).round(2)
# Use regex to extract the contract number from the column values and create a new column with the standardized format
CN_REGEX = re.compile(r"\d{7}(-\d{3})?")
transactions["contract_number"] = transactions["contract_number"].apply(
lambda cn: str(cn) if not re.search(CN_REGEX, str(cn))
else re.search(CN_REGEX, str(cn)).group(0)
)
# Create a new column with a unique transaction ID
transactions["ID"] = transactions["contract_number"] +'_'+\
transactions["onhold_amount"].astype(str)
# Create a new column with the data source
transactions["Source"] = source
# Return the new DataFrame with the contract number, on-hold amount, transaction ID, and data source columns
return transactions
def get_no_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame):
"""
Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns a new
DataFrame containing only the transactions that do not have a match in both the OBT and GPT DataFrames.
Args:
obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT.
gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT.
Returns:
pd.DataFrame: A pandas DataFrame containing the transactions that do not have a match in both the OBT and GPT
DataFrames.
"""
# Merge the two DataFrames using the contract number as the join key
merged_df = pd.merge(
obt_df, gpt_df,
how="outer",
on=["contract_number"],
suffixes=("_ob", "_gp")
)
# Filter the merged DataFrame to include only the transactions that do not have a match in both OBT and GPT
no_match = merged_df.loc[
(merged_df["Source_ob"].isna()) |
(merged_df["Source_gp"].isna())
]
# Fill in missing values and drop unnecessary columns
no_match["Source"] = no_match["Source_ob"].fillna("GP")
no_match["onhold_amount"] = no_match["onhold_amount_ob"].fillna(no_match["onhold_amount_gp"])
no_match.drop(columns=[
"ID_ob", "ID_gp",
"onhold_amount_ob", "onhold_amount_gp",
"Source_ob", "Source_gp"
],
inplace=True)
# Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
no_match = no_match[
[ "Source", "contract_number", "onhold_amount"]
]
return no_match
def get_not_full_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame):
"""
Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns two new
DataFrames. The first DataFrame contains the transactions that have a full match on both the OBT and GPT DataFrames,
and the second DataFrame contains the transactions that do not have a full match.
Args:
obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT.
gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT.
Returns:
tuple(pd.DataFrame, pd.DataFrame): A tuple of two DataFrames. The first DataFrame contains the transactions that
have a full match on both the OBT and GPT DataFrames, and the second DataFrame contains the transactions that do
not have a full match.
"""
# Combine the two DataFrames using an outer join on the contract number and on-hold amount
merged_df = pd.merge(
obt_df, gpt_df,
how="outer",
on=["ID", "contract_number", "onhold_amount"],
suffixes=("_ob", "_gp")
)
# Filter the merged DataFrame to include only the transactions that have a full match in both OBT and GPT
full_matched = merged_df.dropna(subset=["Source_ob", "Source_gp"])
full_matched.drop(columns=["Source_ob", "Source_gp"], inplace=True)
# Create a boolean mask for the rows to drop in full_matched
mask = merged_df["ID"].isin(full_matched["ID"])
# Use the mask to remove the selected rows and create a new DataFrame for not full match
not_full_match = merged_df[~mask]
# This includes items that DO match contracts, but not amounts
# It can have multiple items from one source with the same contract number
# Create a new column with the data source, using OBT as the default and GPT as backup if missing
not_full_match["Source"] = not_full_match["Source_ob"].fillna(not_full_match["Source_gp"])
# Drop the redundant Source columns
not_full_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)
# Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
not_full_match = not_full_match[
[ "Source", "contract_number", "onhold_amount"]
]
# Return the two DataFrames
return full_matched, not_full_match
def get_contract_match(not_full_match: pd.DataFrame) -> pd.DataFrame:
"""
Given a pandas DataFrame containing transactions that do not have a full match between OBT and GPT, this function
returns a new DataFrame containing only the transactions that have a matching contract number in both OBT and GPT.
Args:
not_full_match (pd.DataFrame): A pandas DataFrame containing transactions that do not have a full match between
OBT and GPT.
Returns:
pd.DataFrame: A pandas DataFrame containing only the transactions that have a matching contract number in both
OBT and GPT.
"""
# Filter the not_full_match DataFrame by source
ob_df = not_full_match[not_full_match["Source"] == "OB"]
gp_df = not_full_match[not_full_match["Source"] == "GP"]
# Merge the two filtered DataFrames on the contract number
contract_match = pd.merge(
ob_df, gp_df,
how="inner",
on=["contract_number"],
suffixes=("_ob", "_gp")
)
# Fill in missing values in the Source column and drop the redundant columns
contract_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)
# Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
contract_match = contract_match[
[ "contract_number", "onhold_amount_ob", "onhold_amount_gp"]
]
return contract_match