You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
251 lines
10 KiB
251 lines
10 KiB
import pandas as pd
|
|
from pandas import DataFrame
|
|
from datetime import datetime as dt
|
|
import datetime
|
|
import re
|
|
from typing import Literal
|
|
import logging
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame:
|
|
"""
|
|
Given a DataFrame containing OnBase installation data and a dictionary containing the OnBase Excel configuration,
|
|
this function returns a DataFrame containing the rows from `onbase_df` that have an installation date that is before
|
|
the current date.
|
|
|
|
Args:
|
|
onbase_df (pd.DataFrame): A pandas DataFrame containing OnBase installation data.
|
|
onbase_excel_config (dict): A dictionary containing the OnBase Excel configuration.
|
|
|
|
Returns:
|
|
pd.DataFrame: A pandas DataFrame containing the rows from `onbase_df` that have an installation date that is before
|
|
the current date.
|
|
"""
|
|
id_col = onbase_excel_config["install_date"]
|
|
onbase_df[id_col] = pd.to_datetime(onbase_df[id_col])
|
|
onbase_df[id_col].fillna(pd.NaT, inplace=True)
|
|
return onbase_df[onbase_df[id_col].dt.date < datetime.date.today()]
|
|
|
|
|
|
def filter_gp(gp_dataframe: pd.DataFrame, full_config: dict) -> pd.DataFrame:
|
|
"""
|
|
Given a pandas DataFrame containing GP data and a dictionary containing the GP configuration, this function
|
|
filters out rows from the DataFrame that are not needed for further analysis based on certain criteria.
|
|
|
|
Args:
|
|
gp_dataframe (pd.DataFrame): A pandas DataFrame containing GP data.
|
|
gp_config (dict): A dictionary containing the GP configuration.
|
|
|
|
Returns:
|
|
pd.DataFrame: A pandas DataFrame containing the filtered GP data.
|
|
"""
|
|
|
|
# Excludes anything that contains cma with a space or digit following it
|
|
# CMA23532 would be excluded but 'John Locman' would be allowed
|
|
GOOD_PO_NUM = re.compile(r"^(?!.*cma(\s|\d)).*$", re.IGNORECASE)
|
|
|
|
gp_config: dict = full_config["ExcelColumns"]["GP"]
|
|
doc_num_regexes: list[str] = full_config["DocNumFilter"]
|
|
|
|
bad_doc_num = ''
|
|
rx : str
|
|
for rx in doc_num_regexes:
|
|
bad_doc_num += f"({rx})|"
|
|
bad_doc_num = re.compile(bad_doc_num[:-1], re.IGNORECASE)
|
|
logger.debug(f"Doc # filter: {bad_doc_num}")
|
|
# Create a filter/mask to use on the data
|
|
mask = (
|
|
(gp_dataframe[gp_config['doc_type']] == "Invoice") &
|
|
(gp_dataframe[gp_config['pur_order']].str.contains(GOOD_PO_NUM))
|
|
)
|
|
|
|
# Get the rows to drop based on the filter/mask
|
|
rows_to_drop = gp_dataframe[~mask].index
|
|
|
|
# Drop the rows and return the filtered DataFrame
|
|
filtered_df = gp_dataframe.drop(rows_to_drop, inplace=False)
|
|
|
|
mask = filtered_df[gp_config['doc_num']].str.contains(bad_doc_num)
|
|
rows_to_drop = filtered_df[mask].index
|
|
|
|
return filtered_df.drop(rows_to_drop, inplace=False)
|
|
|
|
|
|
def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"], excelConfig: dict):
|
|
"""
|
|
Given a pandas DataFrame containing transaction data, the source of the data ("GP" or "OB"), and a dictionary
|
|
containing the Excel configuration, this function creates a new DataFrame with columns for the contract number,
|
|
the amount on hold, a unique transaction ID, and the source of the data.
|
|
|
|
Args:
|
|
dataframe (pd.DataFrame): A pandas DataFrame containing transaction data.
|
|
source (Literal["GP", "OB"]): The source of the data ("GP" or "OB").
|
|
excelConfig (dict): A dictionary containing the Excel configuration.
|
|
|
|
Returns:
|
|
pd.DataFrame: A pandas DataFrame containing the contract number, amount on hold, transaction ID, and data source
|
|
for each transaction in the original DataFrame.
|
|
"""
|
|
column_config: dict = excelConfig[source]
|
|
logger.debug(f"column_config: {column_config}")
|
|
# Create a new DataFrame with the contract number and on-hold amount columns
|
|
transactions = dataframe[[column_config["contract_number"], column_config["onhold_amount"]]].copy()
|
|
|
|
# Rename the columns to standardize the column names
|
|
transactions.rename(columns={
|
|
column_config["contract_number"]: "contract_number",
|
|
column_config["onhold_amount"]: "onhold_amount",
|
|
}, inplace=True)
|
|
|
|
# Convert the on-hold amount column to float format and round to two decimal places
|
|
transactions["onhold_amount"] = transactions["onhold_amount"].astype(float).round(2)
|
|
|
|
# Use regex to extract the contract number from the column values and create a new column with the standardized format
|
|
CN_REGEX = re.compile(r"\d{7}(-\d{3})?")
|
|
transactions["contract_number"] = transactions["contract_number"].apply(
|
|
lambda cn: str(cn) if not re.search(CN_REGEX, str(cn))
|
|
else re.search(CN_REGEX, str(cn)).group(0)
|
|
)
|
|
|
|
# Create a new column with a unique transaction ID
|
|
transactions["ID"] = transactions["contract_number"] +'_'+\
|
|
transactions["onhold_amount"].astype(str)
|
|
|
|
# Create a new column with the data source
|
|
transactions["Source"] = source
|
|
|
|
# Return the new DataFrame with the contract number, on-hold amount, transaction ID, and data source columns
|
|
return transactions
|
|
|
|
|
|
def get_no_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame):
|
|
"""
|
|
Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns a new
|
|
DataFrame containing only the transactions that do not have a match in both the OBT and GPT DataFrames.
|
|
|
|
Args:
|
|
obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT.
|
|
gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT.
|
|
|
|
Returns:
|
|
pd.DataFrame: A pandas DataFrame containing the transactions that do not have a match in both the OBT and GPT
|
|
DataFrames.
|
|
"""
|
|
# Merge the two DataFrames using the contract number as the join key
|
|
merged_df = pd.merge(
|
|
obt_df, gpt_df,
|
|
how="outer",
|
|
on=["contract_number"],
|
|
suffixes=("_ob", "_gp")
|
|
)
|
|
|
|
# Filter the merged DataFrame to include only the transactions that do not have a match in both OBT and GPT
|
|
no_match = merged_df.loc[
|
|
(merged_df["Source_ob"].isna()) |
|
|
(merged_df["Source_gp"].isna())
|
|
]
|
|
|
|
# Fill in missing values and drop unnecessary columns
|
|
no_match["Source"] = no_match["Source_ob"].fillna("GP")
|
|
no_match["onhold_amount"] = no_match["onhold_amount_ob"].fillna(no_match["onhold_amount_gp"])
|
|
no_match.drop(columns=[
|
|
"ID_ob", "ID_gp",
|
|
"onhold_amount_ob", "onhold_amount_gp",
|
|
"Source_ob", "Source_gp"
|
|
],
|
|
inplace=True)
|
|
|
|
# Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
|
|
no_match = no_match[
|
|
[ "Source", "contract_number", "onhold_amount"]
|
|
]
|
|
|
|
return no_match
|
|
|
|
|
|
def get_not_full_match(obt_df: pd.DataFrame, gpt_df: pd.DataFrame):
|
|
"""
|
|
Given two pandas DataFrames containing transaction data from OBT and GPT, respectively, this function returns two new
|
|
DataFrames. The first DataFrame contains the transactions that have a full match on both the OBT and GPT DataFrames,
|
|
and the second DataFrame contains the transactions that do not have a full match.
|
|
|
|
Args:
|
|
obt_df (pd.DataFrame): A pandas DataFrame containing transaction data from OBT.
|
|
gpt_df (pd.DataFrame): A pandas DataFrame containing transaction data from GPT.
|
|
|
|
Returns:
|
|
tuple(pd.DataFrame, pd.DataFrame): A tuple of two DataFrames. The first DataFrame contains the transactions that
|
|
have a full match on both the OBT and GPT DataFrames, and the second DataFrame contains the transactions that do
|
|
not have a full match.
|
|
"""
|
|
# Combine the two DataFrames using an outer join on the contract number and on-hold amount
|
|
merged_df = pd.merge(
|
|
obt_df, gpt_df,
|
|
how="outer",
|
|
on=["ID", "contract_number", "onhold_amount"],
|
|
suffixes=("_ob", "_gp")
|
|
)
|
|
|
|
# Filter the merged DataFrame to include only the transactions that have a full match in both OBT and GPT
|
|
full_matched = merged_df.dropna(subset=["Source_ob", "Source_gp"])
|
|
full_matched.drop(columns=["Source_ob", "Source_gp"], inplace=True)
|
|
|
|
# Create a boolean mask for the rows to drop in full_matched
|
|
mask = merged_df["ID"].isin(full_matched["ID"])
|
|
# Use the mask to remove the selected rows and create a new DataFrame for not full match
|
|
not_full_match = merged_df[~mask]
|
|
# This includes items that DO match contracts, but not amounts
|
|
# It can have multiple items from one source with the same contract number
|
|
|
|
# Create a new column with the data source, using OBT as the default and GPT as backup if missing
|
|
not_full_match["Source"] = not_full_match["Source_ob"].fillna(not_full_match["Source_gp"])
|
|
|
|
# Drop the redundant Source columns
|
|
not_full_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)
|
|
|
|
# Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
|
|
not_full_match = not_full_match[
|
|
[ "Source", "contract_number", "onhold_amount"]
|
|
]
|
|
|
|
# Return the two DataFrames
|
|
return full_matched, not_full_match
|
|
|
|
|
|
def get_contract_match(not_full_match: pd.DataFrame) -> pd.DataFrame:
|
|
"""
|
|
Given a pandas DataFrame containing transactions that do not have a full match between OBT and GPT, this function
|
|
returns a new DataFrame containing only the transactions that have a matching contract number in both OBT and GPT.
|
|
|
|
Args:
|
|
not_full_match (pd.DataFrame): A pandas DataFrame containing transactions that do not have a full match between
|
|
OBT and GPT.
|
|
|
|
Returns:
|
|
pd.DataFrame: A pandas DataFrame containing only the transactions that have a matching contract number in both
|
|
OBT and GPT.
|
|
"""
|
|
# Filter the not_full_match DataFrame by source
|
|
ob_df = not_full_match[not_full_match["Source"] == "OB"]
|
|
gp_df = not_full_match[not_full_match["Source"] == "GP"]
|
|
|
|
# Merge the two filtered DataFrames on the contract number
|
|
contract_match = pd.merge(
|
|
ob_df, gp_df,
|
|
how="inner",
|
|
on=["contract_number"],
|
|
suffixes=("_ob", "_gp")
|
|
)
|
|
|
|
# Fill in missing values in the Source column and drop the redundant columns
|
|
contract_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)
|
|
|
|
# Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
|
|
contract_match = contract_match[
|
|
[ "contract_number", "onhold_amount_ob", "onhold_amount_gp"]
|
|
]
|
|
|
|
return contract_match |