You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
OnHoldReconciler/reports.py

188 lines
6.1 KiB

from pandas import DataFrame, merge, to_datetime, NaT
from numpy import concatenate
from abc import ABC, abstractmethod
from logging import getLogger
import re
from typing import Literal
import datetime
from helpers import CN_REGEX
logger = getLogger(__name__)
class HoldReport(ABC):
    """Base class for a system-specific on-hold transaction report.

    Subclasses set ``source`` to a short tag identifying the originating
    system (e.g. "OB", "GP") and may pre-filter the dataframe before
    delegating to ``super().__init__``.
    """

    # Short tag for the originating system; overridden by each subclass.
    source = ""

    def __init__(self, dataframe: DataFrame, reports_config: dict) -> None:
        """Store the report dataframe and config, then normalize in place."""
        self.config = reports_config
        self.df = dataframe
        self._normalize()

    def _normalize(self) -> None:
        """Standardize column names, types, and derived columns in ``self.df``."""
        # Map this source's column names onto the shared column names.
        self.df.rename(
            columns={
                unique_cols[self.source]: common_col
                for common_col, unique_cols in self.config["shared_columns"].items()
            },
            inplace=True,
        )
        # Normalize the on-hold amount to a float rounded to cents.
        self.df["onhold_amount"] = self.df["onhold_amount"].astype(float).round(2)

        def _extract_contract_number(raw) -> str:
            # Search once (the original searched twice per value); fall back
            # to the raw value as a string when the pattern does not match.
            match = re.search(CN_REGEX, str(raw))
            return match.group(0) if match else str(raw)

        self.df["contract_number"] = self.df["contract_number"].apply(
            _extract_contract_number
        )
        # Build a per-transaction ID from the contract number and amount.
        self.df["ID"] = (
            self.df["contract_number"] + '_' + self.df["onhold_amount"].astype(str)
        )
        # Tag every row with the originating system.
        self.df["Source"] = self.source

    def _get_no_match(self, other: 'HoldReport') -> DataFrame:
        """Return transactions present in only one of the two reports."""
        # Outer-join on contract number so unmatched rows from either side
        # survive with NaNs in the other side's suffixed columns.
        outer_merge = merge(
            self.df, other.df,
            how="outer",
            on=["contract_number"],
            suffixes=('_' + self.source, '_' + other.source)
        )
        # Keep only rows missing from one side. ``.copy()`` prevents pandas
        # SettingWithCopy warnings on the column assignments below.
        no_match = outer_merge.loc[
            (outer_merge[f"Source_{self.source}"].isna()) |
            (outer_merge[f"Source_{other.source}"].isna())
        ].copy()
        # Coalesce the per-source columns into single shared columns.
        # (Was hard-coded to "GP"; use other.source so any pairing works.)
        no_match["Source"] = no_match[f"Source_{self.source}"].fillna(other.source)
        no_match["onhold_amount"] = no_match[f"onhold_amount_{self.source}"].fillna(
            no_match[f"onhold_amount_{other.source}"]
        )
        no_match["vendor_name"] = no_match[f"vendor_name_{self.source}"].fillna(
            no_match[f"vendor_name_{other.source}"]
        )
        return no_match

    def _get_contract_matches(self, other: 'HoldReport') -> DataFrame:
        """Return transactions whose contract number appears in both reports."""
        # Inner-join keeps only contract numbers present on both sides.
        contract_match = merge(
            self.df, other.df,
            how="inner",
            on=["contract_number"],
            suffixes=('_' + self.source, '_' + other.source)
        )
        # Prefer this source's vendor name; fall back to the other side's.
        contract_match["vendor_name"] = contract_match[f"vendor_name_{self.source}"].fillna(
            contract_match[f"vendor_name_{other.source}"]
        )
        return contract_match

    @staticmethod
    def _add_work_columns(df: DataFrame) -> DataFrame:
        """
        Add empty columns to the dataframe to facilitate working through the report.
        """
        WORK_COLS = ["Resolution", "Notes"]
        for col in WORK_COLS:
            df[col] = ''
        return df

    def reconcile(self, other: 'HoldReport') -> tuple[DataFrame, DataFrame]:
        """Reconcile this report against ``other``.

        Returns:
            A ``(no_match, amount_mismatch)`` pair: rows found in only one
            report, and rows whose contract number matched in both reports.
        """
        no_match: DataFrame = self._get_no_match(other)
        # NOTE(review): debug artifacts -- these spreadsheets are written to
        # the working directory on every run; confirm they are still wanted.
        no_match.to_excel("NOMATCH.xlsx")
        logger.debug(f"No_match: {no_match}")
        amount_mismatch: DataFrame = self._get_contract_matches(other)
        amount_mismatch.to_excel("AMTMM.xlsx")
        # Bug fix: was logging no_match under the mismatch label.
        logger.debug(f"amt_mismatch: {amount_mismatch}")
        # Select and reorder the output columns. Bug fix: list.extend()
        # returns None, so the original ["Source"].extend(...) indexed the
        # frame with None; concatenate the lists with ``+`` instead.
        no_match = no_match[["Source"] + self.config["output_columns"]]
        no_match = self._add_work_columns(no_match)
        amount_mismatch = amount_mismatch[self.config["output_columns"]]
        amount_mismatch = self._add_work_columns(amount_mismatch)
        return no_match, amount_mismatch
class OnBaseReport(HoldReport):
    """Hold report sourced from the OnBase (OB) system."""

    source = "OB"

    def get_overdue(self) -> DataFrame:
        """Return the rows whose install date is strictly before today.

        Rows with a missing or unparseable install date are excluded,
        because NaT compares False against any date.
        """
        # Coerce to datetime in place; missing values become NaT, so the
        # original follow-up ``fillna(NaT, inplace=True)`` was a no-op (and
        # a chained-inplace call that modern pandas warns about) -- removed.
        self.df["install_date"] = to_datetime(self.df["install_date"])
        return self.df[self.df["install_date"].dt.date < datetime.date.today()]
class GreatPlainsReport(HoldReport):
    """Hold report sourced from the Great Plains (GP) system.

    Rows are pre-filtered against the configured document-number and
    purchase-order patterns before the shared normalization runs.
    """

    source = "GP"
    # NOTE(review): appears unused within this class and "filted" looks
    # like a typo for "filtered" -- confirm against the rest of the file
    # before removing.
    filted_df: bool = False

    def __init__(self, dataframe: DataFrame, report_config: dict) -> None:
        """Filter the raw GP export in place, then normalize it."""
        self._filter(
            gp_report_df=dataframe,
            doc_num_filters=report_config["gp_filters"]["doc_num_filters"],
            good_po_num_regex=report_config["gp_filters"]["po_filter"]
        )
        super().__init__(dataframe, report_config)

    @staticmethod
    def _filter(gp_report_df: DataFrame,
                doc_num_filters: list[str], good_po_num_regex: str) -> DataFrame:
        """Drop irrelevant rows from ``gp_report_df`` in place.

        Keeps only "Invoice" rows whose purchase-order number matches
        ``good_po_num_regex``, then removes rows whose document number
        matches any pattern in ``doc_num_filters``. Returns the same
        (mutated) dataframe for convenience.
        """
        GOOD_PO_NUM = re.compile(good_po_num_regex, re.IGNORECASE)
        # Keep only invoices with an acceptable PO number. ``na=False``
        # treats missing PO numbers as non-matching instead of producing
        # NaN in the mask, which breaks boolean indexing.
        keep_mask = (
            (gp_report_df["Document Type"] == "Invoice") &
            (gp_report_df["Purchase Order Number"].str.contains(GOOD_PO_NUM, na=False))
        )
        # Drop the rows that DO NOT fit the keep mask.
        gp_report_df.drop(gp_report_df[~keep_mask].index, inplace=True)
        # Remove rows whose document number matches any exclusion pattern.
        # Guard against an empty filter list: compiling the empty pattern
        # would match every row and drop the whole report. Folding the
        # negative into keep_mask is more trouble than it's worth.
        if doc_num_filters:
            bad_doc_num = re.compile(
                "|".join(f"({rx})" for rx in doc_num_filters), re.IGNORECASE
            )
            remove_mask = gp_report_df["Document Number"].str.contains(bad_doc_num, na=False)
            gp_report_df.drop(gp_report_df[remove_mask].index, inplace=True)
        return gp_report_df