Added doc number filtering to config; made it an exclusive rather than

inclusive filter. No longer just HOLD or only number.
3 years ago · 8ffd24840e
parent 075a84133b
commit 8ffd24840e
5 changed files with 100 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,7 @@
 __pycache__/
 venv/
 work/
 build/
 dist/
 *.log
--- a/Reconciler.spec
+++ b/Reconciler.spec
@ -0,0 +1,50 @@
 # -*- mode: python ; coding: utf-8 -*-
 block_cipher = None
 a = Analysis(
    ['reconcile_holds.py'],
    pathex=[],
    binaries=[],
    datas=[('config.toml', '.'), ('requirements.txt', '.')],
    hiddenimports=['openpyxl'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
 )
 pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
 exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='Hold Reconciler',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
 )
 coll = COLLECT(
    exe,
    a.binaries,
    a.zipfiles,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='Hold Reconciler',
 )
--- a/config.toml
+++ b/config.toml
@ -1,5 +1,15 @@
-write_dir = "../Work"
+write_dir = "Work"
-
+DocNumFilter = [
    "p(oin)?ts",
    "pool",
    "promo",
    "o(ver)?f(und)?",
    "m(ar)?ke?t",
    "title",
    "adj",
    "reg free",
    "cma"
 ]
 [ExcelColumns]
    [ExcelColumns.OB]
@ -14,6 +24,9 @@ write_dir = "../Work"
    pur_order = "Purchase Order Number" # ABC123
    doc_type = "Document Type" # Invoice or Credit Memo
 [logger]
    version = 1
@ -24,7 +37,7 @@ write_dir = "../Work"
    [logger.handlers.console]
    class = "logging.StreamHandler"
-    level = "INFO"
+    level = "DEBUG"
    formatter = "custom"
    stream = "ext://sys.stdout"
--- a/rec_lib.py
+++ b/rec_lib.py
@ -4,6 +4,11 @@ from datetime import datetime as dt
 import datetime
 import re
 from typing import Literal
 import logging
 logger = logging.getLogger(__name__)
 def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame:
    """
@ -25,7 +30,7 @@ def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame:
    return  onbase_df[onbase_df[id_col].dt.date < datetime.date.today()]
-def filter_gp(gp_dataframe: pd.DataFrame, gp_config: dict) -> pd.DataFrame:
+def filter_gp(gp_dataframe: pd.DataFrame, full_config: dict) -> pd.DataFrame:
    """
    Given a pandas DataFrame containing GP data and a dictionary containing the GP configuration, this function
    filters out rows from the DataFrame that are not needed for further analysis based on certain criteria.
@ -37,18 +42,23 @@ def filter_gp(gp_dataframe: pd.DataFrame, gp_config: dict) -> pd.DataFrame:
    Returns:
        pd.DataFrame: A pandas DataFrame containing the filtered GP data.
    """
-    # Regex used to filter unneeded transactions
+
    # Filters out anything that is not ONLY a contract number and does not
    # contain the word hold or just hld
    GOOD_DOC_NUM = re.compile(r"(^(\d+-?)+$)|(ho?ld)", re.IGNORECASE)
    # Excludes anything that contains cma with a space or digit following it
    # CMA23532 would be excluded but 'John Locman' would be allowed
    GOOD_PO_NUM = re.compile(r"^(?!.*cma(\s|\d)).*$", re.IGNORECASE)
    gp_config: dict = full_config["ExcelColumns"]["GP"]
    doc_num_regexes: list[str] = full_config["DocNumFilter"]
    bad_doc_num = ''
    rx : str
    for rx in doc_num_regexes:
        bad_doc_num += f"({rx})|"
    bad_doc_num = re.compile(bad_doc_num[:-1], re.IGNORECASE)
    logger.debug(f"Doc # filter: {bad_doc_num}")
    # Create a filter/mask to use on the data
    mask = (
        (gp_dataframe[gp_config['doc_type']] == "Invoice") &
        (gp_dataframe[gp_config['doc_num']].str.contains(GOOD_DOC_NUM)) &
        (gp_dataframe[gp_config['pur_order']].str.contains(GOOD_PO_NUM))
    )
@ -56,7 +66,12 @@ def filter_gp(gp_dataframe: pd.DataFrame, gp_config: dict) -> pd.DataFrame:
    rows_to_drop = gp_dataframe[~mask].index
    # Drop the rows and return the filtered DataFrame
-    return gp_dataframe.drop(rows_to_drop, inplace=False)
+    filtered_df = gp_dataframe.drop(rows_to_drop, inplace=False)
    mask = filtered_df[gp_config['doc_num']].str.contains(bad_doc_num)
    rows_to_drop = filtered_df[mask].index
    return filtered_df.drop(rows_to_drop, inplace=False)
 def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"], excelConfig: dict):
@ -75,7 +90,7 @@ def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"],
        for each transaction in the original DataFrame.
    """
    column_config: dict = excelConfig[source]
-
+    logger.debug(f"column_config: {column_config}")
    # Create a new DataFrame with the contract number and on-hold amount columns
    transactions = dataframe[[column_config["contract_number"], column_config["onhold_amount"]]].copy()
@ -226,12 +241,11 @@ def get_contract_match(not_full_match: pd.DataFrame) -> pd.DataFrame:
    )
    # Fill in missing values in the Source column and drop the redundant columns
    contract_match["Source"] = contract_match["Source_ob"].fillna("GP")
    contract_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)
    # Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
    contract_match = contract_match[
-        [ "Source", "contract_number", "onhold_amount_ob", "onhold_amount_gp"]
+        [ "contract_number", "onhold_amount_ob", "onhold_amount_gp"]
    ]
    return contract_match
--- a/reconcile_holds.py
+++ b/reconcile_holds.py
@ -94,7 +94,7 @@ def check_sheet(df_cols: list[str], excel_col_config: dict) -> bool:
    return all([col in df_cols for col in required_cols])
-def get_dataframes(excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|None]:
+def get_dataframes(work_dir: str, excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|None]:
    """
    Given a dictionary of Excel configuration options, this function searches for the most recently modified GP and OB
    Excel files in a "Work" folder and returns their corresponding dataframes.
@ -105,22 +105,15 @@ def get_dataframes(excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|N
    Returns:
        tuple[pd.DataFrame|None, pd.DataFrame|None]: A tuple containing the OB and GP dataframes, respectively.
    """
    # Get the current working directory and the path to the "Work" folder
    current_dir: Path = Path(os.getcwd())
    work_folder: Path = current_dir / 'Work'
    logger.debug(f"Workpath: {work_folder}")
    # Check that the "Work" folder exists
    assert work_folder.exists, "No work folder found!"
    # Define regular expression patterns to match the GP and OB Excel files
    gp_regex: Pattern = re.compile(".*gp.*\.xlsx$", re.IGNORECASE)
    ob_regex: Pattern = re.compile(".*ob.*\.xlsx$", re.IGNORECASE)
    # Find the paths of the most recently modified GP and OB Excel files
-    gp_file_path = find_most_recent_file(work_folder, gp_regex)
+    gp_file_path = find_most_recent_file(work_dir, gp_regex)
    logger.debug(f"gp_file_path: {gp_file_path}")
-    ob_file_path = find_most_recent_file(work_folder, ob_regex)
+    ob_file_path = find_most_recent_file(work_dir, ob_regex)
    logger.debug(f"gp_file_path: {ob_file_path}")
    # Read the GP and OB Excel files into dataframes and check that each dataframe has the required columns
@ -156,16 +149,17 @@ def main() -> int:
    # Read the configuration options from a TOML file
    with open("config.toml", "rb") as f:
        config_dict: dict = load(f)
    logger.debug(f"Config: {config_dict}")
    excelConfig: dict = config_dict["ExcelColumns"]
    # Get the GP and OB dataframes from the Excel files
-    ob_df, gp_df = get_dataframes(excelConfig)
+    ob_df, gp_df = get_dataframes(config_dict["write_dir"] ,excelConfig)
    assert not ob_df.empty, "OB Data empty!"
    assert not gp_df.empty, "GP Data empty!"
    # Filter the GP dataframe to include only relevant transactions
-    fgp_df: DataFrame = filter_gp(gp_df, excelConfig["GP"])
+    fgp_df: DataFrame = filter_gp(gp_df, config_dict)
    # Get the overdue transactions from the OB dataframe
    overdue: DataFrame = get_overdue(ob_df, excelConfig["OB"])
@ -181,7 +175,7 @@ def main() -> int:
    only_contracts_match: DataFrame = get_contract_match(not_full_match)
    # Write the results to a new Excel file
-    with pd.ExcelWriter(f"{config_dict['work_dir']}/Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx", mode='w') as writer:
+    with pd.ExcelWriter(f"{config_dict['write_dir']}/Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx", mode='w') as writer:
        full_match.to_excel(writer,sheet_name="FULL", index=False)
        no_match.to_excel(writer, sheet_name="No Match", index=False)
        only_contracts_match.to_excel(writer, sheet_name="Amount Mismatch", index=False)