Added doc number filtering to config, made it an exclusive rather than
inclusive filter

Document numbers no longer have to be just "HOLD" or a bare contract
number; instead, numbers matching any pattern in the new DocNumFilter
list are excluded.
master
3 years ago
parent 075a84133b
commit 8ffd24840e
Signed by untrusted user who does not match committer: gprog
GPG Key ID: 5BE9BB58D37713F8
  1. .gitignore (3 lines changed)
  2. Hold Reconciler.spec (50 lines changed)
  3. config.toml (19 lines changed)
  4. rec_lib.py (34 lines changed)
  5. reconcile_holds.py (20 lines changed)

.gitignore (vendored): 3 lines changed

@ -1,4 +1,7 @@
__pycache__/
venv/
work/
build/
dist/
*.log

Hold Reconciler.spec (new file): 50 lines changed

@ -0,0 +1,50 @@
# -*- mode: python ; coding: utf-8 -*-
block_cipher = None
a = Analysis(
['reconcile_holds.py'],
pathex=[],
binaries=[],
datas=[('config.toml', '.'), ('requirements.txt', '.')],
hiddenimports=['openpyxl'],
hookspath=[],
hooksconfig={},
runtime_hooks=[],
excludes=[],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(
pyz,
a.scripts,
[],
exclude_binaries=True,
name='Hold Reconciler',
debug=False,
bootloader_ignore_signals=False,
strip=False,
upx=True,
console=True,
disable_windowed_traceback=False,
argv_emulation=False,
target_arch=None,
codesign_identity=None,
entitlements_file=None,
)
coll = COLLECT(
exe,
a.binaries,
a.zipfiles,
a.datas,
strip=False,
upx=True,
upx_exclude=[],
name='Hold Reconciler',
)
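The spec bundles config.toml and requirements.txt via datas, so a frozen build looks for them wherever PyInstaller unpacks the app rather than in the source tree. A minimal sketch of how the bundled config could be located at runtime; resolve_bundled is illustrative and not part of this repo:

    import sys
    from pathlib import Path

    def resolve_bundled(name: str) -> Path:
        # PyInstaller sets sys.frozen (and usually sys._MEIPASS) on frozen
        # builds; in a plain source checkout, fall back to the script's folder.
        if getattr(sys, "frozen", False):
            base = Path(getattr(sys, "_MEIPASS", Path(sys.executable).parent))
        else:
            base = Path(__file__).parent
        return base / name

    config_path = resolve_bundled("config.toml")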

config.toml: 19 lines changed

@ -1,5 +1,15 @@
write_dir = "../Work"
write_dir = "Work"
DocNumFilter = [
"p(oin)?ts",
"pool",
"promo",
"o(ver)?f(und)?",
"m(ar)?ke?t",
"title",
"adj",
"reg free",
"cma"
]
[ExcelColumns]
[ExcelColumns.OB]
@ -14,6 +24,9 @@ write_dir = "../Work"
pur_order = "Purchase Order Number" # ABC123
doc_type = "Document Type" # Invoice or Credit Memo
[logger]
version = 1
@ -24,7 +37,7 @@ write_dir = "../Work"
[logger.handlers.console]
class = "logging.StreamHandler"
level = "INFO"
level = "DEBUG"
formatter = "custom"
stream = "ext://sys.stdout"
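write_dir, DocNumFilter, and the [logger] table all live in config.toml. main() below confirms the file is loaded with a tomllib-style load(); applying the logger table through logging.config.dictConfig is an assumption based on its shape (version = 1, a console StreamHandler, a "custom" formatter). A hedged sketch of that wiring:

    import logging.config
    import tomllib  # use the tomli backport on Python < 3.11

    with open("config.toml", "rb") as f:
        config = tomllib.load(f)

    # Presumed use of the [logger] table; not shown in this diff.
    logging.config.dictConfig(config["logger"])

    write_dir = config["write_dir"]       # "Work"
    doc_filters = config["DocNumFilter"]  # exclusion regexes for doc numbers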

rec_lib.py: 34 lines changed

@ -4,6 +4,11 @@ from datetime import datetime as dt
import datetime
import re
from typing import Literal
import logging
logger = logging.getLogger(__name__)
def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame:
"""
@ -25,7 +30,7 @@ def get_overdue(onbase_df: DataFrame, onbase_excel_config) -> DataFrame:
return onbase_df[onbase_df[id_col].dt.date < datetime.date.today()]
def filter_gp(gp_dataframe: pd.DataFrame, gp_config: dict) -> pd.DataFrame:
def filter_gp(gp_dataframe: pd.DataFrame, full_config: dict) -> pd.DataFrame:
"""
Given a pandas DataFrame containing GP data and a dictionary containing the GP configuration, this function
filters out rows from the DataFrame that are not needed for further analysis based on certain criteria.
@ -37,18 +42,23 @@ def filter_gp(gp_dataframe: pd.DataFrame, gp_config: dict) -> pd.DataFrame:
Returns:
pd.DataFrame: A pandas DataFrame containing the filtered GP data.
"""
# Regex used to filter unneeded transactions
# filters anything that does not contain a ONLY contract number OR
# The work hold or just hld
GOOD_DOC_NUM = re.compile(r"(^(\d+-?)+$)|(ho?ld)", re.IGNORECASE)
# Excludes anything that contains cma with a space or digit following it
# CMA23532 would be excluded but 'John Locman' would be allowed
GOOD_PO_NUM = re.compile(r"^(?!.*cma(\s|\d)).*$", re.IGNORECASE)
gp_config: dict = full_config["ExcelColumns"]["GP"]
doc_num_regexes: list[str] = full_config["DocNumFilter"]
bad_doc_num = ''
rx : str
for rx in doc_num_regexes:
bad_doc_num += f"({rx})|"
bad_doc_num = re.compile(bad_doc_num[:-1], re.IGNORECASE)
logger.debug(f"Doc # filter: {bad_doc_num}")
# Create a filter/mask to use on the data
mask = (
(gp_dataframe[gp_config['doc_type']] == "Invoice") &
(gp_dataframe[gp_config['doc_num']].str.contains(GOOD_DOC_NUM)) &
(gp_dataframe[gp_config['pur_order']].str.contains(GOOD_PO_NUM))
)
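The loop above concatenates each configured pattern into one alternation and trims the trailing pipe. An equivalent, slightly more idiomatic construction using the same patterns (purely illustrative, not the committed code):

    import re

    doc_num_regexes = ["p(oin)?ts", "pool", "promo", "o(ver)?f(und)?",
                       "m(ar)?ke?t", "title", "adj", "reg free", "cma"]

    # Join the patterns into a single alternation instead of building the
    # string with a trailing "|" and slicing it off.
    bad_doc_num = re.compile("|".join(f"({rx})" for rx in doc_num_regexes),
                             re.IGNORECASE)

    assert bad_doc_num.search("CMA23532")        # excluded
    assert bad_doc_num.search("Points adj 55")   # excluded
    assert not bad_doc_num.search("12345-01")    # kept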
@ -56,7 +66,12 @@ def filter_gp(gp_dataframe: pd.DataFrame, gp_config: dict) -> pd.DataFrame:
rows_to_drop = gp_dataframe[~mask].index
# Drop the rows and return the filtered DataFrame
return gp_dataframe.drop(rows_to_drop, inplace=False)
filtered_df = gp_dataframe.drop(rows_to_drop, inplace=False)
mask = filtered_df[gp_config['doc_num']].str.contains(bad_doc_num)
rows_to_drop = filtered_df[mask].index
return filtered_df.drop(rows_to_drop, inplace=False)
def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"], excelConfig: dict):
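With this change, filtering happens in two passes: the first mask keeps Invoice rows whose purchase order does not look like a CMA, and the second drops any remaining row whose document number hits one of the configured exclusion patterns. A tiny illustration with made-up data; the "Document Number" column name for doc_num is assumed, since that config value is not shown in this diff:

    import re
    import pandas as pd

    gp_config = {"doc_type": "Document Type", "doc_num": "Document Number",
                 "pur_order": "Purchase Order Number"}
    df = pd.DataFrame({
        "Document Type": ["Invoice", "Invoice", "Credit Memo"],
        "Document Number": ["12345-01", "PROMO 7", "67890"],
        "Purchase Order Number": ["ABC123", "ABC124", "ABC125"],
    })
    bad_doc_num = re.compile("(promo)|(cma)", re.IGNORECASE)
    good_po = re.compile(r"^(?!.*cma(\s|\d)).*$", re.IGNORECASE)

    keep = (
        (df[gp_config["doc_type"]] == "Invoice")
        & df[gp_config["pur_order"]].str.contains(good_po)
    )
    filtered = df[keep]
    # Second pass: drop doc numbers that match the exclusion list.
    filtered = filtered[~filtered[gp_config["doc_num"]].str.contains(bad_doc_num)]
    # Only the 12345-01 invoice survives.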
@ -75,7 +90,7 @@ def create_transaction_df(dataframe: pd.DataFrame, source: Literal["GP", "OB"],
for each transaction in the original DataFrame.
"""
column_config: dict = excelConfig[source]
logger.debug(f"column_config: {column_config}")
# Create a new DataFrame with the contract number and on-hold amount columns
transactions = dataframe[[column_config["contract_number"], column_config["onhold_amount"]]].copy()
@ -226,12 +241,11 @@ def get_contract_match(not_full_match: pd.DataFrame) -> pd.DataFrame:
)
# Fill in missing values in the Source column and drop the redundant columns
contract_match["Source"] = contract_match["Source_ob"].fillna("GP")
contract_match.drop(columns=["Source_ob", "Source_gp"], inplace=True)
# Reorder and return the new DataFrame with the source, contract number, and on-hold amount columns
contract_match = contract_match[
[ "Source", "contract_number", "onhold_amount_ob", "onhold_amount_gp"]
[ "contract_number", "onhold_amount_ob", "onhold_amount_gp"]
]
return contract_match
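The matching step above relies on an earlier merge that suffixes the OB and GP columns with _ob/_gp and then backfills Source from the GP side. How not_full_match is built is not shown in this diff; a condensed sketch assuming a suffixed outer merge, with invented values:

    import pandas as pd

    ob = pd.DataFrame({"contract_number": ["C1", "C2"],
                       "onhold_amount": [100.0, 250.0], "Source": "OB"})
    gp = pd.DataFrame({"contract_number": ["C1", "C3"],
                       "onhold_amount": [100.0, 75.0], "Source": "GP"})

    # Outer merge on contract number, keeping both amounts side by side.
    m = ob.merge(gp, on="contract_number", how="outer", suffixes=("_ob", "_gp"))
    m["Source"] = m["Source_ob"].fillna("GP")
    m = m.drop(columns=["Source_ob", "Source_gp"])
    m = m[["contract_number", "onhold_amount_ob", "onhold_amount_gp"]]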

reconcile_holds.py: 20 lines changed

@ -94,7 +94,7 @@ def check_sheet(df_cols: list[str], excel_col_config: dict) -> bool:
return all([col in df_cols for col in required_cols])
def get_dataframes(excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|None]:
def get_dataframes(work_dir: str, excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|None]:
"""
Given a dictionary of Excel configuration options, this function searches for the most recently modified GP and OB
Excel files in a "Work" folder and returns their corresponding dataframes.
@ -105,22 +105,15 @@ def get_dataframes(excelConfig: dict) -> tuple[pd.DataFrame|None, pd.DataFrame|N
Returns:
tuple[pd.DataFrame|None, pd.DataFrame|None]: A tuple containing the OB and GP dataframes, respectively.
"""
# Get the current working directory and the path to the "Work" folder
current_dir: Path = Path(os.getcwd())
work_folder: Path = current_dir / 'Work'
logger.debug(f"Workpath: {work_folder}")
# Check that the "Work" folder exists
assert work_folder.exists, "No work folder found!"
# Define regular expression patterns to match the GP and OB Excel files
gp_regex: Pattern = re.compile(".*gp.*\.xlsx$", re.IGNORECASE)
ob_regex: Pattern = re.compile(".*ob.*\.xlsx$", re.IGNORECASE)
# Find the paths of the most recently modified GP and OB Excel files
gp_file_path = find_most_recent_file(work_folder, gp_regex)
gp_file_path = find_most_recent_file(work_dir, gp_regex)
logger.debug(f"gp_file_path: {gp_file_path}")
ob_file_path = find_most_recent_file(work_folder, ob_regex)
ob_file_path = find_most_recent_file(work_dir, ob_regex)
logger.debug(f"gp_file_path: {ob_file_path}")
# Read the GP and OB Excel files into dataframes and check that each dataframe has the required columns
@ -156,16 +149,17 @@ def main() -> int:
# Read the configuration options from a TOML file
with open("config.toml", "rb") as f:
config_dict: dict = load(f)
logger.debug(f"Config: {config_dict}")
excelConfig: dict = config_dict["ExcelColumns"]
# Get the GP and OB dataframes from the Excel files
ob_df, gp_df = get_dataframes(excelConfig)
ob_df, gp_df = get_dataframes(config_dict["write_dir"], excelConfig)
assert not ob_df.empty, "OB Data empty!"
assert not gp_df.empty, "GP Data empty!"
# Filter the GP dataframe to include only relevant transactions
fgp_df: DataFrame = filter_gp(gp_df, excelConfig["GP"])
fgp_df: DataFrame = filter_gp(gp_df, config_dict)
# Get the overdue transactions from the OB dataframe
overdue: DataFrame = get_overdue(ob_df, excelConfig["OB"])
@ -181,7 +175,7 @@ def main() -> int:
only_contracts_match: DataFrame = get_contract_match(not_full_match)
# Write the results to a new Excel file
with pd.ExcelWriter(f"{config_dict['work_dir']}/Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx", mode='w') as writer:
with pd.ExcelWriter(f"{config_dict['write_dir']}/Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx", mode='w') as writer:
full_match.to_excel(writer,sheet_name="FULL", index=False)
no_match.to_excel(writer, sheet_name="No Match", index=False)
only_contracts_match.to_excel(writer, sheet_name="Amount Mismatch", index=False)
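Because write_dir now doubles as the folder searched by get_dataframes and the folder the report is written to, the output path is built with a plain f-string. A pathlib-based sketch of the same path, reusing names from main() above (illustrative only), also makes it easy to create the folder if it is missing:

    from pathlib import Path
    from datetime import datetime as dt

    out_path = Path(config_dict["write_dir"]) / (
        f"Reconciled Holds [{dt.now().strftime('%m-%d-%Y')}].xlsx"
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)  # ensure Work/ exists
    with pd.ExcelWriter(out_path, mode="w") as writer:
        full_match.to_excel(writer, sheet_name="FULL", index=False)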
