diff --git a/IL Formatter.py b/IL Formatter.py
index d3d741c..bf3094e 100644
--- a/IL Formatter.py
+++ b/IL Formatter.py
@@ -1,6 +1,5 @@
 from ui import Ui_MainWindow
 from errorDialog import ErrorDialog
-import ILParser
 from PyQt5 import QtWidgets
 from logging import debug, info, warning, exception as logException, error,DEBUG, INFO, WARNING, ERROR, basicConfig, getLogger
 from sys import argv
@@ -11,6 +10,9 @@
 from os import startfile
 from json import load, dump
 from time import sleep
+import ILParser
+
+
 # Open the config file, create a dict, and set up logging
 with open("config.json") as configFile:
     config: dict = load(configFile)
@@ -256,6 +258,7 @@ class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow):
         try:
             data: DataFrame = ILParser.extract_data(report, config["COLS"])
         except Exception as e:
+            self.processButton.setEnabled(False)
             logException(f"Failed to parse file-> {filePath} :\n{e}")
             open_error_dialog("Parsing Error:", f"Failed to parse file-> {filePath}", repr(e))
             return None
diff --git a/ILParser.py b/ILParser.py
index bd395f2..eded524 100644
--- a/ILParser.py
+++ b/ILParser.py
@@ -1,10 +1,17 @@
 from pandas import DataFrame
 import re
+from re import Match, Pattern
+from logging import getLogger, basicConfig
+from json import load, dump
+
+
+logger = getLogger(__name__)
+logger.setLevel("DEBUG")
 
 COLUMN_NAME_REGEX = re.compile(r"(?P<column_name>(\w|\.|#|\/)+)", re.IGNORECASE)
 
 
-def replace_bad_cols(line: str, cols: list[str]):
+def replace_bad_cols(line: str, cols: list[str]) -> str:
     """
     Replaces bad column names in a string with modified names that have spaces replaced with dots.
 
@@ -15,15 +22,29 @@ def replace_bad_cols(line: str, cols: list[str]):
     Returns:
         str: The modified string with bad column names replaced.
     """
+    logger.debug(f"Line: {line} | Cols: {cols}")
     for c in cols:
-        # Replace spaces with dots in the column name
-        gc = c.replace(' ', '.')
-        # Replace the bad column name with the modified column name in the string
-        line = line.replace(c, gc)
+
+        # Create a regex for the col
+        col_regex: Pattern = re.compile(c.replace(' ', r'(?:\s|\.)'))
+        logger.debug(f"Col_regex: {col_regex}")
+        # Get all columns that match that pattern
+        col_matches: list[str|tuple[str]] = re.findall(col_regex, line)
+        logger.debug(f"Col_matches: {col_matches}")
+        # Make the substitution for all matches, if any
+        col_name: str
+        for col_name in col_matches:
+
+            logger.debug(f"col_name: {col_name}")
+            # Replace the bad column name with the modified column name in the string.
+            # Adding the '.' instead of a space helps the parser tell what the continuous
+            # columns are
+            line = line.replace(col_name, col_name.replace(' ', '.'))
+
     return line
 
 
-def extract_data(input_doc: str, column_list: list[str]):
+def extract_data(input_doc: str, column_list: list[str]) -> DataFrame|None:
     """
     Extracts data from a string in a table-like format, where columns are identified by a list of column names, and returns the data as a Pandas DataFrame.
 
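Note (reviewer sketch): the reworked replace_bad_cols above compiles a small pattern per configured column, treating each space in the name as either whitespace or a dot, then rewrites every match with dots so a multi-word header reads as one token. A minimal standalone illustration of that substitution; the helper name, sample header, and sample column list below are made up for the example and are not part of the patch:

import re

def dot_join_columns(line: str, cols: list[str]) -> str:
    for c in cols:
        # A space in the configured name may appear as whitespace or a dot in the report header.
        pattern = re.compile(c.replace(' ', r'(?:\s|\.)'))
        for match_text in re.findall(pattern, line):
            # Re-join the multi-word name with dots so downstream parsing sees one token.
            line = line.replace(match_text, match_text.replace(' ', '.'))
    return line

header = "CUST ID      LATE CHRG RATE      CONTRACT NO"
print(dot_join_columns(header, ["CUST ID", "LATE CHRG RATE", "CONTRACT NO"]))
# -> CUST.ID      LATE.CHRG.RATE      CONTRACT.NO

Unlike the old literal str.replace, the findall pass also covers headers that carry variable digits or already use dots once the COLS entries are regex fragments (see the config.json change below).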
@@ -40,20 +61,45 @@ def extract_data(input_doc: str, column_list: list[str]):
     data = {}
     for line in input_doc.splitlines():
         if len(columns) == 0 :
+            logger.debug(f"Columns = 0: {line}")
             # Find the line that contains the column names and replace bad column names
             if re.search("^\w", line):
+                logger.debug("Found word on first line.")
                 line = replace_bad_cols(line, column_list)
+                logger.debug(f"Column replacements made: {line}")
                 # Find the start and end positions of each column name and store them in a dictionary
                 columns_names = re.finditer(COLUMN_NAME_REGEX, line)
+                logger.debug(f"Found column names: {columns_names}")
                 for c in columns_names:
                     columns[c.group("column_name")] = {"start": c.start(), "end": c.end()}
+                    logger.debug(f"Column section: {columns[c.group('column_name')]}")
                     data[c.group("column_name")] = []
            continue
        elif len(line) < 2:
+            logger.debug(f"Line len less than 2.")
            continue
        # Check if we've reached the end of the table and return the data
        if re.search("\d+ records listed", line):
+            logger.debug(f"End of document: {line}")
+            logger.debug(f"Extracted data: {data}")
            return DataFrame(data)
        # Extract the data from each column based on the start and end positions
        for key, span in columns.items():
-            data[key].append(line[span["start"]:span["end"]].strip())
\ No newline at end of file
+            data[key].append(line[span["start"]:span["end"]].strip())
+
+if __name__ == "__main__":
+
+    basicConfig(filename='ILParser.log', encoding='utf-8',
+                level="DEBUG", filemode='w', force=True)
+
+    def test_replace_bad_cols():
+
+        with open(r"Inputs\CUST_ISSUE") as c:
+            input: str = c.read()
+        with open("config.json") as configFile:
+            config: dict = load(configFile)
+        columns: list[str] = config["COLS"]
+
+        replace_bad_cols(input.splitlines()[1], columns)
+
+    test_replace_bad_cols()
\ No newline at end of file
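Note (reviewer sketch): with the logging added above, extract_data still parses the report as a fixed-width table — it records the start/end offset of each header token and slices every following row at those offsets, stopping at the "N records listed" trailer. A rough standalone illustration of that idea; the toy table is invented for the example, and the span detection uses a plain \S+ scan where the real code uses COLUMN_NAME_REGEX:

import re
from pandas import DataFrame

table = (
    "CUST.ID     CONTRACT.NO   LEASE.TYPE\n"
    "1001        C-1055        FMV\n"
    "1002        C-1056        $1 OUT\n"
    "2 records listed\n"
)

lines = table.splitlines()
# Column spans come from the header token positions on the first line.
spans = {m.group(0): (m.start(), m.end()) for m in re.finditer(r"\S+", lines[0])}
data = {name: [] for name in spans}
for row in lines[1:]:
    if re.search(r"\d+ records listed", row):
        break
    for name, (start, end) in spans.items():
        data[name].append(row[start:end].strip())

print(DataFrame(data))
# Two rows, one column per header token, mirroring how extract_data builds its DataFrame.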
diff --git a/config.json b/config.json
index 42ab9ab..8ac3329 100644
--- a/config.json
+++ b/config.json
@@ -1 +1 @@
-{"loggingLevel": "ERROR", "directories": {"ASSET": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "CUST": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "DOB": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "FIN": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "output": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs"}, "COLS": ["CUST ID", "CONTRACT NO", "BUSINESS TYPE", "FED ID", "CUST CREDIT ACCT", "CUSTOMER", "LEASE TYPE", "EQUIPMENT COST", "CBR", "NET INVESTMENT", "ANNUAL COMBINED IRR", "CONTRACT TERM", "INCOME START DATE", "FIRST PYMT DATE", "FIRST PYMT AMT", "CONTRACT PYMT", "INVOICE CODE", "INV DAYS", "INV DUE DAY", "SEC DEPOSIT", "IDC AMOUNTS", "IDC DATES", "RESIDUAL", "MANAGERS RESIDUAL", "PROMOTION", "PRODUCT LINE", "REGION", "REGION DESC", "BRANCH", "BUSINESS SEGMENT", "LEAD BANK", "MRKTNG REP", "MRKTNG REGION", "REMIT TO", "PYMT OPTION", "BANK CODE", "TAPE BANK NUM", "TAPE ACCOUNT NUM", "TAPE ACCT TYPE", "DEALER", "PRIVATE LABEL", "RESID METHOD", "LATE CHRG EXMPT", "INSURANCE CODE", "VARIABLE DATE", "VARIABLE RATE", "BILLING CYCLE", "UM USER DATE2", "CR ATTG PHONE", "GROSS CONTRACT", "ADV ", "PD AMT FINANCED", "PD INCOME START DATE", "INVOICE DESC", "VARIABLE PYMT CODE", "PD PAYMENT AMT", "QUOTE BUYOUT", "LATE CHARGE CODE", "LATE CHRG RATE", "M DEF COLLECTOR", "AM ACH LEAD DAYS", "UNL POOL", "PD RISK DATE", "PD RISK", "LGD RISK", "LGD DATE", "Service By Others", "CONTRACT NO", "CUST CREDIT ACCT", "CUST ID", "CUST NAME", "UATB CUST DBA", "UATB CUST ADDRESS1 45", "UATB CUST ADDRESS2 45", "UATB CUST ADDRESS3 45", "CUST CITY", "CUST STATE", "CUST ZIP", "GUAR CODE 1", "PRIN1/GUAR NAME 1", "PRIN1 ADD1", "PRIN1 ADD2", "PRIN1 CITY1", "PRIN1 ST 1", "ZIP 1", "FED ID/SS#1", "GUAR CODE 2 PRIN/GUAR NAME 2", "PRIN2 ADD2", "PRIN2 ADDR2", "PRIN2 CITY2", "PRIN2 ST 2ZIP 2", "FED ID/SS#2", "BILLING NAME", "UATB AR ADDRESS1 45", "UATB AR ADDRESS2 45", "UATB AR ADDRESS3 45", "AR CITY", "AR STATE", "AR ZIP", "AR ATTN", "UATB CR ATTG NAME40", "CR SCORING", "FACILITY SCORE", "SIC CODE", "ASSET #", "EQUIP DESC", "QUANTITY", "NEW USED", "MODEL", "A MANUFACTURER YEAR", "SERIAL NUMBER", "EQUIP CODE", "EQUIP CODE DESC", "ASSET VENDOR", "ASSET VENDOR NAME", "MANUFACTURER", "MANUFACT NAME", "UATB EQUIP ADDR1 45", "UATB EQUIP ADDR2 45", "EQUIP CITY", "EQUIP STATE", "EQUIP ZIP", "STATE TAX CODE", "CNTY TAX CODE", "CITY TAX CODE", "PROP STATUS", "EQUIP COST", "EQUIP COST PCT", "PUR OPTION", "PUR OPTION", "AS RECOURSE CODE", "RESID AMT", "BEG DEPR DATE", "OPER LS BEGIN DATE", "OPER LS LIM", "OPER LS SALVAGE", "PRIN/GUAR NAME 1", "DOB1", "GUAR CODE 2", "PRIN/GUAR NAME 2", "DOB2"]}
\ No newline at end of file
+{"loggingLevel": "ERROR", "directories": {"ASSET": "", "CUST": "", "DOB": "", "FIN": "", "output": ""}, "COLS": ["CUST ID", "CONTRACT NO", "BUSINESS TYPE", "FED ID", "CUST CREDIT ACCT", "CUSTOMER", "LEASE TYPE", "EQUIPMENT COST", "CBR", "NET INVESTMENT", "ANNUAL COMBINED IRR", "CONTRACT TERM", "INCOME START DATE", "FIRST PYMT DATE", "FIRST PYMT AMT", "CONTRACT PYMT", "INVOICE CODE", "INV DAYS", "INV DUE DAY", "SEC DEPOSIT", "IDC AMOUNTS", "IDC DATES", "RESIDUAL", "MANAGERS RESIDUAL", "PROMOTION", "PRODUCT LINE", "REGION", "REGION DESC", "BRANCH", "BUSINESS SEGMENT", "LEAD BANK", "MRKTNG REP", "MRKTNG REGION", "REMIT TO", "PYMT OPTION", "BANK CODE", "TAPE BANK NUM", "TAPE ACCOUNT NUM", "TAPE ACCT TYPE", "DEALER", "PRIVATE LABEL", "RESID METHOD", "LATE CHRG EXMPT", "INSURANCE CODE", "VARIABLE DATE", "VARIABLE RATE", "BILLING CYCLE", "UM USER DATE\\d?", "CR ATTG PHONE", "GROSS CONTRACT", "ADV ", "PD AMT FINANCED", "PD INCOME START DATE", "INVOICE DESC", "VARIABLE PYMT CODE", "PD PAYMENT AMT", "QUOTE BUYOUT", "LATE CHARGE CODE", "LATE CHRG RATE", "M DEF COLLECTOR", "AM ACH LEAD DAYS", "UNL POOL", "PD RISK DATE", "PD RISK", "LGD RISK", "LGD DATE", "Service By Others", "CONTRACT NO", "CUST CREDIT ACCT", "CUST ID", "CUST NAME", "UATB CUST DBA", "UATB CUST ADDRESS\\d \\d{2}", "CUST CITY", "CUST STATE", "CUST ZIP", "GUAR CODE \\d", "PRIN\\d?/GUAR NAME \\d", "PRIN\\d? ADDR?\\d", "PRIN\\d? CITY\\d", "PRIN\\d? ST \\d", "ZIP \\d", "FED ID/SS#\\d", "BILLING NAME", "UATB AR ADDRESS\\d \\d{2}", "AR CITY", "AR STATE", "AR ZIP", "AR ATTN", "UATB CR ATTG NAME\\d{2}", "CR SCORING", "FACILITY SCORE", "SIC CODE", "ASSET #", "EQUIP DESC", "QUANTITY", "NEW USED", "MODEL", "A MANUFACTURER YEAR", "SERIAL NUMBER", "EQUIP CODE", "EQUIP CODE DESC", "ASSET VENDOR", "ASSET VENDOR NAME", "MANUFACTURER", "MANUFACT NAME", "UATB EQUIP ADDR\\d \\d{2}", "EQUIP CITY", "EQUIP STATE", "EQUIP ZIP", "STATE TAX CODE", "CNTY TAX CODE", "CITY TAX CODE", "PROP STATUS", "EQUIP COST", "EQUIP COST PCT", "PUR OPTION", "PUR OPTION", "AS RECOURSE CODE", "RESID AMT", "BEG DEPR DATE", "OPER LS BEGIN DATE", "OPER LS LIM", "OPER LS SALVAGE", "PRIN/GUAR NAME \\d", "DOB\\d", "GUAR CODE \\d", "PRIN/GUAR NAME \\d"]}
\ No newline at end of file
diff --git a/todo.txt b/todo.txt
index 8044c60..3a6c2e9 100644
--- a/todo.txt
+++ b/todo.txt
@@ -1,2 +1,6 @@
+[ ] Allow automatic recognition of report type
+    [ ] Check type by columns
+    [ ] Remove individual selection
+[ ] Allow drag & drop (of multiselection)
 [ ] Notification on completion
 [ ] Icons
\ No newline at end of file
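Note (reviewer sketch): the COLS entries in config.json are now regular-expression fragments (for example "UATB CUST ADDRESS\\d \\d{2}" replaces the three literal ADDRESS1/2/3 names), so a malformed entry will only surface as a re.error the next time a report is parsed. A small, hypothetical sanity check — not part of this patch — that every pattern still compiles after the same space rewrite replace_bad_cols applies:

import re
from json import load

with open("config.json") as config_file:
    config = load(config_file)

bad = []
for col in config["COLS"]:
    try:
        # Mirror replace_bad_cols: each space becomes "whitespace or dot" before compiling.
        re.compile(col.replace(' ', r'(?:\s|\.)'))
    except re.error as exc:
        bad.append((col, exc))

if bad:
    for col, exc in bad:
        print(f"COLS entry does not compile: {col!r} -> {exc}")
else:
    print(f"{len(config['COLS'])} COLS patterns compile cleanly")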