Reworked ILParser. The col name config will now be processed as regex.

This adds much more flexibility.
master
= 3 years ago
parent 51edca51cf
commit c6c785c923
Signed by untrusted user who does not match committer: gprog
GPG Key ID: 5BE9BB58D37713F8
  1. 5
      IL Formatter.py
  2. 60
      ILParser.py
  3. 2
      config.json
  4. 4
      todo.txt

@ -1,6 +1,5 @@
from ui import Ui_MainWindow
from errorDialog import ErrorDialog
import ILParser
from PyQt5 import QtWidgets
from logging import debug, info, warning, exception as logException, error,DEBUG, INFO, WARNING, ERROR, basicConfig, getLogger
from sys import argv
@ -11,6 +10,9 @@ from os import startfile
from json import load, dump
from time import sleep
import ILParser
# Open the config file, create a dict, and set up logging
with open("config.json") as configFile:
config: dict = load(configFile)
@ -256,6 +258,7 @@ class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow):
try:
data: DataFrame = ILParser.extract_data(report, config["COLS"])
except Exception as e:
self.processButton.setEnabled(False)
logException(f"Failed to parse file-> {filePath} :\n{e}")
open_error_dialog("Parsing Error:", f"Failed to parse file-> {filePath}", repr(e))
return None

@ -1,10 +1,17 @@
from pandas import DataFrame
import re
from re import Match, Pattern
from logging import getLogger, basicConfig
from json import load, dump
logger = getLogger(__name__)
logger.setLevel("DEBUG")
COLUMN_NAME_REGEX = re.compile(r"(?P<column_name>(\w|\.|#|\/)+)", re.IGNORECASE)
def replace_bad_cols(line: str, cols: list[str]):
def replace_bad_cols(line: str, cols: list[str]) -> str:
"""
Replaces bad column names in a string with modified names that have spaces replaced with dots.
@ -15,15 +22,29 @@ def replace_bad_cols(line: str, cols: list[str]):
Returns:
str: The modified string with bad column names replaced.
"""
logger.debug(f"Line: {line} | Cols: {cols}")
for c in cols:
# Replace spaces with dots in the column name
gc = c.replace(' ', '.')
# Replace the bad column name with the modified column name in the string
line = line.replace(c, gc)
# Create a regex for the col
col_regex: Pattern = re.compile(c.replace(' ', r'(?:\s|\.)'))
logger.debug(f"Col_regex: {col_regex}")
# Get all columns that match that pattern
col_matches: list[str|tuple[str]] = re.findall(col_regex, line)
logger.debug(f"Col_matches: {col_matches}")
# Make the substitution for all matches, if any
col_name: str
for col_name in col_matches:
logger.debug(f"col_name: {col_name}")
# Replace the bad column name with the modified column name in the string
# Using a '.' instead of a space helps the parser tell what the continuous
# columns are
line = line.replace(col_name, col_name.replace(' ', '.'))
return line
def extract_data(input_doc: str, column_list: list[str]):
def extract_data(input_doc: str, column_list: list[str]) -> DataFrame|None:
"""
Extracts data from a string in a table-like format, where columns are identified by a list of column names, and
returns the data as a Pandas DataFrame.
@ -40,20 +61,45 @@ def extract_data(input_doc: str, column_list: list[str]):
data = {}
for line in input_doc.splitlines():
if len(columns) == 0 :
logger.debug(f"Columns = 0: {line}")
# Find the line that contains the column names and replace bad column names
if re.search("^\w", line):
logger.debug("Found word on first line.")
line = replace_bad_cols(line, column_list)
logger.debug(f"Column replacements made: {line}")
# Find the start and end positions of each column name and store them in a dictionary
columns_names = re.finditer(COLUMN_NAME_REGEX, line)
logger.debug(f"Found column names: {columns_names}")
for c in columns_names:
columns[c.group("column_name")] = {"start": c.start(), "end": c.end()}
logger.debug(f"Column section: {columns[c.group('column_name')]}")
data[c.group("column_name")] = []
continue
elif len(line) < 2:
logger.debug(f"Line len less than 2.")
continue
# Check if we've reached the end of the table and return the data
if re.search("\d+ records listed", line):
logger.debug(f"End of document: {line}")
logger.debug(f"Extracted data: {data}")
return DataFrame(data)
# Extract the data from each column based on the start and end positions
for key, span in columns.items():
data[key].append(line[span["start"]:span["end"]].strip())
data[key].append(line[span["start"]:span["end"]].strip())
if __name__ == "__main__":
    # When run directly as a script, send all log output (DEBUG and up) to a
    # fresh ILParser.log file, overriding any logging configuration already set.
    basicConfig(filename='ILParser.log', encoding='utf-8',
                level="DEBUG", filemode='w', force=True)

    def test_replace_bad_cols():
        """Run replace_bad_cols on the second line of a sample report.

        Manual smoke test: reads the sample report from Inputs\\CUST_ISSUE and
        the column patterns from the "COLS" entry of config.json, then passes
        the report's second line through replace_bad_cols. The return value is
        discarded; inspect ILParser.log for the debug output.
        """
        # Raw string: "\C" is not a valid escape sequence, so a plain literal
        # would trigger an invalid-escape warning. The path value is unchanged.
        with open(r"Inputs\CUST_ISSUE") as report_file:
            # Named report_text so the builtin input() is not shadowed.
            report_text: str = report_file.read()
        with open("config.json") as configFile:
            config: dict = load(configFile)
        columns: list[str] = config["COLS"]
        # Index 1 selects the second line of the report.
        # NOTE(review): presumably that is the header row holding the column names — confirm.
        replace_bad_cols(report_text.splitlines()[1], columns)

    test_replace_bad_cols()

@ -1 +1 @@
{"loggingLevel": "ERROR", "directories": {"ASSET": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "CUST": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "DOB": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "FIN": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs", "output": "C:/Users/glott/OneDrive - LEAF Commercial Capital/Documents/0 In Progess/Portfolio/Automation/IL Formatter/Inputs"}, "COLS": ["CUST ID", "CONTRACT NO", "BUSINESS TYPE", "FED ID", "CUST CREDIT ACCT", "CUSTOMER", "LEASE TYPE", "EQUIPMENT COST", "CBR", "NET INVESTMENT", "ANNUAL COMBINED IRR", "CONTRACT TERM", "INCOME START DATE", "FIRST PYMT DATE", "FIRST PYMT AMT", "CONTRACT PYMT", "INVOICE CODE", "INV DAYS", "INV DUE DAY", "SEC DEPOSIT", "IDC AMOUNTS", "IDC DATES", "RESIDUAL", "MANAGERS RESIDUAL", "PROMOTION", "PRODUCT LINE", "REGION", "REGION DESC", "BRANCH", "BUSINESS SEGMENT", "LEAD BANK", "MRKTNG REP", "MRKTNG REGION", "REMIT TO", "PYMT OPTION", "BANK CODE", "TAPE BANK NUM", "TAPE ACCOUNT NUM", "TAPE ACCT TYPE", "DEALER", "PRIVATE LABEL", "RESID METHOD", "LATE CHRG EXMPT", "INSURANCE CODE", "VARIABLE DATE", "VARIABLE RATE", "BILLING CYCLE", "UM USER DATE2", "CR ATTG PHONE", "GROSS CONTRACT", "ADV ", "PD AMT FINANCED", "PD INCOME START DATE", "INVOICE DESC", "VARIABLE PYMT CODE", "PD PAYMENT AMT", "QUOTE BUYOUT", "LATE CHARGE CODE", "LATE CHRG RATE", "M DEF COLLECTOR", "AM ACH LEAD DAYS", "UNL POOL", "PD RISK DATE", "PD RISK", "LGD RISK", "LGD DATE", "Service By Others", "CONTRACT NO", "CUST CREDIT ACCT", "CUST ID", "CUST NAME", "UATB CUST DBA", "UATB CUST ADDRESS1 45", "UATB CUST ADDRESS2 45", "UATB CUST ADDRESS3 45", "CUST CITY", "CUST STATE", "CUST ZIP", "GUAR CODE 1", "PRIN1/GUAR NAME 1", "PRIN1 ADD1", "PRIN1 ADD2", 
"PRIN1 CITY1", "PRIN1 ST 1", "ZIP 1", "FED ID/SS#1", "GUAR CODE 2 PRIN/GUAR NAME 2", "PRIN2 ADD2", "PRIN2 ADDR2", "PRIN2 CITY2", "PRIN2 ST 2ZIP 2", "FED ID/SS#2", "BILLING NAME", "UATB AR ADDRESS1 45", "UATB AR ADDRESS2 45", "UATB AR ADDRESS3 45", "AR CITY", "AR STATE", "AR ZIP", "AR ATTN", "UATB CR ATTG NAME40", "CR SCORING", "FACILITY SCORE", "SIC CODE", "ASSET #", "EQUIP DESC", "QUANTITY", "NEW USED", "MODEL", "A MANUFACTURER YEAR", "SERIAL NUMBER", "EQUIP CODE", "EQUIP CODE DESC", "ASSET VENDOR", "ASSET VENDOR NAME", "MANUFACTURER", "MANUFACT NAME", "UATB EQUIP ADDR1 45", "UATB EQUIP ADDR2 45", "EQUIP CITY", "EQUIP STATE", "EQUIP ZIP", "STATE TAX CODE", "CNTY TAX CODE", "CITY TAX CODE", "PROP STATUS", "EQUIP COST", "EQUIP COST PCT", "PUR OPTION", "PUR OPTION", "AS RECOURSE CODE", "RESID AMT", "BEG DEPR DATE", "OPER LS BEGIN DATE", "OPER LS LIM", "OPER LS SALVAGE", "PRIN/GUAR NAME 1", "DOB1", "GUAR CODE 2", "PRIN/GUAR NAME 2", "DOB2"]}
{"loggingLevel": "ERROR", "directories": {"ASSET": "", "CUST": "", "DOB": "", "FIN": "", "output": ""}, "COLS": ["CUST ID", "CONTRACT NO", "BUSINESS TYPE", "FED ID", "CUST CREDIT ACCT", "CUSTOMER", "LEASE TYPE", "EQUIPMENT COST", "CBR", "NET INVESTMENT", "ANNUAL COMBINED IRR", "CONTRACT TERM", "INCOME START DATE", "FIRST PYMT DATE", "FIRST PYMT AMT", "CONTRACT PYMT", "INVOICE CODE", "INV DAYS", "INV DUE DAY", "SEC DEPOSIT", "IDC AMOUNTS", "IDC DATES", "RESIDUAL", "MANAGERS RESIDUAL", "PROMOTION", "PRODUCT LINE", "REGION", "REGION DESC", "BRANCH", "BUSINESS SEGMENT", "LEAD BANK", "MRKTNG REP", "MRKTNG REGION", "REMIT TO", "PYMT OPTION", "BANK CODE", "TAPE BANK NUM", "TAPE ACCOUNT NUM", "TAPE ACCT TYPE", "DEALER", "PRIVATE LABEL", "RESID METHOD", "LATE CHRG EXMPT", "INSURANCE CODE", "VARIABLE DATE", "VARIABLE RATE", "BILLING CYCLE", "UM USER DATE\\d?", "CR ATTG PHONE", "GROSS CONTRACT", "ADV ", "PD AMT FINANCED", "PD INCOME START DATE", "INVOICE DESC", "VARIABLE PYMT CODE", "PD PAYMENT AMT", "QUOTE BUYOUT", "LATE CHARGE CODE", "LATE CHRG RATE", "M DEF COLLECTOR", "AM ACH LEAD DAYS", "UNL POOL", "PD RISK DATE", "PD RISK", "LGD RISK", "LGD DATE", "Service By Others", "CONTRACT NO", "CUST CREDIT ACCT", "CUST ID", "CUST NAME", "UATB CUST DBA", "UATB CUST ADDRESS\\d \\d{2}", "CUST CITY", "CUST STATE", "CUST ZIP", "GUAR CODE \\d", "PRIN\\d?/GUAR NAME \\d", "PRIN\\d? ADDR?\\d", "PRIN\\d? CITY\\d", "PRIN\\d? 
ST \\d", "ZIP \\d", "FED ID/SS#\\d", "BILLING NAME", "UATB AR ADDRESS\\d \\d{2}", "AR CITY", "AR STATE", "AR ZIP", "AR ATTN", "UATB CR ATTG NAME\\d{2}", "CR SCORING", "FACILITY SCORE", "SIC CODE", "ASSET #", "EQUIP DESC", "QUANTITY", "NEW USED", "MODEL", "A MANUFACTURER YEAR", "SERIAL NUMBER", "EQUIP CODE", "EQUIP CODE DESC", "ASSET VENDOR", "ASSET VENDOR NAME", "MANUFACTURER", "MANUFACT NAME", "UATB EQUIP ADDR\\d \\d{2}", "EQUIP CITY", "EQUIP STATE", "EQUIP ZIP", "STATE TAX CODE", "CNTY TAX CODE", "CITY TAX CODE", "PROP STATUS", "EQUIP COST", "EQUIP COST PCT", "PUR OPTION", "PUR OPTION", "AS RECOURSE CODE", "RESID AMT", "BEG DEPR DATE", "OPER LS BEGIN DATE", "OPER LS LIM", "OPER LS SALVAGE", "PRIN/GUAR NAME \\d", "DOB\\d", "GUAR CODE \\d", "PRIN/GUAR NAME \\d"]}

@ -1,2 +1,6 @@
[ ] Allow automatic recognition of report type
[ ] Check type by columns
[ ] Remove Individual selection
[ ] Allow drag & drop (of multiselection)
[ ] Notification on completion
[ ] Icons
Loading…
Cancel
Save