"""Extract tabular data from fixed-width InfoLease text reports into pandas DataFrames."""
import getopt
import os
import re
import sys
import time
from datetime import datetime as dt, timedelta
from pathlib import Path

import pandas as pd

# Pattern used throughout this module to recognise a contract number
# (3 digits - 7 digits - 3 digits).
contract_number_regex = r"\d{3}-\d{7}-\d{3}"
_CONTRACT_NUMBER_RE = re.compile(contract_number_regex)


def _is_contract_number(value) -> bool:
    """Return True when *value* is a string containing a contract number.

    Slot extractors convert numeric text to ``float``; those values can never
    hold a contract number and must not be handed to ``re.search``.
    """
    return isinstance(value, str) and _CONTRACT_NUMBER_RE.search(value) is not None


def _append_slots(data: dict, extractor, pairs, text: str) -> None:
    """Append ``extractor(slot, text)`` to ``data[column]`` for each (column, slot) pair."""
    for column, slot in pairs:
        data[column].append(extractor(slot, text))


def _append_section(data: dict, extractor, layout, section) -> None:
    """Apply ``layout[n]`` (a list of (column, slot) pairs) to ``section[n]``.

    Indexing (rather than ``zip``) is deliberate: a section that is shorter
    than the layout raises ``IndexError`` instead of silently producing
    columns of unequal length.
    """
    for n, pairs in enumerate(layout):
        _append_slots(data, extractor, pairs, section[n])


class ILReport:
    """
    InfoLease Report class will be used to work with the files.
    It makes it easier to add new reports to the workflow and to make it more
    clear where the reports are coming from. It also helps with tracking
    reports that may not be ready yet.
    """

    def __init__(self, location, extraction_function=None, output_location=None, output_name=None):
        """
        location            : path of the InfoLease report to read
        extraction_function : callable taking the report text and returning a DataFrame
        output_location     : directory for the excel output (defaults to the input's directory)
        output_name         : base name of the output (defaults to the input file name
                              with yesterday's date, formatted %Y.%m.%d, removed)
        """
        # The location where the InfoLease report is stored
        self.location = location
        # If output location not specified, save to the input location
        if output_location is None:
            self.output_location = Path(location).parent.absolute()
        else:
            self.output_location = output_location
        # This is optional but has a default
        if output_name is None:
            # Get the file name of the input and remove the date
            yesterday = (dt.now() - timedelta(days=1)).strftime('%Y.%m.%d')
            self.output_name = os.path.basename(str(self.location)).replace(yesterday, "")
        else:
            self.output_name = output_name
        # The function used to extract the data from the report
        self.x_method = extraction_function
        # Tracks whether the data was successfully extracted
        self.successful = False

    def _extract(self):
        """Read the report and run the extraction function.

        Returns the DataFrame, or None after printing the reason when the file
        cannot be opened, extraction raises, or the result has fewer than two
        rows.
        """
        try:
            # errors="replace" deals with non UTF-8 characters
            with open(self.location, errors="replace") as ifile:
                report = ifile.read()
        except IOError as ioe:
            print(f"Failed to open file: {self.location}\n{ioe}")
            return None
        try:
            # Run the associated method to extract the data and get the dataframe
            dataframe = self.x_method(report)
        except Exception as e:
            print(f"{self.output_name} failed to process:\n{e}")
            return None
        # Explicit check instead of `assert`, which is removed under `python -O`
        try:
            too_short = len(dataframe) <= 1
        except TypeError:
            # e.g. the extraction function returned None
            too_short = True
        if too_short:
            print(f"Data Length Error: {self.output_name} is empty:\n{dataframe}")
            return None
        return dataframe

    def run(self) -> int:
        """
        This method is what actually runs the report.
        It uses the specified extraction function to create and save an excel document.

        SUCCESS returns 0
        ERROR returns 1
        Failure is also noted by self.successful == False
        """
        dataframe = self._extract()
        if dataframe is None:
            self.successful = False
            return 1
        try:
            # Save the dataframe as an excel document
            stamp = dt.now().strftime('%Y%m%d-%H%M')
            dataframe.to_excel(f"{self.output_location}/{self.output_name}_{stamp}.xlsx", index=False)
        except Exception as e:
            self.successful = False
            print(f"{self.output_name} failed to save to excel!\n{dataframe}\n{e}")
            return 1
        self.successful = True
        return 0

    def process(self):
        """
        Extract the report without saving it.

        Returns the DataFrame on success. On failure sets self.successful to
        False and returns 1. self.successful is left untouched on success.
        """
        dataframe = self._extract()
        if dataframe is None:
            self.successful = False
            return 1
        return dataframe


def create_line_divider(breakage_list: list):
    """
    This allows for the creation of a custom data extractor.
    Breakage list defines the split points that will be used for the line.

    Example
        Given breakage_list [10, 20, 30], using slot_num 0 in the resulting
        extract_line_slot will yield characters 0 - 10 from the string.
        Slot 1 would give characters 10 - 20. The last slot (here 3) runs from
        the final break point to the end of the line.
    """
    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """
        Pulls data from a line/string using break points defined by the parent function.
        ONLY USE THIS FUNCTION THROUGH CREATION USING 'create_line_divider'

        The slot text is stripped and has its commas removed. It is returned as
        a float when it parses as one, otherwise as a string.
        """
        if slot_num > len(breakage_list):
            # AssertionError keeps the exception type of the original `assert`
            # while still being enforced under `python -O`.
            raise AssertionError(
                f"slot_num {slot_num} out of range for {len(breakage_list) + 1} slots"
            )
        low_range = 0 if slot_num == 0 else breakage_list[slot_num - 1]
        if slot_num == len(breakage_list):
            high_range = len(line_string)
        else:
            high_range = breakage_list[slot_num]
        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            # Not numeric: keep the cleaned string
            pass
        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data
    return extract_line_slot


######################################################################################################################
#                                                                                                                    #
#   EXTRACTION FUNCTIONS: used to pull data out of specific InfoLease report types                                   #
#                                                                                                                    #
######################################################################################################################
#
# COMMON EXTRACTION COMPONENTS/FEATURES:
#   - lines = report.splitlines() : splits the report into a list of lines
#   - extracted_data_dict : dictionary that holds the extracted data and is used to create the dataframe
#   - columns = list(extracted_data_dict) : the dictionary keys, in order (the excel column heads)
#   - data_extractor = create_line_divider([#,#,#,#,#]) : creates a function we can use to pull data
#       from a line based on its 'slot position'. A slot position is the run of characters between
#       the numbers specified in the list passed into the function
#   - for idx, text in enumerate(lines) : iterates through each line in the document together with
#       its line number; the line number is needed when data sits on adjacent lines
#   - lineN = list(zip(columns[#:#], range(#, #))) : a list of (column name, slot number) tuples.
#       Iterating it makes sure the correct data slot is used for each column/key of the dictionary
#
# COMMON REGEX COMPONENTS
#   \d  : any digit [0-9]
#   \D  : any character that is not a digit
#   \s  : whitespace
#   .   : any character besides newline
#   {#} : # repetitions of the preceding character
#   *   : 0 or more repetitions of the preceding character


def ach(report: str) -> pd.DataFrame:
    """Extract one row per line that contains both a contract number and a 9-digit number."""
    extracted_data_dict = {
        "ContractNumber": [],
        "CustomerName": [],
        "BankCode": [],
        "BankNumber": [],
        "AccountNumber": [],
        "Payment": [],
    }
    columns = list(extracted_data_dict)
    # Column n is read from slot n
    slots = list(zip(columns, range(len(columns))))
    data_extractor = create_line_divider([19, 57, 67, 82, 104])
    bank_number = re.compile(r"\d{9}")
    for text in report.splitlines():
        if _CONTRACT_NUMBER_RE.search(text) and bank_number.search(text):
            _append_slots(extracted_data_dict, data_extractor, slots, text)
    return pd.DataFrame(extracted_data_dict)


def disposition(report: str) -> pd.DataFrame:
    """Extract one row per line whose first slot holds a contract number.

    The customer name is the stripped text of the following line. A matching
    line with no line after it raises IndexError.
    """
    lines = report.splitlines()
    extracted_data_dict = {
        "ContractNumber": [],
        "Amount Rec": [],
        "Trans Num": [],
        "Date RCVD": [],
        "Date Posted": [],
        "Last Pymt Due": [],
        "Date Due": [],
        "Residual Amt": [],
        "Term Date": [],
        "Total Pastdue": [],
        "Customer Name": [],
    }
    columns = list(extracted_data_dict)
    # Every column except the last ("Customer Name") comes from the data line itself
    slots = list(zip(columns[:-1], range(len(columns) - 1)))
    data_extractor = create_line_divider([15, 32, 41, 51, 61, 79, 88, 103, 114])
    for idx, text in enumerate(lines):
        # Slot 0 may come back as a float; only strings can hold a contract number
        if _is_contract_number(data_extractor(0, text)):
            _append_slots(extracted_data_dict, data_extractor, slots, text)
            extracted_data_dict["Customer Name"].append(lines[idx + 1].strip())
    return pd.DataFrame(extracted_data_dict)


def gainloss(report: str) -> pd.DataFrame:
    """Extract one row per contract from a gain/loss report.

    A record is recognised by a line whose slot 0 holds a contract number and
    whose slot 1 is numeric. The record spans six lines, starting one line
    above that line. 'DISPOSITION CODE' is split at its first space into
    'DISPOSITION CODE' and a 'DISPOSITION DESC' column appended at the end.
    """
    lines = report.splitlines()
    extracted_data_dict = {
        'REM RENT RCVB': [],
        'GUAR RESIDUAL': [],
        'ASSET VAL': [],
        'EQUITY ADDON': [],
        'CURR INT RCVB': [],
        'MISC G/L': [],
        'BLENDED INC': [],
        'CONTRACT NUMBER': [],
        'CURR RENT RCVB': [],
        'RESIDUAL': [],
        'END/SEC DEP': [],
        'SALES TAX': [],
        'INVENT CHANGE': [],
        'NET RESERVE': [],
        'LATE CHGS': [],
        'CUSTOMER NAME': [],
        'UNEARNED FIN': [],
        'UNAMORT RES': [],
        'MISC': [],
        'MISC TAX': [],
        'CASH RECEIVED': [],
        'RCV OFFSET': [],
        'GAIN/LOSS': [],
        'DISPOSITION CODE': [],
        # 'DISPOSITION DESC' is derived from 'DISPOSITION CODE' after extraction.
        # It must not be a key here: it has no slot of its own, and a stray
        # literal previously fused with the next key's name.
        'UNEARNED IDC': [],
        'UNPAID INT': [],
        'PENALTY FEE': [],
        'UNPAID ACCRD': [],
        'RENEWAL RCVBL': [],
        'DEF REN INC': [],
        'DEF REN INT': [],
        'EARNED IDC': [],
        'GST BOOK G/L': [],
        'UNRECOG GST': [],
        'INT EARNED': [],
        'OVER/SHORT': [],
        'OPER RCVB': [],
        'OPER BASIS': [],
        'CTD OPER DEPR': [],
    }
    # Index of the last column on each record line:
    # L0: BLENDED INC 6 | L1: LATE CHGS 14 | L2: GAIN/LOSS 22
    # L3: DEF REN INT 30 | L4: OVER/SHORT 35 | L5: CTD OPER DEPR 38
    columns = list(extracted_data_dict)
    # Pairs each dictionary key with its data slot, one list per record line,
    # so that they can be iterated through during data extraction
    layout = [
        list(zip(columns[0:7], range(1, 8))),
        list(zip(columns[7:15], range(0, 8))),
        list(zip(columns[15:23], range(0, 8))),
        list(zip(columns[23:31], range(0, 8))),
        list(zip(columns[31:36], [1, 2, 4, 5, 7])),
        list(zip(columns[36:], range(1, 4))),
    ]
    data_extractor = create_line_divider([27, 43, 58, 74, 88, 105, 120])
    for idx, text in enumerate(lines):
        if _is_contract_number(data_extractor(0, text)) and isinstance(data_extractor(1, text), float):
            data_section = lines[idx - 1:idx + 5]
            _append_section(extracted_data_dict, data_extractor, layout, data_section)
    df = pd.DataFrame(extracted_data_dict)
    # The Accounting team wanted the disposition code split into number and description
    disp_code = []
    disp_description = []
    for entry in df['DISPOSITION CODE'].to_list():
        code, _, description = entry.partition(" ")
        disp_code.append(code)
        disp_description.append(description)
    df["DISPOSITION CODE"] = disp_code
    df["DISPOSITION DESC"] = disp_description
    return df


# Works for Net-inv-loans & NIV-after
def net_invest_trial_balance(report: str) -> pd.DataFrame:
    """Extract one row per contract; a record is four lines starting one line above
    the line whose slot 0 holds the contract number."""
    lines = report.splitlines()
    extracted_data_dict = {
        'CUSTOMER NAME': [],
        'CURR INT RCVB': [],
        'UNEARNED BLENDED': [],
        'BLEND NET INV': [],
        'LEASE NUMBER': [],
        'GROSS CONTRACT': [],
        'CURR RENT RCVB': [],
        'UNEARN FIN': [],
        'END DEPOSIT': [],
        'SEC DEPOSIT': [],
        'LEASE PYMTS': [],
        'TOTAL': [],
        'CONTRACT STAT': [],
        'PAYMENTS RCVD': [],
        'REM RENT RCVB': [],
        'UNEARN RESID': [],
        'PROV LOSS': [],
        'NET RESERVE': [],
        'UNEARN INC': [],
        'BAL REMAINING': [],
        'RESIDUAL': [],
        'UNPAID INT': [],
        'NET INV': [],
        'UNEARNED IDC': [],
    }
    columns = list(extracted_data_dict)
    layout = [
        list(zip(columns[0:4], [0, 3, 4, 5])),
        list(zip(columns[4:12], range(0, 8))),
        list(zip(columns[12:19], range(0, 7))),
        list(zip(columns[19:], range(1, 6))),
    ]
    data_extractor = create_line_divider([18, 35, 53, 67, 87, 106, 117])
    for idx, text in enumerate(lines):
        if _is_contract_number(data_extractor(0, text, False)):
            data_section = lines[idx - 1:idx + 4]
            _append_section(extracted_data_dict, data_extractor, layout, data_section)
    return pd.DataFrame(extracted_data_dict)


def lockbox(report: str) -> pd.DataFrame:
    """Extract one row per contract payment from a lockbox report.

    Raises IndexError when a payment line is not followed by any customer
    name line, or when a contract-only line appears before any full line.
    """
    lines = report.splitlines()
    extracted_data_dict = {
        "CustomerName": [],
        "PaymentDate": [],
        "InvoiceNumber": [],
        "CheckNumber": [],
        "InvoicePayment": [],
        "ContractNumber": [],
        "ContractPayment": [],
    }
    columns = list(extracted_data_dict)
    # (column, slot) pairs for the two groups of data found on a payment line
    bank_payment_records = list(zip(columns[1:5], [1, 2, 3, 4]))
    infolease_payment_records = list(zip(columns[5:], [7, 8]))
    # Regular expressions used to find relevant data lines
    full_line = re.compile(r"\d*\s{5}\d{2}/\d{2}/\d{4}\s{4}1")
    contract_only_line = re.compile(r"\s{90}\d.{7}1\d{2}-")
    cust_name_line = re.compile(r"\s{98}.{28}\D*")
    # Slots are ranges of characters denoted by the list fed into the creation function
    data_extractor = create_line_divider([9, 19, 39, 56, 69, 90, 98, 118])

    for idx, text in enumerate(lines):
        # We can skip empty lines
        if not text:
            continue
        if full_line.search(text):
            # A full line carries its own bank payment data
            _append_slots(extracted_data_dict, data_extractor, bank_payment_records, text)
        elif contract_only_line.search(text):
            # A contract-only line reuses the bank payment data of the previous entry
            for column, _slot in bank_payment_records:
                extracted_data_dict[column].append(extracted_data_dict[column][-1])
        else:
            # Neither kind of payment line: irrelevant data
            continue
        _append_slots(extracted_data_dict, data_extractor, infolease_payment_records, text)

        # Keep moving down a line at a time until a customer name line is found.
        # It is typically 1 line under the data but can be further when cut off by a page end.
        for below in range(idx + 1, len(lines)):
            if cust_name_line.search(lines[below]):
                extracted_data_dict["CustomerName"].append(data_extractor(7, lines[below]))
                break
        else:
            # Same exception type as running off the end of the list, with a usable message
            raise IndexError(f"lockbox: no customer name line found after line {idx + 1}")
    return pd.DataFrame(extracted_data_dict)


def minv(report: str) -> pd.DataFrame:
    """Extract one row per line that contains a contract number anywhere in it."""
    data_extractor = create_line_divider([15, 32, 52, 71, 83, 107, 116, 128])
    extracted_data_dict = {
        "ContractNumber": [],
        "UTAB_OIC_DUE": [],
        "RentalDue": [],
        "UTAB_OIC_PYMT": [],
        "ChargeType": [],
        "OutstandBalance": [],
        "BizSegment": [],
        "BookingDate": [],
        "Branch": [],
    }
    columns = list(extracted_data_dict)
    slots = list(zip(columns, range(len(columns))))
    for text in report.splitlines():
        if _CONTRACT_NUMBER_RE.search(text):
            _append_slots(extracted_data_dict, data_extractor, slots, text)
    # All the list lengths need to be the same so if anything was missed it will fail to build
    return pd.DataFrame(extracted_data_dict)


# Good for PUB_WIRES, VMCC, PBP_EPAY, returned check
def payment_transactions(report: str) -> pd.DataFrame:
    """Extract one row per line whose slot 1 holds a contract number.

    The transaction number (first run of 8 digits, or ""), customer name and
    invoice number are read from the following line. A matching line with no
    line after it raises IndexError.
    """
    lines = report.splitlines()
    data_extractor = create_line_divider([6, 33, 52, 62, 80, 89, 110, 121])
    extracted_data_dict = {
        'SEQ': [],
        'ACCOUNT NUMBER': [],
        'PYMT METHOD': [],
        'DATE RCVD': [],
        'AMOUNT': [],
        'REF NO': [],
        'PAYMENT MEMO': [],
        'PYMT TYPE': [],
        'CHECK NO': [],
        'CUSTOMER NAME': [],
        'TRANSACTIONS NUM': [],
        'INV NO': [],
    }
    columns = list(extracted_data_dict)
    # The last three columns come from the following line, not from slots
    slots = list(zip(columns[:-3], range(len(columns) - 3)))
    transaction_num = re.compile(r"\d{8}")
    for idx, text in enumerate(lines):
        if not _is_contract_number(data_extractor(1, text, False)):
            continue
        _append_slots(extracted_data_dict, data_extractor, slots, text)
        following = lines[idx + 1]
        tnum_match = transaction_num.search(following)
        extracted_data_dict["TRANSACTIONS NUM"].append(tnum_match.group() if tnum_match else "")
        extracted_data_dict['CUSTOMER NAME'].append(following[6:37].strip())
        extracted_data_dict['INV NO'].append(following[79:90].strip())
    return pd.DataFrame(extracted_data_dict)


def renewal_net_invest_trial_balance(report: str) -> pd.DataFrame:
    """Extract one row per contract; a record is three lines starting one line above
    the line whose slot 0 holds the contract number."""
    lines = report.splitlines()
    data_extractor = create_line_divider([21, 29, 43, 58, 71, 88, 99, 113])
    extracted_data_dict = {
        'CUSTOMER NAME': [],
        'TYPE': [],
        'GROSS RENEWAL': [],
        'CUR RENT RCVB': [],
        'UNEARNED RIN': [],
        'REMAINING RES': [],
        'LEASE PYMTS': [],
        'CONTRACT NUMBER': [],
        'RENEWAL': [],
        'PAYMENTS RCVD': [],
        'REM RENT RCVB': [],
        'UNPAID RES': [],
        'SECURITY DEP': [],
        'NET INVEST': [],
        'UNEARN INCOME': [],
        'TOTAL': [],
        'REMAINING BAL': [],
        'FINANCED RES': [],
    }
    columns = list(extracted_data_dict)
    layout = [
        list(zip(columns[0:7], [0, 1, 2, 3, 4, 5, 7])),
        list(zip(columns[7:16], range(0, 9))),
        list(zip(columns[16:], [3, 4])),
    ]
    for idx, text in enumerate(lines):
        if _is_contract_number(data_extractor(0, text, False)):
            data_section = lines[idx - 1:idx + 4]
            _append_section(extracted_data_dict, data_extractor, layout, data_section)
    return pd.DataFrame(extracted_data_dict)


def unapplied(report: str) -> pd.DataFrame:
    """Extract one row per two-line record of an unapplied report.

    A record starts on a line whose slot 0 contains 7 consecutive digits and
    whose slot 3 contains a ##/##/#### date. The first nine columns come from
    that line and the remaining eight from slots 1-8 of the following line.
    A matching line with no line after it raises IndexError.
    """
    lines = report.splitlines()
    extracted_data_dict = {
        "Trans Num": [],
        "ContractNumber": [],
        "CheckNum": [],
        "Date RCVD": [],
        "Asset ID": [],
        "Reversed Amt": [],
        "Branch": [],
        "Unapplied Susp Acct": [],
        "PaymentMemo": [],
        "Payers Name": [],
        "Batch Num": [],
        "Posting Date": [],
        "Unapplied Amt": [],
        "Rev Post Date": [],
        "Ref Num": [],
        "Check Amt": [],
        "Reason Code": [],
    }
    columns = list(extracted_data_dict)
    first_line = list(zip(columns[0:9], range(0, 9)))
    # Slot 0 of the second line is not used
    second_line = list(zip(columns[9:], range(1, len(columns) - 8)))
    data_extractor = create_line_divider([9, 25, 38, 50, 65, 80, 89, 108])
    trans_num = re.compile(r"\d{7}")
    date_rcvd = re.compile(r"\d{2}/\d{2}/\d{4}")
    # The line number is tracked so the second half of each record can be
    # read from the line that follows the match
    for idx, text in enumerate(lines):
        # str() because numeric slots come back as floats
        has_trans_num = trans_num.search(str(data_extractor(0, text, debug=False)))
        has_date = date_rcvd.search(str(data_extractor(3, text, debug=False)))
        if has_trans_num and has_date:
            _append_slots(extracted_data_dict, data_extractor, first_line, text)
            _append_slots(extracted_data_dict, data_extractor, second_line, lines[idx + 1])
    return pd.DataFrame(extracted_data_dict)