import getopt
import os
import re
import sys
import time
from datetime import datetime as dt, timedelta
from pathlib import Path
from pprint import pprint as prt

import numpy as np
import pandas as pd

# Pattern for a contract number: 3 digits, 7 digits, 3 digits, dash-separated.
# Raw string so that "\d" reaches the regex engine instead of being treated as
# an (invalid) string escape.
contract_number_regex = r"\d{3}-\d{7}-\d{3}"


def create_line_divider(breakage_list: list):
    """Build an extractor for fixed-width text lines.

    ``breakage_list`` holds the character offsets at which a line is split.
    N break points define N + 1 slots: slot 0 runs from the start of the line
    to the first break point, slot ``i`` runs between break points ``i - 1``
    and ``i``, and the final slot runs from the last break point to the end
    of the line.

    Example:
        With ``breakage_list = [10, 20, 30]``, slot 0 yields characters
        0-10 of the line, slot 1 yields characters 10-20, and slot 3 yields
        everything from character 30 onward.

    Args:
        breakage_list: Ascending character offsets used as split points.

    Returns:
        A function ``extract_line_slot(slot_num, line_string, debug=False)``.
    """
    # Snapshot the break points so that later mutation of the caller's list
    # cannot silently shift the columns of an existing extractor.
    break_points = tuple(breakage_list)
    slot_count = len(break_points) + 1

    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """Return the content of one slot of ``line_string``.

        The slot text is stripped of surrounding whitespace and of commas
        (thousands separators). If the result parses as a float it is
        returned as a float, otherwise the cleaned string is returned.
        Note that ``float`` also accepts text such as "nan" or "inf".

        Only obtain this function through ``create_line_divider``.

        Args:
            slot_num: Zero-based slot index, at most ``len(breakage_list)``.
            line_string: The line to extract from. Lines shorter than the
                slot's offsets simply yield an empty string.
            debug: When true, print the slot bounds and extracted value.

        Raises:
            IndexError: If ``slot_num`` is outside the defined slots.
        """
        # Explicit check instead of ``assert`` (asserts vanish under -O);
        # negative indexes are rejected because they would wrap around and
        # return the wrong column.
        if not 0 <= slot_num < slot_count:
            raise IndexError(
                f"slot_num {slot_num} out of range for {slot_count} slots"
            )

        low_range = 0 if slot_num == 0 else break_points[slot_num - 1]
        if slot_num == len(break_points):
            high_range = len(line_string)
        else:
            high_range = break_points[slot_num]

        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            # Not numeric: keep the cleaned text as-is.
            pass

        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data

    return extract_line_slot


def renewal_net_invest_trial_balance(report: str, save_name: str) -> pd.DataFrame:
    """Extract records from a fixed-width report into a DataFrame and Excel file.

    A record is located by finding a line whose first slot contains a
    contract number (``contract_number_regex``). The record consists of three
    consecutive lines: the line before the contract line, the contract line
    itself, and the line after it. Fixed slots of each line are mapped to the
    output columns.

    Every line of each record is echoed to stdout; when a record line
    contains no "." the whole record is additionally dumped with line
    indexes (presumably to flag records that lack decimal amounts -- confirm
    with the report owner).

    Args:
        report: Full text of the report.
        save_name: Path of the Excel file to write.

    Returns:
        The DataFrame that was written to ``save_name``.
    """
    lines = report.splitlines()
    data_extractor = create_line_divider([21, 29, 43, 58, 71, 88, 99, 113])
    extracted_data_dict = {
        'CUSTOMER NAME': [],
        'TYPE': [],
        'GROSS RENEWAL': [],
        'REMAINING BAL': [],
        'FINANCED RES': [],
        'REMAINING RES': [],
        'LEASE PYMTS': [],
        'CONTRACT NUMBER': [],
        'RENEWAL': [],
        'PAYMENTS RCVD': [],
        'CUR RENT RCVB': [],
        'UNEARNED RIN': [],
        'SECURITY DEP': [],
        'NET INVEST': [],
        'UNEARN INCOME': [],
        'TOTAL': [],
        'REM RENT RCVB': [],
        'UNPAID RES': [],
    }
    columns = list(extracted_data_dict)

    # (column name, slot number) pairs for each of the three record lines.
    line0 = list(zip(columns[0:7], [0, 1, 2, 3, 4, 5, 7]))
    line1 = list(zip(columns[7:16], range(9)))
    line2 = list(zip(columns[16:], [3, 4]))
    layouts = (line0, line1, line2)

    # Compile once, outside the loop.
    contract_pattern = re.compile(contract_number_regex)

    for index, line in enumerate(lines):
        slot1 = data_extractor(0, line, False)
        # A numeric first slot cannot hold a contract number.
        if not isinstance(slot1, str):
            continue
        if contract_pattern.search(slot1) is None:
            continue

        # A record needs one line before and one after the contract line.
        # Without this guard ``lines[-1:2]`` (first line) or a two-element
        # slice (last line) would raise IndexError part-way through the
        # appends and leave the columns with unequal lengths.
        if index == 0 or index + 1 >= len(lines):
            print(f"Skipping incomplete record at line {index}: {line}")
            continue

        data_section = lines[index - 1:index + 2]

        for section_line in data_section:
            print(section_line)
            if "." not in section_line:
                for number, text in enumerate(data_section):
                    print(f"\n{number}: {text}")
                print('\n')

        for layout, section_line in zip(layouts, data_section):
            for column, slot in layout:
                extracted_data_dict[column].append(
                    data_extractor(slot, section_line)
                )

    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False)
    return dataframe


# Guarded so that importing this module does not read the report or write
# the Excel file; running it as a script behaves as before.
if __name__ == "__main__":
    with open("/config/workspace/LEAF/IL Extract SRC/2022.05.20 Renewal Net Investment", errors="replace") as rep_file:
        report = rep_file.read()
    prt(renewal_net_invest_trial_balance(report, "rn_TESTING.xlsx"))