import getopt
import os
import re
import sys
import time
from datetime import datetime as dt, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Pattern of a contract number as it appears in the report: 3 digits, 7 digits,
# 3 digits, separated by dashes. Kept as a plain (raw) string so that existing
# users of this module-level name keep working.
contract_number_regex = r"\d{3}-\d{7}-\d{3}"


def create_line_divider(breakage_list: list):
    """
    Build a fixed-width slot extractor for report lines.

    breakage_list defines the split points (character offsets) used to cut a
    line into slots.

    Example
        Given breakage_list [10, 20, 30], slot 0 of the returned extractor
        yields characters 0 - 10 of the string, slot 1 yields characters
        10 - 20, and slot 3 (the last one) yields everything from character 30
        to the end of the line.

    Returns the `extract_line_slot` function bound to breakage_list.
    """

    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """
        Pull the text of one slot out of a line, using the break points given
        to the parent function.

        ONLY USE THIS FUNCTION THROUGH CREATION USING 'create_line_divider'

        The slot text is stripped and its ',' characters removed; if the result
        parses as a float, the float is returned, otherwise the cleaned string.

        slot_num    : index of the slot, 0 .. len(breakage_list)
        line_string : the line to cut
        debug       : when True, print the slot boundaries and extracted value
        """
        # We can't have a slot number higher than the number of slots
        assert slot_num < len(breakage_list) + 1, (
            f"slot_num {slot_num} exceeds the {len(breakage_list) + 1} available slots"
        )
        low_range = 0 if slot_num == 0 else breakage_list[slot_num - 1]
        high_range = (
            len(line_string)
            if slot_num == len(breakage_list)
            else breakage_list[slot_num]
        )
        # In order to create a float we need to remove the , from the string
        data = line_string[low_range:high_range].strip().replace(",", "")
        # NOTE(review): every slot that parses as a number becomes a float, so
        # all-digit identifiers lose leading zeros ("021000021" -> 21000021.0).
        # Left unchanged because callers may rely on the current output types.
        try:
            data = float(data)
        except ValueError:
            # Not numeric: keep the cleaned-up text
            pass
        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data

    return extract_line_slot


def ach(report: str, save_name: str) -> pd.DataFrame:
    """
    Extract contract rows and batch totals from an ACH report and save them to
    an Excel workbook.

    report    : full text of the report
    save_name : path of the Excel file to write

    The workbook gets two sheets: "data" (one row per contract line) and
    "Summary" (one row per batch total line). Returns the "data" dataframe.

    A line is treated as a contract row when it contains both a contract number
    and a 9 digit number. A line containing "BATCH #### TOTAL" closes a batch:
    its batch number, payment date and lessor are written onto every contract
    row collected since the previous batch total.

    Raises ValueError if the total on a batch total line is not numeric.
    """
    lines = report.splitlines()

    extracted_data_dict = {
        "ContractNumber": [],
        "CustomerName": [],
        "BankCode": [],
        "BankNumber": [],
        "AccountNumber": [],
        "Payment": [],
        "Batch": [],
        "Lessor": [],
        "PaymentDate": [],
    }
    columns = list(extracted_data_dict.keys())
    # The order of the keys matters: the first six columns map one-to-one onto
    # the slot numbers of the line extractor. The last three are batch-level
    # values that are only known once the batch total line is reached.
    line_columns = columns[:-3]
    batch_columns = columns[-3:]

    batches = {
        "batch_num": [],
        "payment_date": [],
        "lessor": [],
        "total": [],
    }

    data_extractor = create_line_divider([19, 57, 67, 82, 104])
    # Compiled once here rather than looked up for every line of the report
    contract_number_pattern = re.compile(contract_number_regex)
    bank_number_pattern = re.compile(r"\d{9}")
    batch_total_pattern = re.compile(r"BATCH \d{4} TOTAL")

    for index, line in enumerate(lines):
        # Check for a contract number and a bank number in the line
        if contract_number_pattern.search(line) and bank_number_pattern.search(line):
            for slot_num, column in enumerate(line_columns):
                extracted_data_dict[column].append(data_extractor(slot_num, line))

        # This searches for a statement that looks like a batch total line
        if batch_total_pattern.search(line):
            # Batch number is taken from characters 96 to 101 of the line
            batch_num = line[96:101]
            # Payment date is 2 lines below, between characters 114 and 125.
            # A report that ends early yields an empty date instead of crashing.
            payment_date = lines[index + 2][114:125] if index + 2 < len(lines) else ""
            # Lessor is the first three characters of the most recent contract
            # number; empty when no contract row has been seen yet.
            contract_numbers = extracted_data_dict["ContractNumber"]
            lessor = contract_numbers[-1][0:3] if contract_numbers else ""
            # Total is a number given by the report for that batch.
            # ',' is removed so that it can be transformed into a float
            total = float(line[107:125].strip().replace(",", ""))

            batches["batch_num"].append(batch_num)
            batches["payment_date"].append(payment_date)
            batches["lessor"].append(lessor)
            batches["total"].append(total)

            # Any time there's a new batch, stamp its values onto every
            # contract row that does not have them yet.
            row_count = len(extracted_data_dict["BankCode"])
            latest_values = {
                "Batch": batch_num,
                "Lessor": lessor,
                "PaymentDate": payment_date,
            }
            for column, value in latest_values.items():
                missing = row_count - len(extracted_data_dict[column])
                extracted_data_dict[column].extend([value] * missing)

    # Contract rows found after the last batch total line have no batch values.
    # Pad them with None so every column has the same length, which
    # pd.DataFrame requires.
    row_count = len(extracted_data_dict["BankCode"])
    for column in batch_columns:
        missing = row_count - len(extracted_data_dict[column])
        extracted_data_dict[column].extend([None] * missing)

    dataframe = pd.DataFrame(extracted_data_dict)

    # We're creating two sheets: data & Summary, so we need to open an excel writer
    with pd.ExcelWriter(save_name) as writer:
        dataframe.to_excel(writer, index=False, sheet_name="data")
        # The batches dictionary is converted to a dataframe and added as its own sheet
        pd.DataFrame(batches).to_excel(writer, index=False, sheet_name="Summary")

    return dataframe


r1 = "/config/workspace/LEAF/IL Extract SRC/ach_errors/2022.05.27_ACH_C"
r2 = "/config/workspace/LEAF/IL Extract SRC/ach_errors/2022.06.03_ACH_C"

# Only process the report when run as a script, so that importing this module
# does not read or write any files.
if __name__ == "__main__":
    with open(r2, errors="replace") as ifile:
        report = ifile.read()

    ach(report, "test_ach_0613.xlsx")