# InfoLeaseExtract/ach_fix.py

import os
import pandas as pd
from datetime import datetime as dt, timedelta
import sys, getopt
import re
from pathlib import Path
import time
import numpy as np


# Pattern for a contract number: 3 digits, 7 digits and 3 digits separated by
# dashes. Written as a raw string so that "\d" reaches the regex engine
# untouched instead of relying on an invalid string escape sequence.
contract_number_regex = r"\d{3}-\d{7}-\d{3}"


def create_line_divider(breakage_list: list):
    """
    This allows for the creation of a custom data extractor for fixed-width lines.
    Breakage list defines the split points (character offsets) that will be
    used for the line. N split points produce N + 1 slots.
    Example
    Given breakage_list [10, 20, 30]
    using slot_num 0 in the resulting extract_line_slot will yield
    characters 0 - 10 from the string.
    Slot 1 would give characters 10 - 20
    Slot 3 (the last slot) would give characters 30 - end of the string
    """
    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """
        Pulls data from a line/string using break points defined by the
        parent function.
        ONLY USE THIS FUNCTION THROUGH CREATION USING 'create_line_divider'
        Will automatically convert numbers to floats; anything that float()
        can not parse is returned as the stripped string.
        NOTE: float() drops leading zeros and also accepts text such as
        "nan" or "inf", so those values come back as floats too.

        slot_num: index of the slot to read (0 .. len(breakage_list))
        line_string: the line to slice
        debug: when True, prints the slot boundaries and the extracted value
        """
        # The slot number must address an existing slot. Negative numbers are
        # rejected as well, otherwise they would silently wrap around through
        # negative list indexing and return the wrong part of the line.
        # (Kept as an assert so the exception type seen by callers is unchanged.)
        assert 0 <= slot_num < len(breakage_list) + 1, (
            f"slot_num must be between 0 and {len(breakage_list)}, got {slot_num}"
        )
        low_range = 0 if slot_num == 0 else breakage_list[slot_num - 1]
        high_range = len(line_string) if slot_num == len(breakage_list) else breakage_list[slot_num]
        # In order to create a float we need to remove the , from the string
        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            # Not a number: keep the cleaned-up text as is
            pass
        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data
    return extract_line_slot

def ach(report: str, save_name: str) -> pd.DataFrame:
    """
    Extracts contract payment lines and batch totals from an ACH report and
    saves them to an Excel workbook with two sheets: "data" and "Summary".

    A line counts as a contract line when it contains both a contract number
    (contract_number_regex) and a run of nine digits (bank number). Its fixed
    width slots fill the first six columns. Batch, Lessor and PaymentDate are
    back-filled whenever a "BATCH #### TOTAL" line is reached.

    report: the full text of the report
    save_name: path of the Excel workbook to write

    Returns the contract level dataframe (the content of the "data" sheet).
    Raises ValueError if the total on a batch total line is not a number.
    """
    lines = report.splitlines()
    extracted_data_dict = {
        "ContractNumber": [],
        "CustomerName": [],
        "BankCode": [],
        "BankNumber": [],
        "AccountNumber": [],
        "Payment": [],
        "Batch": [],
        "Lessor": [],
        "PaymentDate": [],
    }
    columns = list(extracted_data_dict.keys())
    # Here the order of the columns (keys in the dictionary) matters: the
    # first six are read straight from the contract line and must be in the
    # same order as the slot numbers. The last three come from the batch
    # total line and are filled in afterwards.
    line_columns = columns[:-3]
    batch_columns = columns[-3:]
    # NOTE: a per batch "count" column (read from 6 lines below the total
    # line) was sketched in an earlier version but is not collected.
    batches = {
        "batch_num": [],
        "payment_date": [],
        "lessor": [],
        "total": [],
    }

    data_extractor = create_line_divider([19, 57, 67, 82, 104])
    # Compile the patterns once instead of on every line
    contract_pattern = re.compile(contract_number_regex)
    bank_number_pattern = re.compile(r"\d{9}")
    batch_total_pattern = re.compile(r"BATCH \d{4} TOTAL")

    for line_index, line in enumerate(lines):
        # Check for a contract number and a bank number in the line
        if contract_pattern.search(line) and bank_number_pattern.search(line):
            for slot_num, column in enumerate(line_columns):
                extracted_data_dict[column].append(data_extractor(slot_num, line))
        # This searches for a statement that looks like a batch number
        # This sums the contracts by their lessor code. A feature requested by cash apps
        if batch_total_pattern.search(line):
            # Batch number is always in characters 96 to 101
            batch_num = line[96:101]
            # Payment date will be 2 lines below that between characters 114 and 125.
            # If the report ends before that line the date is left empty
            # rather than failing with an IndexError.
            payment_line_index = line_index + 2
            if payment_line_index < len(lines):
                payment_date = lines[payment_line_index][114:125]
            else:
                payment_date = ""
            # Lessor is just the first three numbers of the most recent contract number.
            # A batch total with no contract line before it has no lessor.
            contract_numbers = extracted_data_dict["ContractNumber"]
            lessor = contract_numbers[-1][0:3] if contract_numbers else ""
            # Total is a number given by the report for that batch. ',' is removed so that it can be transformed into a float
            total = float(line[107:125].strip().replace(",", ""))

            batches["batch_num"].append(batch_num)
            batches["payment_date"].append(payment_date)
            batches["lessor"].append(lessor)
            batches["total"].append(total)

            # Any time there's a new batch, every contract collected since the
            # previous batch still lacks these three values, so the newest
            # values are added until the columns catch up with the contract count.
            contract_count = len(extracted_data_dict["BankCode"])
            batch_values = zip(batch_columns, (batch_num, lessor, payment_date))
            for column, value in batch_values:
                missing = contract_count - len(extracted_data_dict[column])
                extracted_data_dict[column].extend([value] * missing)

    # Contract lines that come after the last batch total never received batch
    # values. Pad them with None so all lists are equal lengths and the
    # dataframe can be created without losing those contracts.
    contract_count = len(extracted_data_dict["BankCode"])
    for column in batch_columns:
        missing = contract_count - len(extracted_data_dict[column])
        extracted_data_dict[column].extend([None] * missing)

    dataframe = pd.DataFrame(extracted_data_dict)
    # We're creating two sheets: data & summary so we need to open an excel writer
    # This also helps with a bug caused by larger dataframes
    with pd.ExcelWriter(save_name) as writer:
        dataframe.to_excel(writer, index=False, sheet_name="data")
        # The batches dictionary is converted to a dataframe and added as its own sheet
        pd.DataFrame(batches).to_excel(writer, index=False, sheet_name="Summary")
    return dataframe

# Ad-hoc run section: these statements sit at module top level, so they
# execute whenever this file is run or imported.
# Hard-coded absolute paths of two ACH report files.
# NOTE(review): r1 is assigned but not used anywhere in the visible code.
r1 = "/config/workspace/LEAF/IL Extract SRC/ach_errors/2022.05.27_ACH_C"
r2 = "/config/workspace/LEAF/IL Extract SRC/ach_errors/2022.06.03_ACH_C"

# Read the whole report as text. errors="replace" substitutes a replacement
# marker for bytes that can not be decoded instead of raising an error.
with open(r2, errors="replace") as ifile:
    report = ifile.read()

# Parse the report and write the workbook; the save name is a relative path,
# so the file lands in the current working directory.
ach(report, "test_ach_0613.xlsx")