You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
110 lines
5.3 KiB
110 lines
5.3 KiB
import os
|
|
import pandas as pd
|
|
from datetime import datetime as dt, timedelta
|
|
import sys, getopt
|
|
import re
|
|
from pathlib import Path
|
|
import time
|
|
import numpy as np
|
|
|
|
|
|
# Contract numbers look like 123-4567890-123 (3 digits, 7 digits, 3 digits).
# Raw string so that `\d` reaches the regex engine without relying on Python
# passing an invalid string escape through unchanged.
contract_number_regex = r"\d{3}-\d{7}-\d{3}"
|
|
|
|
|
|
def create_line_divider(breakage_list: list):
    """
    Build a custom data extractor for fixed-width lines.

    breakage_list defines the character offsets at which a line is split, so
    N break points give N + 1 slots.

    Example
        Given breakage_list [10, 20, 30]:
        slot 0 of the resulting extract_line_slot yields characters 0 - 10,
        slot 1 yields characters 10 - 20,
        slot 3 yields everything from character 30 to the end of the line.

    Returns the extract_line_slot function bound to breakage_list.
    """
    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """
        Pull one slot out of a line/string using the break points defined by
        the parent function.

        ONLY USE THIS FUNCTION THROUGH CREATION USING 'create_line_divider'

        The slot text is stripped and has every ',' removed. If the result
        parses as a number it is returned as a float, otherwise the cleaned
        string is returned. With debug=True the slot bounds and value are
        printed.

        Raises IndexError when slot_num is negative or greater than
        len(breakage_list).
        """
        slot_count = len(breakage_list) + 1
        # Explicit check rather than `assert` (asserts are stripped under -O).
        # Negative slots are rejected as well: they would otherwise index
        # breakage_list from the end and silently return the wrong columns.
        if not 0 <= slot_num < slot_count:
            raise IndexError(
                f"slot_num {slot_num} is out of range for {slot_count} slots"
            )

        low_range = 0 if slot_num == 0 else breakage_list[slot_num - 1]
        # The last slot has no closing break point, it runs to the end of the line
        if slot_num == len(breakage_list):
            high_range = len(line_string)
        else:
            high_range = breakage_list[slot_num]

        # In order to create a float we need to remove the , from the string
        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            # Not a number: keep the cleaned text as is
            pass

        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data

    return extract_line_slot
|
|
|
|
def ach(report: str, save_name: str):
    """
    Parse a fixed-width ACH report and export it to an Excel workbook.

    A line that contains both a contract number and a nine digit bank number
    is treated as a contract line and split into columns. A line matching
    "BATCH nnnn TOTAL" closes a batch: its batch number, lessor and payment
    date are stamped onto every contract read since the previous batch total.

    Parameters
        report: full text of the report.
        save_name: path of the Excel file to write. It receives a "data"
            sheet (one row per contract) and a "Summary" sheet (one row per
            batch).

    Returns the per-contract pandas DataFrame that was written to "data".
    """
    lines = report.splitlines()
    extracted_data_dict = {
        "ContractNumber": [],
        "CustomerName": [],
        "BankCode": [],
        "BankNumber": [],
        "AccountNumber": [],
        "Payment": [],
        "Batch": [],
        "Lessor": [],
        "PaymentDate": [],
    }
    columns = list(extracted_data_dict)
    # Only the leading columns are read from the contract line itself. Their
    # order matters: column i is filled from slot i of the line. The last
    # three columns are filled in later from the batch total line.
    batch_columns = ("Batch", "Lessor", "PaymentDate")
    slot_columns = columns[:-len(batch_columns)]
    batches = {
        "batch_num": [],
        "payment_date": [],
        "lessor": [],
        "total": [],
    }

    # NOTE(review): the extractor returns a float for any field that parses
    # as a number, so all-digit BankNumber / AccountNumber values lose their
    # leading zeros - confirm this is intended.
    data_extractor = create_line_divider([19, 57, 67, 82, 104])

    # Compiled once, outside the per-line loop
    contract_number_pattern = re.compile(contract_number_regex)
    bank_number_pattern = re.compile(r"\d{9}")
    batch_num_pattern = re.compile(r"BATCH \d{4} TOTAL")

    for index, line in enumerate(lines):
        # Check for a contract number and a bank number in the line
        is_contract_line = (
            contract_number_pattern.search(line) is not None
            and bank_number_pattern.search(line) is not None
        )
        if is_contract_line:
            for slot_num, column in enumerate(slot_columns):
                extracted_data_dict[column].append(data_extractor(slot_num, line))

        # This searches for a statement that looks like a batch number.
        # Batches sum the contracts by their lessor code, a feature requested
        # by cash apps.
        if batch_num_pattern.search(line) is None:
            continue

        # Batch number is always in characters 96 to 101
        batch_num = line[96:101]
        # Payment date is 2 lines below, between characters 114 and 125.
        # A report that ends right after the total line yields an empty date
        # instead of an IndexError.
        payment_line = lines[index + 2] if index + 2 < len(lines) else ""
        payment_date = payment_line[114:125]
        # Lessor is just the first three characters of the most recent
        # contract number; None if no contract has been read yet.
        contract_numbers = extracted_data_dict["ContractNumber"]
        lessor = contract_numbers[-1][0:3] if contract_numbers else None
        # Total is a number given by the report for that batch. ',' is
        # removed so that it can be transformed into a float.
        total = float(line[107:125].strip().replace(",", ""))

        batches["batch_num"].append(batch_num)
        batches["payment_date"].append(payment_date)
        batches["lessor"].append(lessor)
        batches["total"].append(total)

        # Every contract read since the previous batch total belongs to this
        # batch, so back-fill the batch columns up to the current row count.
        row_count = len(extracted_data_dict["BankCode"])
        batch_values = zip(batch_columns, (batch_num, lessor, payment_date))
        for column, value in batch_values:
            missing = row_count - len(extracted_data_dict[column])
            extracted_data_dict[column].extend([value] * missing)

    # Contracts that follow the last batch total never received batch data.
    # Pad them with None so all columns have equal length and the DataFrame
    # can still be built.
    row_count = len(extracted_data_dict["BankCode"])
    for column in batch_columns:
        missing = row_count - len(extracted_data_dict[column])
        extracted_data_dict[column].extend([None] * missing)

    dataframe = pd.DataFrame(extracted_data_dict)

    # We're creating two sheets: data & Summary, so we need an Excel writer.
    # This also helps with a bug caused by larger dataframes.
    with pd.ExcelWriter(save_name) as writer:
        dataframe.to_excel(writer, index=False, sheet_name="data")
        # The batches dictionary is converted to a dataframe and added as its own sheet
        pd.DataFrame(batches).to_excel(writer, index=False, sheet_name="Summary")
    return dataframe
|
|
|
|
# Paths of two ACH report dumps. Only r2 is read in this section.
r1 = "/config/workspace/LEAF/IL Extract SRC/ach_errors/2022.05.27_ACH_C"
r2 = "/config/workspace/LEAF/IL Extract SRC/ach_errors/2022.06.03_ACH_C"

# Load the whole report as text; bytes that cannot be decoded are replaced
# instead of raising.
report = Path(r2).read_text(errors="replace")

# Parse the report and write the workbook.
ach(report, "test_ach_0613.xlsx")