import os
import pandas as pd
from datetime import datetime as dt, timedelta
import sys, getopt
import re
from pathlib import Path
import time

# Matches contract numbers of the form ###-#######-###
contract_number_regex = r"\d{3}-\d{7}-\d{3}"

class ILReport:
    """
    The InfoLease Report class is used to work with the report files.
    It makes it easier to add new reports to the workflow, makes it clearer where
    the reports are coming from, and helps with tracking reports that may not be ready yet.
    """
    def __init__(self, location, extraction_function=None, output_location=None, output_name=None):
        # The location where the InfoLease report is stored
        self.location = location
        # The base name of the file, corresponds to the report type
        # If the output location is not specified, save to the input location
        if output_location is None:
            self.output_location = Path(location).parent.absolute()
        else:
            self.output_location = output_location
        # This is optional but has a default
        if output_name is None:
            # Get the file name of the input and remove the date
            self.output_name = os.path.basename(f"{self.location}")\
                .replace(f"{(dt.now() - timedelta(days=+1)).strftime('%Y.%m.%d')}", "")
        else:
            self.output_name = output_name
        # The function used to extract the data from the report
        self.x_method = extraction_function
        # Tracks whether the data was successfully extracted
        self.successful = False

    def run(self) -> int:
        """
        This method is what actually runs the report. It uses the specified extraction
        function to create and save an Excel document.
        SUCCESS returns 0
        ERROR returns 1
        Failure is also noted by self.successful == False
        """
        try:
            # Open the file and read it to a string | errors='replace' deals with non UTF-8 characters (no effect on output)
            with open(self.location, errors="replace") as ifile:
                report = ifile.read()
        except IOError as ioe:
            print(f"Failed to open file: {self.location}\n{ioe}")
            self.successful = False
            return 1
        try:
            # Run the associated method to extract the data and get the dataframe
            dataframe = self.x_method(report, self.output_location)
            try:
                assert len(dataframe) > 1
            except Exception as e:
                print(f"Data Length Error: {self.output_name} is empty:\n{dataframe}")
                self.successful = False
                return 1
        except Exception as e:
            print(f"{self.output_name} failed to process:\n{e}")
            self.successful = False
            return 1
        # try:
        #     # Save the dataframe as an excel document
        #     dataframe.to_excel(f"{self.output_location}/{self.output_name}_{dt.now().strftime('%Y%m%d-%H%M')}.xlsx", index = False)
        # except Exception as e:
        #     self.successful = False
        #     print(f"{self.output_name} failed to save to excel!\n{dataframe}\n{e}")
        #     return 1
        # self.successful = True
        return 0

    def process(self):
        try:
            # Open the file and read it to a string | errors='replace' deals with non UTF-8 characters (no effect on output)
            with open(self.location, errors="replace") as ifile:
                report = ifile.read()
        except IOError as ioe:
            print(f"Failed to open file: {self.location}\n{ioe}")
            self.successful = False
            return 1
        try:
            # Run the associated method to extract the data and get the dataframe
            dataframe = self.x_method(report, self.output_name)
            try:
                assert len(dataframe) > 1
            except Exception as e:
                print(f"Data Length Error: {self.output_name} is empty:\n{dataframe}")
                self.successful = False
                return 1
        except Exception as e:
            print(f"{self.output_name} failed to process:\n{e}")
            self.successful = False
            return 1
        return dataframe
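
# Illustrative usage of ILReport (a minimal sketch; the file path, the report/extraction
# pairing, and the output name below are assumptions for the sake of the example, not
# part of the workflow):
#
#   ach_report = ILReport(
#       location="/path/to/reports/ACH.2024.01.01.txt",
#       extraction_function=ach,
#       output_name="ach_extract.xlsx",
#   )
#   data = ach_report.process()   # returns the extracted DataFrame, or 1 on failure
#   status = ach_report.run()     # returns 0 on success, 1 on error
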
def create_line_divider(breakage_list: list):
    """
    Allows for the creation of a custom data extractor.
    breakage_list defines the split points that will be used for the line.

    Example
    Given breakage_list [10, 20, 30],
    using slot_num 0 in the resulting extract_line_slot will yield
    characters 0 - 10 from the string.
    Slot 1 would give characters 10 - 20.
    """
    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """
        Pulls data from a line/string using break points defined by the
        parent function.
        ONLY USE THIS FUNCTION THROUGH CREATION USING 'create_line_divider'
        Will automatically convert numbers to floats.
        """
        assert slot_num < len(breakage_list) + 1
        low_range = 0 if slot_num == 0 else breakage_list[slot_num-1]
        high_range = len(line_string) if slot_num == len(breakage_list) else breakage_list[slot_num]
        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            pass
        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data
    return extract_line_slot
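
# Illustrative example of the slot behaviour (a minimal sketch; the sample line and
# break points are made up, not taken from a real report):
#
#   extract = create_line_divider([10, 20])
#   sample = "ABC-123   1,234.56  ACME CORP"
#   extract(0, sample)   # -> "ABC-123"   (characters 0-10, stripped)
#   extract(1, sample)   # -> 1234.56     (characters 10-20, comma removed, cast to float)
#   extract(2, sample)   # -> "ACME CORP" (characters 20 to end of line)
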
######################################################################################################################
#                                                                                                                    #
#            EXTRACTION FUNCTIONS: used to pull data out of specific InfoLease report types                         #
#                                                                                                                    #
######################################################################################################################
r"""
COMMON EXTRACTION COMPONENTS/FEATURES:
- lines = report.splitlines() : splits the report into a list of lines (based on \n line breaks in the document)

- extracted_data_dict : a dictionary that holds the extracted data and is used to create the dataframe

- columns = list(extracted_data_dict.keys()) : breaks the extracted_data_dict into a list of its keys (Excel column headers)

- data_extractor = create_line_divider([#,#,#,#,#]) : creates a function we can use to pull data from a line based on
  its 'slot position'. A slot position is the range of characters between the numbers specified in the list passed
  into the function.

- for line in enumerate(lines) : iterates through each line in the document. line is a tuple of (line number, line string);
  having the line number is very useful when we need to access data in adjacent lines.

- line# = list(zip(columns[#:#], [i for i in range(#,#)])) : creates a list of (column name, slot number) tuples.
  It lets us iterate through the list and make sure the correct data slots are used for each column/key in the
  data dictionary.

COMMON REGEX COMPONENTS
\d  : any digit [0-9]
\D  : any character that is not a digit
\s  : whitespace
.   : any character besides newline (\n)
{#} : exactly # repetitions of the preceding character
*   : 0 or more repetitions of the preceding character
"""
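
# A small worked example of the (column name, slot number) pairing described above
# (illustrative only; the column names are invented):
#
#   columns = ["ContractNumber", "CustomerName", "Payment"]
#   line0 = list(zip(columns[0:3], [i for i in range(1, 4)]))
#   # -> [("ContractNumber", 1), ("CustomerName", 2), ("Payment", 3)]
#   # i.e. ContractNumber is read from slot 1, CustomerName from slot 2, Payment from slot 3
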
def ach(report: str, save_name: str):
    lines = report.splitlines()
    extracted_data_dict = {
        "ContractNumber" : [],
        "CustomerName" : [],
        "BankCode" : [],
        "BankNumber": [],
        "AccountNumber" : [],
        "Payment" : [],
    }
    columns = list(extracted_data_dict.keys())
    data_extractor = create_line_divider([19,57,67,82,104])
    bank_number_regex = r"\d{9}"
    for line in enumerate(lines):
        if (re.search(contract_number_regex, line[1]) is not None) and (re.search(bank_number_regex, line[1]) is not None):
            [extracted_data_dict[columns[c]].append(data_extractor(c, line[1])) for c in range(0, len(columns))]
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe

def disposition(report: str, save_name: str):
    lines = report.splitlines()
    extracted_data_dict = {
        "ContractNumber" : [],
        "Amount Rec" : [],
        "Trans Num" : [],
        "Date RCVD": [],
        "Date Posted" : [],
        "Last Pymt Due" : [],
        "Date Due" : [],
        "Residual Amt" : [],
        "Term Date" : [],
        "Total Pastdue" : [],
        "Customer Name" : [],
    }
    columns = list(extracted_data_dict.keys())
    data_extractor = create_line_divider([15,32,41,51,61,79,88,103,114])
    for line in enumerate(lines):
        if re.search(contract_number_regex, data_extractor(0, line[1])):
            [extracted_data_dict[columns[c]].append(data_extractor(c, line[1])) for c in range(0, len(columns)-1)]
            # The customer name sits on the line directly after the data line
            extracted_data_dict["Customer Name"].append(lines[line[0]+1].strip())
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe

def gainloss(report: str, save_name: str):
    lines = report.splitlines()
    extracted_data_dict = {
        'REM RENT RCVB' : [],
        'GUAR RESIDUAL' : [],
        'ASSET VAL' : [],
        'EQUITY ADDON' : [],
        'CURR INT RCVB' : [],
        'MISC G/L' : [],
        'BLENDED INC' : [],
        'CONTRACT NUMBER' : [],
        'CURR RENT RCVB' : [],
        'RESIDUAL' : [],
        'END/SEC DEP' : [],
        'SALES TAX' : [],
        'INVENT CHANGE' : [],
        'NET RESERVE' : [],
        'LATE CHGS' : [],
        'CUSTOMER NAME' : [],
        'UNEARNED FIN' : [],
        'UNAMORT RES' : [],
        'MISC' : [],
        'MISC TAX' : [],
        'CASH RECEIVED' : [],
        'RCV OFFSET' : [],
        'GAIN/LOSS' : [],
        'DISPOSITION CODE' : [],
        # 'DISPOSITION DESC' is derived from 'DISPOSITION CODE' after extraction,
        # so it is not an extracted key here
        'UNEARNED IDC' : [],
        'UNPAID INT' : [],
        'PENALTY FEE' : [],
        'UNPAID ACCRD' : [],
        'RENEWAL RCVBL' : [],
        'DEF REN INC' : [],
        'DEF REN INT' : [],
        'EARNED IDC' : [],
        'GST BOOK G/L' : [],
        'UNRECOG GST' : [],
        'INT EARNED' : [],
        'OVER/SHORT' : [],
        'OPER RCVB' : [],
        'OPER BASIS' : [],
        'CTD OPER DEPR' : [],
    }
    # L0: BlendedInc 6
    # L1: Late CHGS 14
    # L2: Gain/Loss 22
    # L3: Def Ren Int 30
    # L4: Over/Short 35
    # L5: CTD OPER
    columns = list(extracted_data_dict.keys())
    # These line data are used to tell the data extractor which values to pull for each line of
    # relevant data. They pair dictionary keys with their corresponding data slot in the line
    # so that they can be iterated through during data extraction.
    line0 = list(zip(columns[0:7], [i for i in range(1,8)]))
    line1 = list(zip(columns[7:15], [i for i in range(0,8)]))
    line2 = list(zip(columns[15:23], [i for i in range(0,8)]))
    line3 = list(zip(columns[23:31], [i for i in range(0,8)]))
    line4 = list(zip(columns[31:36], [i for i in range(1,8) if i not in [3,6]]))
    line5 = list(zip(columns[36:], [i for i in range(1,4)]))
    data_extractor = create_line_divider([27,43,58,74,88,105,120])
    for line in enumerate(lines):
        if (re.search(contract_number_regex, data_extractor(0, line[1])) is not None) and \
           (type(data_extractor(1, line[1])) == float):
            data_section = lines[line[0]-1:line[0]+5]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[0])) for c in line0]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[1])) for c in line1]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[2])) for c in line2]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[3])) for c in line3]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[4])) for c in line4]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[5])) for c in line5]

    df = pd.DataFrame(extracted_data_dict)
    # The Accounting team wanted the disposition code split into number and description, so...
    disp_code = []
    disp_description = []
    for d in df['DISPOSITION CODE'].to_list():
        disp_split = d.split(" ")
        disp_code.append(disp_split[0])
        disp_description.append(" ".join(disp_split[1:]))
    df["DISPOSITION CODE"] = disp_code
    df["DISPOSITION DESC"] = disp_description
    df.to_excel(save_name, index=False, engine="xlsxwriter")
    return df

# Works for Net-inv-loans & NIV-after
def net_invest_trial_balance(report: str, save_name: str):
    lines = report.splitlines()
    extracted_data_dict = {
        'CUSTOMER NAME' : [],
        'CURR INT RCVB' : [],
        'UNEARNED BLENDED' : [],
        'BLEND NET INV' : [],
        'LEASE NUMBER' : [],
        'GROSS CONTRACT' : [],
        'CURR RENT RCVB' : [],
        'UNEARN FIN' : [],
        'END DEPOSIT' : [],
        'SEC DEPOSIT' : [],
        'LEASE PYMTS' : [],
        'TOTAL' : [],
        'CONTRACT STAT' : [],
        'PAYMENTS RCVD' : [],
        'REM RENT RCVB' : [],
        'UNEARN RESID' : [],
        'PROV LOSS' : [],
        'NET RESERVE' : [],
        'UNEARN INC' : [],
        'BAL REMAINING' : [],
        'RESIDUAL' : [],
        'UNPAID INT' : [],
        'NET INV' : [],
        'UNEARNED IDC' : [],
    }
    columns = list(extracted_data_dict.keys())
    line0 = list(zip(columns[0:4], [0,3,4,5]))
    line1 = list(zip(columns[4:12], [i for i in range(0,8)]))
    line2 = list(zip(columns[12:19], [i for i in range(0,7)]))
    line3 = list(zip(columns[19:], [i for i in range(1,6)]))

    data_extractor = create_line_divider([18,35,53,67,87,106,117])
    for line in enumerate(lines):
        slot1 = data_extractor(0, line[1], False)
        if type(slot1) != str: continue
        if re.search(contract_number_regex, slot1) is not None:
            data_section = lines[line[0]-1:line[0]+4]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[0])) for c in line0]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[1])) for c in line1]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[2])) for c in line2]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[3])) for c in line3]
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe

def lockbox(report: str, save_name: str):
    lines = report.splitlines()
    extracted_data_dict = {
        "CustomerName" : [],
        "PaymentDate" : [],
        "InvoiceNumber" : [],
        "CheckNumber" : [],
        "InvoicePayment" : [],
        "ContractNumber" : [],
        "ContractPayment" : [],
    }
    # These are lists of the dictionary columns/keys and the data slots in which
    # that data can be found in the report. This way we can iterate through them
    # while extracting data.
    bank_payment_records = [list(extracted_data_dict.keys())[1:5], [1,2,3,4]]
    infolease_payment_records = [list(extracted_data_dict.keys())[5:], [7,8]]

    # Below are the regular expressions used to find relevant data lines
    full_line = r"\d*\s{5}\d{2}/\d{2}/\d{4}\s{4}1"
    contract_only_line = r"\s{90}\d.{7}1\d{2}-"
    cust_name_line = r"\s{98}.{28}\D*"
    # The data extractor allows us to extract data from the report using slots.
    # Slots are ranges of characters denoted by the list fed into the creation function.
    data_extractor = create_line_divider([9,19,39,56,69,90,98,118])
    for line in enumerate(lines):
        # We can skip empty lines
        if len(line[1]) == 0: continue
        # First we should check if there is a full line of data (defined by regex)
        if re.search(full_line, line[1]):
            # If this is true then we can iterate through the lists we created earlier and append the data to our dict
            for k in range(0, len(bank_payment_records[0])):
                extracted_data_dict[bank_payment_records[0][k]].append(data_extractor(bank_payment_records[1][k], line[1]))
            for k in range(0, len(infolease_payment_records[0])):
                extracted_data_dict[infolease_payment_records[0][k]].append(data_extractor(infolease_payment_records[1][k], line[1]))
        # Otherwise we should check if this is a line with only contract data
        elif re.search(contract_only_line, line[1]):
            # If that's the case we can reuse the 'bank payment data' from the previous entry, since it should apply to this contract
            for k in range(0, len(bank_payment_records[0])):
                extracted_data_dict[bank_payment_records[0][k]].append(extracted_data_dict[bank_payment_records[0][k]][-1])
            for k in range(0, len(infolease_payment_records[0])):
                extracted_data_dict[infolease_payment_records[0][k]].append(data_extractor(infolease_payment_records[1][k], line[1]))
        # If it doesn't hit either of these criteria then continue, since it's irrelevant data
        else: continue
        # i tracks how many lines below the current line we're looking for the customer name.
        # Keep moving down a line and checking for a customer name.
        # The customer name typically appears 1 line under the data but can be 13 lines down if cut off by a page end.
        i = 1
        while re.search(cust_name_line, lines[line[0]+i]) is None:
            i += 1
        # Once it hits, add the name to the dict
        extracted_data_dict["CustomerName"].append(data_extractor(7, lines[line[0]+i]))
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe

def minv(report: str, save_name: str):
    lines = report.splitlines()
    data_extractor = create_line_divider([15,32,52,71,83,107,116,128])
    extracted_data_dict = {
        "ContractNumber" : [],
        "UTAB_OIC_DUE" : [],
        "RentalDue" : [],
        "UTAB_OIC_PYMT" : [],
        "ChargeType" : [],
        "OutstandBalance" : [],
        "BizSegment" : [],
        "BookingDate" : [],
        "Branch" : [],
    }
    columns = list(extracted_data_dict.keys())
    for line in enumerate(lines):
        if re.search(contract_number_regex, line[1]) is not None:
            [extracted_data_dict[columns[c]].append(data_extractor(c, line[1], debug=False)) for c in range(0, len(columns))]
    # All the list lengths need to be the same, so if anything was missed the DataFrame will fail to build
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe

# Good for PUB_WIRES, VMCC, PBP_EPAY, returned check
def payment_transactions(report: str, save_name: str):
    lines = report.splitlines()
    data_extractor = create_line_divider([6,33,52,62,80,89,110,121])
    extracted_data_dict = {
        'SEQ' : [],
        'ACCOUNT NUMBER' : [],
        'PYMT METHOD' : [],
        'DATE RCVD' : [],
        'AMOUNT' : [],
        'REF NO': [],
        'PAYMENT MEMO' : [],
        'PYMT TYPE' : [],
        'CHECK NO' : [],
        'CUSTOMER NAME' : [],
        'TRANSACTIONS NUM': [],
        'INV NO' : [],
    }
    columns = list(extracted_data_dict.keys())
    transaction_num_regex = r"\d{8}"
    for line in enumerate(lines):
        slot1 = data_extractor(1, line[1], False)
        if type(slot1) != str: continue
        if re.search(contract_number_regex, slot1) is not None:
            [extracted_data_dict[columns[c]].append(data_extractor(c, line[1])) for c in range(0, len(columns)-3)]
            tnum_match = re.search(transaction_num_regex, lines[line[0]+1])
            if tnum_match:
                tnum = lines[line[0]+1][tnum_match.start():tnum_match.end()]
            else:
                tnum = ""
            extracted_data_dict["TRANSACTIONS NUM"].append(tnum)
            cname = lines[line[0]+1][6:37].strip()
            extracted_data_dict['CUSTOMER NAME'].append(cname)
            inv_no = lines[line[0]+1][79:90].strip()
            extracted_data_dict['INV NO'].append(inv_no)
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe

def renewal_net_invest_trial_balance(report: str, save_name: str):
    lines = report.splitlines()
    data_extractor = create_line_divider([21,29,43,58,71,88,99,113])
    extracted_data_dict = {
        'CUSTOMER NAME' : [],
        'TYPE' : [],
        'GROSS RENEWAL' : [],
        'CUR RENT RCVB' : [],
        'UNEARNED RIN' : [],
        'REMAINING RES' : [],
        'LEASE PYMTS' : [],
        'CONTRACT NUMBER' : [],
        'RENEWAL' : [],
        'PAYMENTS RCVD' : [],
        'REM RENT RCVB' : [],
        'UNPAID RES' : [],
        'SECURITY DEP' : [],
        'NET INVEST' : [],
        'UNEARN INCOME' : [],
        'TOTAL' : [],
        'REMAINING BAL' : [],
        'FINANCED RES' : [],
    }
    columns = list(extracted_data_dict.keys())
    line0 = list(zip(columns[0:7], [0,1,2,3,4,5,7]))
    line1 = list(zip(columns[7:16], [i for i in range(0,9)]))
    line2 = list(zip(columns[16:], [3,4]))

    for line in enumerate(lines):
        slot1 = data_extractor(0, line[1], False)
        if type(slot1) != str: continue
        if re.search(contract_number_regex, slot1) is not None:
            data_section = lines[line[0]-1:line[0]+4]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[0])) for c in line0]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[1])) for c in line1]
            [extracted_data_dict[c[0]].append(data_extractor(c[1], data_section[2])) for c in line2]
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe

def unapplied(report: str, save_name: str):
    lines = report.splitlines()
    extracted_data_dict = {
        "Trans Num" : [],
        "ContractNumber" : [],
        "CheckNum" : [],
        "Date RCVD" : [],
        "Asset ID": [],
        "Reversed Amt" : [],
        "Branch" : [],
        "Unapplied Susp Acct" : [],
        "PaymentMemo" : [],
        "Payers Name" : [],
        "Batch Num" : [],
        "Posting Date" : [],
        "Unapplied Amt" : [],
        "Rev Post Date" : [],
        "Ref Num" : [],
        "Check Amt" : [],
        "Reason Code" : [],
    }
    columns = list(extracted_data_dict.keys())
    # Iterate through the lines one at a time to look for relevant data.
    # Use enumerate so that we know which line we're currently working on;
    # this lets us work with the report structure and grab the remaining
    # fields from the line following the data.
    data_extractor = create_line_divider([9,25,38,50,65,80,89,108])
    trans_num = r"\d{7}"
    for line in enumerate(lines):
        if (re.search(trans_num, str(data_extractor(0, line[1], debug=False))) is not None) and \
           (re.search(r"\d{2}/\d{2}/\d{4}", str(data_extractor(3, line[1], debug=False))) is not None):
            [extracted_data_dict[columns[c]].append(data_extractor(c, line[1])) for c in range(0, 9)]
            [extracted_data_dict[columns[8+c]].append(data_extractor(c, lines[line[0]+1])) for c in range(1, len(columns)-8)]
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False, engine="xlsxwriter")
    return dataframe