Fixed incorrect columns, handled unexpected line gaps

4 years ago · 3baea9331e
parent d690c75399
commit 3baea9331e
2 changed files with 244 additions and 0 deletions
--- a/NI_sum.py
+++ b/NI_sum.py
@@ -0,0 +1,152 @@
 import os
 import pandas as pd
 from datetime import datetime as dt, timedelta
 import sys, getopt
 import re
 from pathlib import Path
 import time
 from pprint import pprint as prt
 import numpy as np
 # Contract numbers have the shape "123-4567890-123": 3 digits, 7 digits, 3 digits.
 # Raw string so that "\d" reaches the regex engine instead of being treated as
 # an (invalid) string escape sequence.
 contract_number_regex = r"\d{3}-\d{7}-\d{3}"
 def create_line_divider(breakage_list: list):
    """
    Build an extractor for fixed-width lines.

    ``breakage_list`` holds the character offsets at which a line is cut, so
    N offsets define N + 1 slots.

    Example
    Given breakage_list [10, 20, 30]:
    slot 0 yields characters 0 - 10 of the string,
    slot 1 yields characters 10 - 20,
    slot 3 (the last one) yields everything from character 30 to the end.

    Returns the ``extract_line_slot`` function bound to ``breakage_list``.
    """
    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """
        Pull one slot out of ``line_string`` using the break points given to
        'create_line_divider'.

        The slot text is stripped of surrounding whitespace and of thousands
        separators (","). If the remaining text parses as a number it is
        returned as a float, otherwise the cleaned string is returned (an
        empty string when the line is too short to reach the slot).

        Raises IndexError when ``slot_num`` is not one of the defined slots.
        """
        slot_count = len(breakage_list) + 1
        # Explicit check instead of `assert`: asserts vanish under `python -O`,
        # and a negative slot would otherwise silently wrap around the list.
        if not 0 <= slot_num < slot_count:
            raise IndexError(
                f"slot_num {slot_num} is out of range; valid slots are 0 to {slot_count - 1}"
            )
        low_range = 0 if slot_num == 0 else breakage_list[slot_num - 1]
        if slot_num == len(breakage_list):
            # The last slot runs to the end of the line.
            high_range = len(line_string)
        else:
            high_range = breakage_list[slot_num]
        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            # Not numeric: keep the cleaned text as-is.
            # NOTE(review): float() also accepts text such as "nan" or "inf",
            # so a slot holding exactly that text comes back as a float.
            pass
        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data
    return extract_line_slot
 def _collect_record_lines(lines: list, index: int) -> list:
    """
    Return the four report lines of the record whose contract-number line is
    ``lines[index]``, skipping over gap lines.

    A line is treated as a gap when it contains no "." at all (every data
    line is expected to carry at least one decimal amount).
    """
    section = lines[index - 1:index + 3]
    # The first line of a record sits just above the contract line; if that
    # line is a gap, take the one above it instead.
    if "." not in section[0]:
        section[0] = lines[index - 2]
    for position, text in enumerate(section):
        if "." in text:
            continue
        # Drop the gap line by moving the following lines up one place, then
        # refill the final position with the next line of the report.
        last = len(section) - 1
        if position < last:
            section[position:last] = section[position + 1:]
        section[3] = lines[index + 3]
    return section


 def net_invest_trial_balance(report: str, save_name: str):
    """
    Parse a fixed-width net investment trial balance report into a DataFrame
    and write it, together with a per-lessor summary, to an Excel workbook.

    A record is located by finding a contract number in the first slot of a
    line; the record's four lines are then cut into slots and stored under
    the column names below. The lessor is the first three characters of the
    lease number.

    Parameters
    report: full text of the report.
    save_name: path of the Excel file to write (sheets "data" and "Summary").

    Returns the DataFrame holding one row per contract.
    """
    lines = report.splitlines()
    extracted_data_dict = {
        'CUSTOMER NAME' :  [],
        'CURR INT RCVB' :  [],
        'UNEARNED BLENDED' :  [],
        'BLEND NET INV' :  [],
        'LEASE NUMBER' :  [],
        'GROSS CONTRACT' :  [],
        'CURR RENT RCVB' :  [],
        'UNEARN FIN' :  [],
        'END DEPOSIT' :  [],
        'SEC DEPOSIT' :  [],
        'LEASE PYMTS' :  [],
        'TOTAL' :  [],
        'CONTRACT STAT' :  [],
        'PAYMENTS RCVD' :  [],
        'REM RENT RCVB' :  [],
        'UNEARN RESID' :  [],
        'PROV LOSS' :  [],
        'NET RESERVE' :  [],
        'UNEARN INC' :  [],
        'BAL REMAINING' :  [],
        'RESIDUAL' :  [],
        'UNPAID INT' :  [],
        'NET INV' :  [],
        'UNEARNED IDC' :  [],
        "LESSOR": []
    }
    lessors = []
    columns = list(extracted_data_dict.keys())
    # Each layout pairs a column name with the slot it is read from, one
    # layout per line of a record. "LESSOR" (the last column) is derived, not
    # read from a slot, hence the [19:-1] slice.
    line0 = list(zip(columns[0:4], [0, 3, 4, 5]))
    line1 = list(zip(columns[4:12], range(0, 8)))
    line2 = list(zip(columns[12:19], range(0, 7)))
    line3 = list(zip(columns[19:-1], range(1, 6)))
    layouts = [line0, line1, line2, line3]
    for layout in layouts:
        print(f"\n{layout}")
    data_extractor = create_line_divider([18, 32, 50, 66, 84, 100, 117])
    for index, text in enumerate(lines):
        slot1 = data_extractor(0, text, False)
        # A purely numeric first slot comes back as a float and cannot be a
        # contract number.
        if not isinstance(slot1, str):
            continue
        if re.search(contract_number_regex, slot1) is None:
            continue
        data_section = _collect_record_lines(lines, index)
        for row, layout in enumerate(layouts):
            for column, slot in layout:
                extracted_data_dict[column].append(
                    data_extractor(slot, data_section[row], False)
                )
        lessor = extracted_data_dict["LEASE NUMBER"][-1][0:3]
        extracted_data_dict["LESSOR"].append(lessor)
        if lessor not in lessors:
            print(lessor)
            lessors.append(lessor)
            print(lessors)
    for c in columns:
        print(f"C: {c} | {len(extracted_data_dict[c])}")
    print(lessors)
    dataframe = pd.DataFrame(extracted_data_dict)
    text_columns = ["CUSTOMER NAME", "LEASE NUMBER", "CONTRACT STAT"]
    summary_series = []
    for lessor in lessors:
        # drop() returns a new frame, so the slice of `dataframe` is never
        # modified in place.
        reduced_df = dataframe.loc[dataframe["LESSOR"] == lessor].drop(columns=text_columns)
        # Blank slots and stray text fragments must not take part in the sums.
        # ("REVOLV" / "ING ACCOUNT" are presumably "REVOLVING ACCOUNT" cut
        # across two slots -- TODO confirm against a report.)
        for placeholder in ("", "REVOLV", "ING ACCOUNT"):
            reduced_df = reduced_df.replace(placeholder, np.nan)
        summation = reduced_df.sum(skipna=True, axis=0)
        summation["LESSOR"] = lessor
        summation["CONTRACT COUNT"] = len(reduced_df.index)
        summary_series.append(summation)
    if summary_series:
        summary_df = pd.concat(summary_series, axis=1).transpose().set_index("LESSOR")
    else:
        # No contract found: pd.concat() refuses an empty list, so build an
        # empty summary with the same columns instead of crashing.
        summary_columns = [
            c for c in columns if c not in text_columns and c != "LESSOR"
        ] + ["CONTRACT COUNT"]
        summary_df = pd.DataFrame(columns=summary_columns).rename_axis("LESSOR")
    prt(summary_df)
    with pd.ExcelWriter(save_name) as writer:
        dataframe.to_excel(writer, index=False, sheet_name="data")
        summary_df.to_excel(writer, index=True, sheet_name="Summary")
    return dataframe
 # Read the whole report as text (undecodable bytes are replaced rather than
 # raising), parse it, write the workbook and pretty-print the resulting frame.
 report = Path("/config/workspace/LEAF/IL Extract SRC/2022.05.20 Net Investment").read_text(errors="replace")
 ni_dataframe = net_invest_trial_balance(report, "520_NI_TEST.xlsx")
 prt(ni_dataframe)
--- a/RenewalTest.py
+++ b/RenewalTest.py
@@ -0,0 +1,92 @@
 import os
 import pandas as pd
 from datetime import datetime as dt, timedelta
 import sys, getopt
 import re
 from pathlib import Path
 import time
 from pprint import pprint as prt
 import numpy as np
 # Contract numbers have the shape "123-4567890-123": 3 digits, 7 digits, 3 digits.
 # Raw string so that "\d" reaches the regex engine instead of being treated as
 # an (invalid) string escape sequence.
 contract_number_regex = r"\d{3}-\d{7}-\d{3}"
 def create_line_divider(breakage_list: list):
    """
    Build an extractor for fixed-width lines.

    ``breakage_list`` holds the character offsets at which a line is cut, so
    N offsets define N + 1 slots.

    Example
    Given breakage_list [10, 20, 30]:
    slot 0 yields characters 0 - 10 of the string,
    slot 1 yields characters 10 - 20,
    slot 3 (the last one) yields everything from character 30 to the end.

    Returns the ``extract_line_slot`` function bound to ``breakage_list``.
    """
    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """
        Pull one slot out of ``line_string`` using the break points given to
        'create_line_divider'.

        The slot text is stripped of surrounding whitespace and of thousands
        separators (","). If the remaining text parses as a number it is
        returned as a float, otherwise the cleaned string is returned (an
        empty string when the line is too short to reach the slot).

        Raises IndexError when ``slot_num`` is not one of the defined slots.
        """
        slot_count = len(breakage_list) + 1
        # Explicit check instead of `assert`: asserts vanish under `python -O`,
        # and a negative slot would otherwise silently wrap around the list.
        if not 0 <= slot_num < slot_count:
            raise IndexError(
                f"slot_num {slot_num} is out of range; valid slots are 0 to {slot_count - 1}"
            )
        low_range = 0 if slot_num == 0 else breakage_list[slot_num - 1]
        if slot_num == len(breakage_list):
            # The last slot runs to the end of the line.
            high_range = len(line_string)
        else:
            high_range = breakage_list[slot_num]
        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            # Not numeric: keep the cleaned text as-is.
            # NOTE(review): float() also accepts text such as "nan" or "inf",
            # so a slot holding exactly that text comes back as a float.
            pass
        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data
    return extract_line_slot
 def renewal_net_invest_trial_balance(report: str, save_name: str):
    """
    Parse a fixed-width renewal net investment report into a DataFrame and
    save it to ``save_name`` as an Excel file.

    A record is found by spotting a contract number in the first slot of a
    line. The record is made of the line above it, the line itself and the
    line below it; each of the three is cut into slots and stored under the
    column names listed here. Every record line is echoed to stdout, and the
    whole record is dumped again whenever one of its lines contains no ".".

    Returns the DataFrame with one row per contract.
    """
    extracted_data_dict = {
        'CUSTOMER NAME' :  [],
        'TYPE' :  [],
        'GROSS RENEWAL' :  [],
        'REMAINING BAL' :  [],
        'FINANCED RES' :  [],
        'REMAINING RES' :  [],
        'LEASE PYMTS' :  [],
        'CONTRACT NUMBER' :  [],
        'RENEWAL' :  [],
        'PAYMENTS RCVD' :  [],
        'CUR RENT RCVB' :  [],
        'UNEARNED RIN' :  [],
        'SECURITY DEP' :  [],
        'NET INVEST' :  [],
        'UNEARN INCOME' :  [],
        'TOTAL' :  [],
        'REM RENT RCVB' :  [],
        'UNPAID RES' :  [],
    }
    column_names = list(extracted_data_dict.keys())
    # (column name, slot number) pairs, one list per line of a record.
    record_layouts = (
        list(zip(column_names[0:7], [0, 1, 2, 3, 4, 5, 7])),
        list(zip(column_names[7:16], range(0, 9))),
        list(zip(column_names[16:], [3, 4])),
    )
    data_extractor = create_line_divider([21, 29, 43, 58, 71, 88, 99, 113])
    lines = report.splitlines()
    for index, text in enumerate(lines):
        slot1 = data_extractor(0, text, False)
        # Numeric first slots come back as floats and cannot hold a contract number.
        if not isinstance(slot1, str):
            continue
        if re.search(contract_number_regex, slot1) is None:
            continue
        data_section = lines[index - 1:index + 2]
        for row_text in data_section:
            print(row_text)
            if "." not in row_text:
                # Diagnostic dump of the record containing a line without amounts.
                for number, entry in enumerate(data_section):
                    print(f"\n{number}: {entry}")
                print('\n')
        for row, layout in enumerate(record_layouts):
            for column, slot in layout:
                extracted_data_dict[column].append(data_extractor(slot, data_section[row]))
    dataframe = pd.DataFrame(extracted_data_dict)
    dataframe.to_excel(save_name, index=False)
    return dataframe
 # Read the whole report as text (undecodable bytes are replaced rather than
 # raising), parse it, write the workbook and pretty-print the resulting frame.
 report = Path("/config/workspace/LEAF/IL Extract SRC/2022.05.20 Renewal Net Investment").read_text(errors="replace")
 renewal_dataframe = renewal_net_invest_trial_balance(report, "rn_TESTING.xlsx")
 prt(renewal_dataframe)