"""Extract contract rows from a fixed-width text report and filter them.

When run as a script, the filtered contract numbers are compared against the
reference list in ``current_output`` and the differences are printed.
"""

import os
import re
import time
from datetime import datetime as dt, timedelta
from pathlib import Path
from pprint import pprint as prt

import numpy as np
import pandas as pd

# A line is treated as a data row when this pattern is found anywhere in it
# (three digits, seven digits, three digits, e.g. "100-1011756-004").
contract_number_regex = r"\d{3}-\d{7}-\d{3}"

# Character offsets at which a report line is cut into slots.
_BREAK_POINTS = (15, 32, 52, 71, 83, 107, 116, 128)

# Output column names, in the same left-to-right order as the slots.
# Eight break points yield nine slots, one per column.
_COLUMNS = (
    "ContractNumber",
    "UTAB_OIC_DUE",
    "RentalDue",
    "UTAB_OIC_PYMT",
    "ChargeType",
    "OutstandBalance",
    "BizSegment",
    "BookingDate",
    "Branch",
)

# Reference contract numbers the script's result is compared against.
current_output = [
    '100-1011756-004',
    '100-1354567-002',
    '100-1637209-005',
    '100-1665517-003',
    '100-1670517-003',
    '100-2081987-008',
    '100-2139037-002',
    '100-2446458-002',
    '100-2453558-003',
    '100-2611389-007',
    '100-3492758-003',
    '100-3500858-001',
    '100-3694757-001',
    '100-3725849-003',
    '100-3876959-007',
    '100-3910629-001',
    '100-3964329-001',
    '100-4462739-001',
    '100-4850431-001',
    '100-4945021-001',
    '100-5382471-001',
    '100-6738611-001',
    '100-6849836-001',
    '100-7037791-001',
    '100-7045691-001',
    '100-7052571-001',
    '100-7059671-001',
    '100-7087121-001',
    '100-7107941-001',
    '100-7146771-001',
    '100-7156851-001',
    '100-7178461-001',
    '100-7203371-001',
    '100-7219911-001',
    '100-7232561-001',
    '100-7237601-001',
    '100-7242461-001',
    '100-9660710-001',
    '100-9723689-001',
]


def pfd(df: pd.DataFrame) -> None:
    """Print *df* with the row and column display limits lifted."""
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df)


def create_line_divider(breakage_list: list):
    """Build an extractor that slices a string at fixed break points.

    ``breakage_list`` holds the split offsets, which define
    ``len(breakage_list) + 1`` slots: slot 0 runs from the start of the string
    to the first offset, slot ``i`` runs between offsets ``i - 1`` and ``i``,
    and the last slot runs from the final offset to the end of the string.

    Example: with ``breakage_list`` ``[10, 20, 30]``, slot 0 yields characters
    0 - 10 of the string and slot 1 yields characters 10 - 20.

    Returns:
        The ``extract_line_slot(slot_num, line_string, debug=False)`` closure.
    """

    def extract_line_slot(slot_num: int, line_string: str, debug: bool = False):
        """Return the content of slot ``slot_num`` of ``line_string``.

        ONLY USE THIS FUNCTION THROUGH CREATION USING 'create_line_divider'.

        The slice is stripped of surrounding whitespace and all commas are
        removed. If what remains parses as a float, the float is returned;
        otherwise the cleaned string is returned.

        Args:
            slot_num: Zero-based slot index, at most ``len(breakage_list)``.
            line_string: The line to slice.
            debug: When true, print the slot bounds and the extracted value.

        Raises:
            IndexError: If ``slot_num`` is negative or beyond the last slot.
        """
        last_slot = len(breakage_list)
        # Explicit check rather than `assert`, which is stripped under -O.
        if not 0 <= slot_num <= last_slot:
            raise IndexError(
                f"slot_num {slot_num} out of range; valid slots are 0..{last_slot}"
            )

        low_range = 0 if slot_num == 0 else breakage_list[slot_num - 1]
        high_range = len(line_string) if slot_num == last_slot else breakage_list[slot_num]

        # Thousands separators must go before the text can parse as a float.
        data = line_string[low_range:high_range].strip().replace(",", "")
        try:
            data = float(data)
        except ValueError:
            # Not numeric: keep the cleaned string.
            pass

        if debug:
            print(f"Slot num: {slot_num} | Low: {low_range} | High: {high_range} | Data: {data}")
        return data

    return extract_line_slot


def _parse_report(report: str) -> pd.DataFrame:
    """Collect every line containing a contract number into a DataFrame."""
    data_extractor = create_line_divider(list(_BREAK_POINTS))
    contract_pattern = re.compile(contract_number_regex)

    extracted_data_dict = {name: [] for name in _COLUMNS}
    for line in report.splitlines():
        if contract_pattern.search(line) is None:
            continue
        # Every column gets exactly one value per row, so all lists stay the
        # same length and the DataFrame can be built.
        for slot, name in enumerate(_COLUMNS):
            extracted_data_dict[name].append(data_extractor(slot, line, debug=False))

    return pd.DataFrame(extracted_data_dict)


def minv(
    report: str,
    save_name: str,
    booking_date: str = '04/26/2022',
    min_outstanding: float = 100,
):
    """Parse ``report``, keep the rows of interest and save them to Excel.

    A row is kept when its ``RentalDue`` is 0 and either its ``BookingDate``
    differs from ``booking_date`` or its ``OutstandBalance`` exceeds
    ``min_outstanding``.

    Args:
        report: Full text of the report.
        save_name: Path the filtered rows are written to via ``to_excel``.
        booking_date: Booking date to compare against, as text in the form
            used by the report column. The default has the shape produced by
            ``dt.today().strftime("%m/%d/%Y")``.
        min_outstanding: Outstanding-balance threshold (exclusive).

    Returns:
        The filtered DataFrame.
    """
    dataframe = _parse_report(report)

    # (bookdate != booking_date & rent = 0) OR (outstanding > threshold & rent = 0)
    no_rent_due = dataframe["RentalDue"] == 0
    filtered = dataframe[
        ((dataframe["BookingDate"] != booking_date) & no_rent_due)
        | (no_rent_due & (dataframe["OutstandBalance"] > min_outstanding))
    ]
    filtered.to_excel(save_name, index=False)
    return filtered


def main() -> None:
    """Run the report through ``minv`` and print a comparison to ``current_output``."""
    with open("2022.05.04_MINV_C", errors="replace") as ifile:
        report = ifile.read()

    fin_df = minv(report, "man_inv_test.xlsx")
    pfd(fin_df)
    il_contracts = fin_df.ContractNumber.to_list()
    prt(il_contracts)

    # Sets give O(1) membership tests; the lists keep the original ordering.
    expected = set(current_output)
    produced = set(il_contracts)
    extra_contracts = [c for c in il_contracts if c not in expected]
    not_included = [c for c in current_output if c not in produced]

    print("\nExtra Contracts:")
    prt(extra_contracts)
    print("Not Included Contracts:")
    prt(not_included)
    print(f"MATCHING CONTRACTS: {il_contracts == current_output}")
    print(f"Current # contract {len(current_output)} | ILE Processed Contracts: {len(il_contracts)}")
    print(f"# Extra contracts included: {len(extra_contracts)} | # Contracts not included: {len(not_included)}")


if __name__ == "__main__":
    main()