"""Toy credit-decision demo.

Generates synthetic small-business applications, labels them with a
rule-based approval function, plots the result, trains a linear SVM on the
labelled data and lets the user compare model predictions with the rules.
"""
from datetime import datetime as dt
from pprint import pprint as prt
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.svm import LinearSVC

# Applications scoring below this SBSS are always declined.
_MIN_SBSS = 140

# (exclusive SBSS upper bound, minimum years in business) for each score
# band, in ascending order. Scores at or above the last bound are always
# approved.
_APPROVAL_BANDS = (
    (180, 10),
    (200, 8),
    (220, 6),
    (240, 5),
    (260, 3),
    (280, 2),
)


def eval_approval(sbss: int, yib: float) -> int:
    """Return 1 if the application is approved by the rules, else 0.

    Very basic approval algorithm: each SBSS band has its own
    years-in-business (YIB) cut-off. This is why the approval boundary looks
    like a downward staircase, and also why a purely linear model struggles
    around the pivot points.

    Args:
        sbss: Small Business Scoring System score.
        yib: Years in business.

    Returns:
        1 for an approval, 0 for a decline.
    """
    if sbss < _MIN_SBSS:
        return 0
    for upper_bound, min_yib in _APPROVAL_BANDS:
        if sbss < upper_bound:
            return 0 if yib < min_yib else 1
    return 1


def generate_sample_data(data_size: int, save_data: bool = False,
                         save_name: Optional[str] = None) -> DataFrame:
    """Build a random, rule-labelled sample of credit applications.

    Args:
        data_size: Number of applications to generate.
        save_data: When True, also write the sample to a CSV file.
        save_name: Target CSV path. When omitted, a name containing today's
            date (``basic_credit_sample_data_YYYY-MM-DD.csv``) is used.

    Returns:
        A DataFrame indexed by ``BusinessID`` with the columns
        ``YearsInBusiness``, ``SBSS`` and ``Approved``.
    """
    # Small Business Scoring System: normal distribution with a mean of 200
    # and a standard deviation of 50. This can produce 'invalid' scores > 300.
    sbss = np.random.normal(loc=200, scale=50, size=data_size)
    # Years in business: uniformly distributed between 0 and 15 years.
    yib = np.random.uniform(low=0, high=15, size=data_size)
    # Business id: simply the running row number.
    bid = np.arange(data_size)

    df = DataFrame(
        {"BusinessID": bid, "YearsInBusiness": yib, "SBSS": sbss}
    ).set_index("BusinessID")

    # SBSS should be whole numbers.
    df["SBSS"] = df["SBSS"].astype("int32")
    # Round YIB to 2 decimals (easy to look at).
    df["YearsInBusiness"] = df["YearsInBusiness"].round(2)
    # Label every row with the rule-based decision. Iterating the two columns
    # directly avoids building a Series per row as DataFrame.apply would.
    df["Approved"] = [
        eval_approval(score, years)
        for score, years in zip(df["SBSS"], df["YearsInBusiness"])
    ]

    if save_data:
        if save_name is None:
            # %m is the month; %M would have inserted the current minute.
            today = dt.now().strftime('%Y-%m-%d')
            save_name = f"basic_credit_sample_data_{today}.csv"
        df.to_csv(save_name)

    return df


def predict_with_model(model) -> bool:
    """Ask for one application and compare the model with the rules.

    Args:
        model: Fitted classifier whose ``predict`` accepts rows of
            ``[years_in_business, sbss]``.

    Returns:
        False when the user leaves either answer empty (the signal to stop
        prompting), True after a comparison has been printed.

    Raises:
        ValueError: If a non-empty answer is not a valid number.
    """
    # The emptiness check has to happen on the raw text: converting first
    # would raise ValueError on an empty answer instead of ending the session.
    yib_text = input("How many years in business?:\t").strip()
    if yib_text == "":
        return False
    yib = float(yib_text)

    sbss_text = input("What is your SBSS?:\t\t").strip()
    if sbss_text == "":
        return False
    sbss = int(sbss_text)

    # Feature order must match the order used for training: (YIB, SBSS).
    model_pred = bool(model.predict([[yib, sbss]])[0] == 1)
    actual = eval_approval(sbss, yib) == 1

    print(f"The model predicts:\t\t{model_pred}\nThe rules say:\t\t\t{actual}.")
    print(f"The model was {'correct' if model_pred == actual else 'incorrect'}.\n")
    return True


def _save_scatter_plot(sample_data: DataFrame, path: str) -> None:
    """Save a YIB-vs-SBSS scatter plot coloured by decision to ``path``."""
    # Declined deals are red, approved deals are green.
    colors = ['r' if approved == 0 else 'g'
              for approved in sample_data["Approved"]]
    plt.scatter(sample_data["YearsInBusiness"], sample_data["SBSS"], c=colors)
    plt.xlabel("# of Years in Business")
    plt.ylabel("SBSS")
    plt.title("Credit Decision")
    plt.savefig(path)


def _train_model(sample_data: DataFrame) -> LinearSVC:
    """Fit a linear SVM on the (YearsInBusiness, SBSS) feature pairs."""
    # A plain array (rather than a DataFrame) keeps the model free of feature
    # names, so predicting from a bare list later does not trigger warnings.
    features = sample_data[["YearsInBusiness", "SBSS"]].to_numpy()
    classifier = LinearSVC(dual=False, random_state=0, penalty='l2',
                           tol=1e-5, max_iter=1000000)
    return classifier.fit(features, sample_data["Approved"])


def main() -> None:
    """Generate data, plot it, train the model and start the prompt loop."""
    sample_data = generate_sample_data(100000, True, "default_cd_sample.csv")
    # To reuse a previously saved sample instead of regenerating it:
    # sample_data = pd.read_csv("default_cd_sample.csv", index_col="BusinessID")
    prt(sample_data)

    # "Approved" only holds 0/1, so its mean is the approval rate.
    approval_rate = sample_data["Approved"].mean()
    print(f"Approval Rate: {approval_rate:.2%}\n")

    _save_scatter_plot(sample_data, "cd_scatter1.jpg")
    model = _train_model(sample_data)

    # Let people play and see how the model does at prediction; an empty
    # answer ends the session.
    while predict_with_model(model):
        pass


if __name__ == "__main__":
    main()