# Source: CreditDecisionML/BasicCreditDec.py (114 lines, 3.9 KiB)

from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from pprint import pprint as prt
from datetime import datetime as dt
def eval_approval(sbss: int, yib: float) -> int:
    """
    Rule-based credit decision: returns 1 for approved, 0 for declined.

    Every SBSS band has its own minimum years-in-business (YIB) requirement,
    and that requirement relaxes as the score goes up. This is what produces
    the downward 'staircase' boundary between approvals and declines, and it
    is also why a plain linear model struggles close to the band edges.
    """
    # Scores under 140 are declined no matter how long the business has run.
    if sbss < 140:
        return 0

    # (exclusive SBSS upper bound, minimum YIB required inside that band),
    # in ascending score order so the first band that fits decides.
    yib_floor_by_band = (
        (180, 10),
        (200, 8),
        (220, 6),
        (240, 5),
        (260, 3),
        (280, 2),
    )
    for sbss_ceiling, yib_floor in yib_floor_by_band:
        if sbss < sbss_ceiling:
            return 0 if yib < yib_floor else 1

    # 280 and above: approved regardless of YIB.
    return 1
def generate_sample_data(data_size: int, save_data: bool = False, save_name: str = None) -> DataFrame:
    """
    Build a synthetic, labelled credit-application data set.

    The frame is indexed by ``BusinessID`` (0 .. data_size - 1) and has:
      * ``YearsInBusiness`` - uniform draw between 0 and 15, rounded to
        2 decimals.
      * ``SBSS`` - normal draw (mean 200, std 50) cast to int32.
      * ``Approved`` - 0/1 label produced by ``eval_approval``.

    Args:
        data_size: Number of rows to generate.
        save_data: When true, the frame is also written to CSV.
        save_name: CSV path to write. When omitted, a dated default of the
            form ``basic_credit_sample_data_YYYY-MM-DD.csv`` is used.

    Returns:
        The generated DataFrame.
    """
    # Small Business Scoring System score.
    # Normal distribution with a mean of 200 and a std of 50; nothing clamps
    # the tails, so 'invalid' scores above 300 can appear.
    sbss = np.random.normal(loc=200, scale=50, size=data_size)
    # Years in Business: equal chance of anything from 0 up to 15 years.
    yib = np.random.uniform(low=0, high=15, size=data_size)
    # Business Id: simple running number.
    bid = np.arange(data_size)

    data_set = {"BusinessID": bid, "YearsInBusiness": yib, "SBSS": sbss}
    df = DataFrame(data_set).set_index("BusinessID")
    # SBSS should be whole numbers.
    df["SBSS"] = df["SBSS"].astype("int32")
    # Round YIB to 2 decimals (easy to look at).
    df["YearsInBusiness"] = df["YearsInBusiness"].round(2)
    # Label every row with the rule-based decision. Walking the two columns
    # directly avoids building a Series per row and also works for an empty
    # frame.
    df["Approved"] = [
        eval_approval(score, years)
        for score, years in zip(df["SBSS"], df["YearsInBusiness"])
    ]

    if save_data:
        if save_name is None:
            # %m is the month; %M would put the current *minute* in the name.
            save_name = f"basic_credit_sample_data_{dt.now().strftime('%Y-%m-%d')}.csv"
        df.to_csv(save_name)
    return df
def predict_with_model(model):
    """
    Prompt for one applicant and compare the model's call with the rules.

    Args:
        model: Fitted classifier whose ``predict`` accepts rows shaped
            ``[years_in_business, sbss]``.

    Returns:
        False if either prompt is answered with a blank line (the caller's
        signal to stop asking), True once a comparison has been printed.

    Raises:
        ValueError: If a non-blank answer is not a valid number.
    """
    # Check for a blank answer BEFORE converting: float("") and int("") raise
    # ValueError, so comparing the converted value to "" can never match.
    yib_text = input("How many years in business?:\t").strip()
    if yib_text == "":
        return False
    yib = float(yib_text)

    sbss_text = input("What is your SBSS?:\t\t").strip()
    if sbss_text == "":
        return False
    sbss = int(sbss_text)

    # Feature order must match the order used when the model was fitted.
    model_pred = model.predict([[yib, sbss]])[0] == 1
    actual = eval_approval(sbss, yib) == 1
    print(f"The model predicts:\t\t{model_pred}\nThe rules say:\t\t\t{actual}.")
    print(f"The model was {'correct' if model_pred == actual else 'incorrect'}.\n")
    return True
# Build a fresh labelled sample; it is also written to CSV so the same data
# can be reloaded later instead of being regenerated.
sample_data = generate_sample_data(100000, True, "default_cd_sample.csv")
#sample_data = pd.read_csv("default_cd_sample.csv", index_col="BusinessID")
prt(sample_data)
# "Approved" holds 0/1, so its mean is the approved fraction. The percent
# format avoids float artefacts such as 61.480000000000004%.
approval_rate = sample_data["Approved"].mean()
print(f"Approval Rate: {approval_rate:.2%}\n")

#
# Generate a scatter plot
#
# Declined deals (0) are drawn red, approved deals (1) are drawn green.
colors = ['r' if approved == 0 else 'g' for approved in sample_data["Approved"]]
plt.scatter(sample_data["YearsInBusiness"], sample_data["SBSS"], c=colors)
plt.xlabel("# of Years in Business")
plt.ylabel("SBSS")
plt.title("Credit Decision")
plt.savefig("cd_scatter1.jpg")

# Now we need to combine our features (YIB & SBSS) into a list of pairs.
# The (YIB, SBSS) order here is the order predict_with_model must use too.
combined_data = list(zip(sample_data["YearsInBusiness"].to_list(), sample_data["SBSS"].to_list()))
# Feed that data into the model
LSVCClf = LinearSVC(dual=False, random_state=0, penalty='l2', tol=1e-5, max_iter=1000000)
model = LSVCClf.fit(combined_data, sample_data["Approved"])

# Let people play and see how the model does at prediction; a blank answer
# makes predict_with_model return False and ends the session.
while predict_with_model(model):
    pass