CreditDecisionML/BasicCreditDec.py

from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from pprint import pprint as prt
from datetime import datetime as dt


def eval_approval(sbss: int, yib: float) -> int:
    """
    Rule-based approval decision: returns 1 (approved) or 0 (declined).

    Any SBSS below 140 is declined outright. Above that, every SBSS band has
    its own minimum Years-in-Business (YIB) requirement, which relaxes as the
    score rises; an SBSS of 280 or more is approved regardless of YIB.
    These per-band cut-offs produce the downward 'step' boundary between
    approvals and declines, which is also why the basic 'LINEAR' model
    struggles around the pivot points.
    """
    # Hard floor: nothing under 140 is approved, whatever the YIB
    if sbss < 140:
        return 0

    # (exclusive SBSS upper bound, minimum YIB required), in ascending order
    yib_floor_by_band = (
        (180, 10),
        (200, 8),
        (220, 6),
        (240, 5),
        (260, 3),
        (280, 2),
    )
    for sbss_ceiling, min_yib in yib_floor_by_band:
        if sbss < sbss_ceiling:
            return 0 if yib < min_yib else 1

    # SBSS of 280 and above: no YIB requirement
    return 1


def generate_sample_data(data_size: int, save_data: bool = False, save_name: str = None) -> DataFrame:
    """
    Build a random sample of businesses labelled by the rule-based approval.

    Args:
        data_size: Number of businesses (rows) to generate.
        save_data: When True, also write the data set to a CSV file.
        save_name: CSV path to write to. When omitted, a name of the form
            ``basic_credit_sample_data_<YYYY-MM-DD>.csv`` using today's date
            is used.

    Returns:
        A DataFrame indexed by ``BusinessID`` with the columns
        ``YearsInBusiness``, ``SBSS`` and ``Approved`` (0 or 1).
    """
    # Small Business Scoring System
    # Normal distribution with a mean of 200 and a std of 50
    # This can produce 'invalid' SBSS of > 300
    sbss = np.random.normal(loc=200, scale=50, size=data_size)
    # Years in Business
    # Equal chances of 0 -> 15 years
    yib = np.random.uniform(low=0, high=15, size=data_size)
    # Business Id: 0 .. data_size - 1
    bid = np.arange(data_size)

    df = DataFrame(
        {"BusinessID": bid, "YearsInBusiness": yib, "SBSS": sbss}
    ).set_index("BusinessID")
    # SBSS should be whole numbers
    df["SBSS"] = df["SBSS"].astype("int32")
    # Round YIB to 2 decimals (easy to look at)
    df["YearsInBusiness"] = df["YearsInBusiness"].round(2)
    # Label every row with the approval rules defined earlier; pairing the
    # two columns directly avoids building a Series per row.
    df["Approved"] = [
        eval_approval(score, years)
        for score, years in zip(df["SBSS"], df["YearsInBusiness"])
    ]
    if save_data:
        if save_name is None:
            # %m is the month; %M would put the current *minute* in the name
            save_name = f"basic_credit_sample_data_{dt.now().strftime('%Y-%m-%d')}.csv"
        df.to_csv(save_name)
    return df


def predict_with_model(model):
    """
    Ask for one applicant's details and compare the model with the rules.

    Prompts for years in business and SBSS, then prints the model's
    prediction, the rule-based decision, and whether the two agree.

    Args:
        model: A fitted classifier whose ``predict`` accepts ``[[yib, sbss]]``.

    Returns:
        False when either prompt is left blank (the caller should stop
        asking), True otherwise.
    """
    # Read the raw text first: converting before the blank check would raise
    # ValueError on an empty answer, so blank could never end the session.
    yib_text = input("How many years in business?:\t").strip()
    if not yib_text:
        return False
    try:
        yib = float(yib_text)
    except ValueError:
        print("Years in business must be a number.\n")
        return True  # keep the session alive so the user can try again

    sbss_text = input("What is your SBSS?:\t\t").strip()
    if not sbss_text:
        return False
    try:
        sbss = int(sbss_text)
    except ValueError:
        print("SBSS must be a whole number.\n")
        return True

    # Feature order must match the order used when fitting: (YIB, SBSS)
    model_pred = bool(model.predict([[yib, sbss]])[0] == 1)
    actual = eval_approval(sbss, yib) == 1
    print(f"The model predicts:\t\t{model_pred}\nThe rules say:\t\t\t{actual}.")
    print(f"The model was {'correct' if model_pred == actual else 'incorrect'}.\n")
    return True


# Build (and save) the sample data set the model is trained on.
# To reuse a previously saved set instead, load it with
# pd.read_csv("default_cd_sample.csv", index_col="BusinessID").
sample_data = generate_sample_data(100000, True, "default_cd_sample.csv")
prt(sample_data)
# Approved is 0/1, so its mean is the approval rate. The % format spec keeps
# the output at two decimals and avoids float artifacts from `round(...) * 100`.
print(f"Approval Rate: {sample_data['Approved'].mean():.2%}\n")

#
#   Generate a scatter plot
#
# Declined deals (Approved == 0) are red, approved deals are green
colors = ["r" if approved == 0 else "g" for approved in sample_data["Approved"]]
plt.scatter(sample_data["YearsInBusiness"], sample_data["SBSS"], c=colors)
plt.xlabel("# of Years in Business")
plt.ylabel("SBSS")
plt.title("Credit Decision")
plt.savefig("cd_scatter1.jpg")

# Combine our features into (YIB, SBSS) rows; predict_with_model passes
# features in this same order.
combined_data = sample_data[["YearsInBusiness", "SBSS"]].to_numpy()
# Feed that data into the model
LSVCClf = LinearSVC(dual=False, random_state=0, penalty="l2", tol=1e-5, max_iter=1000000)
model = LSVCClf.fit(combined_data, sample_data["Approved"])

# Let people play and see how the model does at prediction;
# predict_with_model returns False when the user wants to stop.
while predict_with_model(model):
    pass