commit a1d93803d32eb09b69185afb5cf454dc74a05f12
Author: Griffiths Lott
Date:   Sun Nov 6 15:37:20 2022 -0500

    Working basic 2 feature model

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..65f025d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.jpg
+*.png
+*.csv
+/venv
\ No newline at end of file
diff --git a/BasicCreditDec.py b/BasicCreditDec.py
new file mode 100644
index 0000000..de84bcb
--- /dev/null
+++ b/BasicCreditDec.py
@@ -0,0 +1,133 @@
+from sklearn.svm import LinearSVC
+import numpy as np
+import pandas as pd
+from pandas import DataFrame
+import matplotlib.pyplot as plt
+from pprint import pprint as prt
+from datetime import datetime as dt
+
+
+def eval_approval(sbss: int, yib: float) -> int:
+    """
+    Very basic approval algorithm
+    Essentially there are certain YIB breaks for each SBSS range
+    This is why we see a downward 'step' delimiter between approvals
+    This is also why our basic 'LINEAR' model struggles around the pivot points
+
+    Returns 1 (approved) or 0 (declined).
+    """
+    if sbss < 140:
+        return 0
+    if sbss < 180:
+        if yib < 10:
+            return 0
+        else: return 1
+    elif sbss < 200:
+        if yib < 8:
+            return 0
+        else: return 1
+    elif sbss < 220:
+        if yib < 6:
+            return 0
+        else: return 1
+    elif sbss < 240:
+        if yib < 5:
+            return 0
+        else: return 1
+    elif sbss < 260:
+        if yib < 3:
+            return 0
+        else: return 1
+    elif sbss < 280:
+        if yib < 2:
+            return 0
+        else: return 1
+    else: return 1
+
+
+def generate_sample_data(data_size: int, save_data: bool = False, save_name: str = None) -> DataFrame:
+    """
+    Build a random sample of businesses and label each one with eval_approval
+
+    data_size: number of rows to generate
+    save_data: when True the frame is also written out as a csv
+    save_name: csv path to use; defaults to a name stamped with today's date
+    """
+    # Small Business Scoring System
+    # using a normal dist with a mean of 200 and a std of 50
+    # This can produce 'invalid' SBSS of > 300
+    sbss = np.random.normal(loc= 200,scale= 50, size= data_size)
+    # Years in Business
+    # equal chances of 0->15 years
+    yib = np.random.uniform(low= 0, high= 15, size= data_size)
+    # Business Id
+    bid = np.arange(data_size)
+
+    data_set = {"BusinessID": bid, "YearsInBusiness": yib, "SBSS": sbss}
+    df = DataFrame(data_set).set_index("BusinessID")
+    # SBSS should be whole numbers
+    df["SBSS"] = df["SBSS"].astype("int32")
+    # Round YIB to 2 decimals (easy to look at)
+    df["YearsInBusiness"] = df["YearsInBusiness"].apply(lambda y: round(y,2))
+    # Add an approval column based on the approval function defined earlier
+    df["Approved"] = df.apply(lambda row: eval_approval(row["SBSS"], row["YearsInBusiness"]), axis=1)
+    if save_data:
+        # %m is the month; %M would put the current minute in the file name
+        save_name = save_name if save_name is not None else f"basic_credit_sample_data_{dt.now().strftime('%Y-%m-%d')}.csv"
+        df.to_csv(save_name)
+    return df
+
+
+def predict_with_model(model):
+    """
+    Ask for a YIB and SBSS, then compare the model's call with the rules
+
+    Returns False when either prompt is left blank (the signal to stop asking),
+    True after a comparison has been printed.
+    """
+    # Check for a blank answer BEFORE converting; float("") / int("") raise ValueError
+    yib = input("How many years in business?:\t").strip()
+    if yib == "": return False
+    sbss = input("What is your SBSS?:\t").strip()
+    if sbss == "": return False
+    yib, sbss = float(yib), int(sbss)
+    # predict() hands back an array, take the single prediction out of it
+    model_pred = bool(model.predict([[yib,sbss]])[0] == 1)
+    actual = eval_approval(sbss, yib) == 1
+    print(f"The model predicts:\t{model_pred}\nThe rules say:\t{actual}.")
+    print(f"The model was {'correct' if model_pred == actual else 'incorrect'}\n")
+    return True
+
+
+sample_data = generate_sample_data(100000, True, "default_cd_sample.csv")
+#sample_data = pd.read_csv("default_cd_sample.csv", index_col="BusinessID")
+prt(sample_data)
+print(f"Approval Rate: {len(sample_data.query('Approved == 1'))/len(sample_data):.2%}\n")
+
+#
+# Generate a scatter plot
+#
+colors = []
+for a in sample_data["Approved"].to_list():
+    if a == 0:
+        # Declined deals will be red
+        colors.append('r')
+    else:
+        # Approved deals will be green
+        colors.append('g')
+plt.scatter(sample_data["YearsInBusiness"], sample_data["SBSS"], c=colors)
+plt.plot()
+plt.xlabel("# of Years in Business")
+plt.ylabel("SBSS")
+plt.title("Credit Decision")
+plt.savefig("cd_scatter1.jpg")
+
+# Now we need to combine our features (YIB & SBSS) into a list of pairs
+combined_data = list(zip(sample_data["YearsInBusiness"].to_list(), sample_data["SBSS"].to_list()))
+# Feed that data into the model
+LSVCClf = LinearSVC(dual = False, random_state = 0, penalty = 'l2',tol = 1e-5, max_iter=1000000)
+model = LSVCClf.fit(combined_data ,sample_data["Approved"])
+
+# Let people play and see how the model does at prediction
+while True:
+    if not predict_with_model(model): break
\ No newline at end of file
diff --git a/LinearSVC.py b/LinearSVC.py
new file mode 100644
index 0000000..af1dd5d
--- /dev/null
+++ b/LinearSVC.py
@@ -0,0 +1,17 @@
+from sklearn.svm import LinearSVC
+from sklearn.datasets import make_classification
+import numpy as np
+from pprint import pprint as prt
+
+x2 = np.random.rand(100)
+
+X, y = make_classification(n_features = 5, random_state = 0)
+
+print(X.shape)
+prt(X)
+print('\n')
+print(len(y))
+
+
+LSVCClf = LinearSVC(dual = False, random_state = 0, penalty = 'l1',tol = 1e-5)
+LSVCClf.fit(X, y)