Working basic 2 feature model

3 years ago · a1d93803d3
commit a1d93803d3
3 changed files with 135 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+*.jpg
+*.png
+*.csv
+/venv
--- a/BasicCreditDec.py
+++ b/BasicCreditDec.py
@ -0,0 +1,114 @@
+from sklearn.svm import LinearSVC
+import numpy as np
+import pandas as pd
+from pandas import DataFrame
+import matplotlib.pyplot as plt
+from pprint import pprint as prt
+from datetime import datetime as dt
+
+
+def eval_approval(sbss: int, yib: float) -> int:
+    """
+    Rule-based credit decision: return 1 to approve, 0 to decline.
+
+    Every SBSS band carries its own minimum Years-in-Business (YIB)
+    requirement, and that requirement relaxes as the score rises.
+    Plotted, the line separating approvals from declines is therefore a
+    descending staircase, which is why the basic LINEAR model struggles
+    around each step.
+
+    Scores below 140 are always declined; scores of 280 and above are
+    always approved.
+    """
+    # Below 140 nothing else matters
+    if sbss < 140:
+        return 0
+    # (exclusive SBSS upper bound of the band, YIB below which we decline)
+    yib_floor_by_band = (
+        (180, 10),
+        (200, 8),
+        (220, 6),
+        (240, 5),
+        (260, 3),
+        (280, 2),
+    )
+    # Bands are listed in ascending order, so the first match is the right one
+    for band_ceiling, yib_floor in yib_floor_by_band:
+        if sbss < band_ceiling:
+            return 0 if yib < yib_floor else 1
+    # 280 and above
+    return 1
+
+
+def generate_sample_data(data_size: int, save_data: bool = False, save_name: str = None) -> DataFrame:
+    """
+    Build a random sample of businesses and label each one with eval_approval.
+
+    The returned frame is indexed by "BusinessID" (0 .. data_size - 1) and has
+    the columns "YearsInBusiness", "SBSS" and "Approved".
+
+    data_size: number of rows to generate
+    save_data: when True, also write the frame to a CSV file
+    save_name: CSV path to write to; when None a name containing today's
+               date (YYYY-MM-DD) is used
+    """
+    # Small Business Scoring System
+    # using a normal dist with a mean of 200 and a std of 50
+    # This can produce 'invalid' SBSS of > 300
+    sbss = np.random.normal(loc=200, scale=50, size=data_size)
+    # Years in Business
+    # uniformly distributed between 0 and 15 years
+    yib = np.random.uniform(low=0, high=15, size=data_size)
+    # Business Id
+    bid = np.arange(data_size)
+
+    data_set = {"BusinessID": bid, "YearsInBusiness": yib, "SBSS": sbss}
+    df = DataFrame(data_set).set_index("BusinessID")
+    # SBSS should be whole numbers
+    df["SBSS"] = df["SBSS"].astype("int32")
+    # Round YIB to 2 decimals (easy to look at)
+    df["YearsInBusiness"] = df["YearsInBusiness"].apply(lambda y: round(y, 2))
+    # Label every row with the rule-based decision.  Walking the two columns
+    # in lockstep avoids building a Series object for each row.
+    df["Approved"] = [
+        eval_approval(score, years)
+        for score, years in zip(df["SBSS"], df["YearsInBusiness"])
+    ]
+    if save_data:
+        if save_name is None:
+            # %m is the month; %M would insert the current minute instead
+            save_name = f"basic_credit_sample_data_{dt.now().strftime('%Y-%m-%d')}.csv"
+        df.to_csv(save_name)
+    return df
+
+
+def predict_with_model(model):
+    """
+    Ask for one business on the console and compare the model with the rules.
+
+    Prompts for Years in Business and SBSS, prints the model's prediction
+    next to the eval_approval result, and says whether the two agree.
+
+    model: fitted classifier whose predict() accepts [[yib, sbss]]
+
+    Returns False when either prompt is answered with a blank line (the
+    caller's signal to stop asking), True after a completed comparison.
+    Raises ValueError when an answer is not blank but is not a valid number.
+    """
+    # Look at the raw text before converting: float("") / int("") raise
+    # ValueError, so a blank answer has to be caught first
+    yib_text = input("How many years in business?:\t").strip()
+    if not yib_text:
+        return False
+    yib = float(yib_text)
+    sbss_text = input("What is your SBSS?:\t").strip()
+    if not sbss_text:
+        return False
+    sbss = int(sbss_text)
+    # predict() is given a single sample; take its one result as a plain bool
+    # so it prints and compares the same way as the rule outcome
+    model_pred = bool(model.predict([[yib, sbss]])[0] == 1)
+    actual = eval_approval(sbss, yib) == 1
+    print(f"The model predicts:\t{model_pred}\nThe rules say:\t{actual}.")
+    print(f"The model was {'correct' if model_pred == actual else 'incorrect'}\n")
+    return True
+
+
+# Generate a fresh labelled sample and save it to default_cd_sample.csv
+sample_data = generate_sample_data(100000, True, "default_cd_sample.csv")
+# To reuse the saved sample instead of regenerating it, swap in:
+#sample_data = pd.read_csv("default_cd_sample.csv", index_col="BusinessID")
+prt(sample_data)
+# Share of rows with Approved == 1.  The '%' format spec scales by 100 and
+# rounds to two decimals, so no float noise (e.g. 57.220000000000006) leaks
+# into the output.
+approval_rate = len(sample_data.query('Approved == 1')) / len(sample_data)
+print(f"Approval Rate: {approval_rate:.2%}\n")
+
+#
+#   Generate a scatter plot
+#
+# Declined deals (Approved == 0) are red, approved deals are green
+colors = ['r' if approved == 0 else 'g' for approved in sample_data["Approved"].to_list()]
+plt.scatter(sample_data["YearsInBusiness"], sample_data["SBSS"], c=colors)
+plt.xlabel("# of Years in Business")
+plt.ylabel("SBSS")
+plt.title("Credit Decision")
+plt.savefig("cd_scatter1.jpg")
+
+# Now we need to combine our features (YIB & SBSS) into a list of pairs
+# The (YIB, SBSS) order must match what predict_with_model passes to predict()
+combined_data = list(zip(sample_data["YearsInBusiness"].to_list(), sample_data["SBSS"].to_list()))
+# Feed that data into the model
+LSVCClf = LinearSVC(dual=False, random_state=0, penalty='l2', tol=1e-5, max_iter=1000000)
+model = LSVCClf.fit(combined_data, sample_data["Approved"])
+
+# Let people play and see how the model does at prediction
+# predict_with_model returns False on a blank answer, which ends the loop
+while predict_with_model(model):
+    pass
--- a/LinearSVC.py
+++ b/LinearSVC.py
@ -0,0 +1,17 @@
+from sklearn.svm import LinearSVC
+from sklearn.datasets import make_classification
+import numpy as np
+from pprint import pprint as prt
+
+# 100 uniform random values; nothing below reads this array
+x2 = np.random.rand(100)
+
+# Synthetic classification data with 5 features and a fixed seed
+X, y = make_classification(n_features=5, random_state=0)
+
+# Show the shape of the feature matrix, the matrix itself, then the label count
+print(X.shape)
+prt(X)
+print('\n')
+print(len(y))
+
+
+# Fit an L1-penalised linear SVM on the generated data
+LSVCClf = LinearSVC(penalty='l1', dual=False, tol=1e-5, random_state=0).fit(X, y)