Add getBestModel function

jzsmoreno · Nov 12, 2024 · e045dcd · e045dcd
1 parent 5a7b3b9
commit e045dcd
Show file tree

Hide file tree

Showing 2 changed files with 130 additions and 5 deletions.
diff --git a/fraud_detection/Soms_FraudDetection.py b/fraud_detection/Soms_FraudDetection.py
@@ -4,10 +4,15 @@
 @author: J. Ivan Avalos
 """
 
+import os
 import pickle
+import sys
+from functools import partial
+from typing import Callable
 
 import numpy as np
 import pandas as pd
+from likelihood import walkers
 from minisom import MiniSom
 from sklearn.preprocessing import MinMaxScaler
 
@@ -119,12 +124,12 @@ def getMetrics(dataset, fraud_id):
         f1_score = 0  # Avoid division by zero
 
     # Output the metrics
-    print("MinSom accuracy : ", accuracy)
-    print("MinSom precision : ", precision)
-    print("MinSom recall : ", recall)
-    print("MinSom F1-score : ", f1_score)
+    # print("MinSom accuracy : ", accuracy)
+    # print("MinSom precision : ", precision)
+    # print("MinSom recall : ", recall)
+    # print("MinSom F1-score : ", f1_score)
 
-    return accuracy
+    return [accuracy, precision, recall, f1_score]
 
 
 def load_model(filepath):
@@ -133,6 +138,121 @@ def load_model(filepath):
     return model
 
 
+def model(x, theta, sc=None, dataset=None):
+    """Build a SOM from ``theta`` and return its detection metrics.
+
+    This is the objective function handed to ``walkers`` (through
+    ``functools.partial``) by ``getBestModel``.
+
+    Args:
+        x: Input data forwarded unchanged to ``somTrained`` and ``getFrauds``.
+        theta: Sequence of six hyper-parameters, read positionally:
+            ``[nx, ny, sigma, learning_rate, num_iterations, dist_int]``.
+            ``nx``, ``ny`` and ``num_iterations`` are rounded to the nearest
+            integer; the absolute value of ``learning_rate`` is used.
+        sc: Forwarded as the last argument of ``getFrauds``
+            (presumably the fitted scaler -- confirm against ``getFrauds``).
+        dataset: Forwarded as the first argument of ``getMetrics``.
+
+    Returns:
+        ``np.ndarray`` with the values returned by ``getMetrics``
+        (``[accuracy, precision, recall, f1_score]``), or four zeros when
+        fraud detection or metric computation raises.
+    """
+    # The optimiser proposes real-valued parameters; grid sizes and the
+    # iteration count have to be integers.
+    nx = int(round(theta[0], 0))
+    ny = int(round(theta[1], 0))
+    sigma = theta[2]
+    learning_rate = abs(theta[3])
+    num_iterations = int(round(theta[4], 0))
+    dist_int = theta[5]
+    som = somTrained(x, nx, ny, sigma, learning_rate, num_iterations)
+    try:
+        fraud_id = getFrauds(som, x, dist_int, sc)
+        metrics = getMetrics(dataset, fraud_id)
+    except Exception:
+        # Score a failing parameter set as the worst possible model instead of
+        # aborting the search. ``Exception`` (not a bare ``except``) keeps
+        # Ctrl-C / SystemExit working during long optimisation runs.
+        return np.array([0.0, 0.0, 0.0, 0.0])
+    return np.array(metrics)
+
+
+def getBestModel(
+    x,
+    model: Callable,
+    iterations: int = 100,
+    num_models: int = 10,
+    sc=None,
+    dataset=None,
+    patience: int = 5,
+) -> tuple:
+    """Search SOM hyper-parameters with ``walkers`` and keep the best model.
+
+    For each of ``num_models`` rounds the function runs ``walkers`` on
+    ``model``, takes the walker with the smallest error, retrains a SOM with
+    those parameters via ``somTrained`` and scores it with ``getFrauds`` /
+    ``getMetrics``. A round counts as an improvement when either the F1-score
+    or the accuracy exceeds the best value recorded so far.
+
+    Args:
+        x: Input data passed to ``walkers``, ``somTrained`` and ``getFrauds``.
+        model: Objective callable; it is invoked by ``walkers`` with ``sc`` and
+            ``dataset`` pre-bound as keyword arguments.
+        iterations: Forwarded positionally to ``walkers`` (8th argument).
+        num_models: Maximum number of search rounds.
+        sc: Forwarded to ``model`` and ``getFrauds``.
+        dataset: Forwarded to ``model`` and ``getMetrics``.
+        patience: Number of consecutive non-improving rounds after which the
+            search stops early.
+
+    Returns:
+        A 3-tuple ``(best_model, mean_performance, best_parameters)``.
+        ``mean_performance`` is the list of metric lists of every round that
+        improved on the previous best (last entry belongs to ``best_model``).
+        ``best_model`` and ``best_parameters`` are ``None`` and
+        ``mean_performance`` is empty when no round produced an improvement.
+    """
+    best_model = None
+    # Initialised up front so the final ``return`` cannot hit an unbound name
+    # when no round improves or the first round fails.
+    best_parameters = None
+    mean_performance = []
+    best_metric_f1 = 0
+    best_metric_acc = 0
+    min_error_so_far = np.inf
+    # Target vector handed to ``walkers``; one entry per metric returned by
+    # ``model``. NOTE(review): 100.0 presumably means "perfect score in
+    # percent" -- confirm against the scale used by ``getMetrics``.
+    y = np.array([100.0, 100.0, 100.0, 100.0])
+    # Initial guess: [nx, ny, sigma, learning_rate, num_iterations, dist_int].
+    theta = np.array([5.0, 5.0, 0.5, 0.01, 50, 0.75])
+    # NOTE(review): appears to be (lower, upper) bound pairs, one pair per
+    # entry of ``theta`` -- confirm against the ``walkers`` documentation.
+    conditions = [
+        2.0,
+        10.0,
+        2.0,
+        10.0,
+        0.01,
+        0.95,
+        0.001,
+        0.95,
+        10.0,
+        100.0,
+        0.1,
+        0.95,
+    ]
+    partial_model = partial(model, sc=sc, dataset=dataset)
+
+    # Number of consecutive rounds without improvement (drives early stopping).
+    no_improvement_counter = 0
+
+    for i in range(num_models):
+        print("model ", i)
+        par, error = walkers(
+            20,
+            x,
+            y,
+            partial_model,
+            theta,
+            conditions,
+            0.05,
+            iterations,
+            0.25,
+            1.0e-3,
+            False,
+            None,
+        )
+        try:
+            # First-axis index of the smallest finite error. Works for lists
+            # as well as arrays and ignores NaN entries.
+            errors = np.asarray(error, dtype=float)
+            n = int(np.unravel_index(np.nanargmin(errors), errors.shape)[0])
+        except (ValueError, TypeError, IndexError):
+            # No usable error values this round: show them and move on rather
+            # than indexing ``par`` with an unbound or stale ``n``.
+            print(error)
+            continue
+        params = par[n]
+        print("min_error_so_far : ", min_error_so_far)
+        candidate = somTrained(
+            x,
+            int(round(params[0], 0)),
+            int(round(params[1], 0)),
+            params[2],
+            abs(params[3]),
+            int(round(params[4], 0)),
+        )
+        try:
+            # Score the SOM that was just trained (not a module-level ``som``).
+            fraud_id = getFrauds(candidate, x, params[5], sc)
+            metrics = getMetrics(dataset, fraud_id)
+        except Exception as exc:
+            print("error in getFrauds")
+            print(repr(exc))
+            break
+
+        # NOTE(review): because the test is an OR, a round with better accuracy
+        # but worse F1 still replaces the best model (and lowers
+        # ``best_metric_f1``). Kept as in the original; confirm this is intended.
+        if (best_metric_f1 < metrics[-1]) or (best_metric_acc < metrics[0]):
+            best_metric_f1 = metrics[-1]
+            best_metric_acc = metrics[0]
+            min_error_so_far = error[n]
+            best_model = candidate
+            best_parameters = params
+            mean_performance.append(metrics)
+            print("MinSom accuracy : ", mean_performance[-1][-4])
+            print("MinSom precision : ", mean_performance[-1][-3])
+            print("MinSom recall : ", mean_performance[-1][-2])
+            print("MinSom F1-score : ", mean_performance[-1][-1])
+
+            # A better model was found: restart the patience window.
+            no_improvement_counter = 0
+        else:
+            no_improvement_counter += 1
+
+        # Early stopping after ``patience`` consecutive non-improving rounds.
+        if no_improvement_counter >= patience:
+            print(
+                f"Early stopping after {no_improvement_counter} iterations without improvement."
+            )
+            break
+
+    return best_model, mean_performance, best_parameters
+
+
 if __name__ == "__main__":
     # Cargar datos
     dataset, features, isFraud = getData()
@@ -148,3 +268,8 @@ def load_model(filepath):
         pickle.dump(som, outfile)
 
     som = load_model(filepath)
+    print("\nSearching for the best model...")
+    best_model, mean_performance, best_parameters = getBestModel(
+        features_transformed, model, num_models=30, sc=sc, dataset=dataset
+    )
+    # ``mean_performance`` only holds rounds that improved on the previous
+    # best, so it can be empty; indexing it unconditionally would raise.
+    if mean_performance:
+        print("Best model MinSom F1-score : ", mean_performance[-1][-1])
+    else:
+        print("No improving model was found; no F1-score to report.")
diff --git a/fraud_detection/som.p b/fraud_detection/som.p