develop (#34)

* Create `autoencoders.py` * Add `AutoClassifier` class * Update `autoencoders.py` * Add `concatenate` to `classifier` output * [FIX] `AutoClassifier` architecture * Update `tools.py` * Add normalization to `DataFrameEncoder` * minor changes * Update `numeric_tools.py` * Update `xi_corr` function * Update `autoencoders.py` * Add `call_existing_code` * Add `build_model` * Add `setup_model` * [FIX] `autoencoders.py` * minor changes * Update `VERSION` and `setup.py` * Add docstrings * Update `generate-docs.yml`
jzsmoreno · Jun 26, 2024 · f8e1eb6 · f8e1eb6
1 parent 5b8a1f6
commit f8e1eb6
Show file tree

Hide file tree

Showing 7 changed files with 294 additions and 14 deletions.
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
@@ -30,6 +30,8 @@ jobs:
         pip install networkx
         pip install pyvis
         pip install pdoc3
+        pip install tensorflow
+        pip install keras-tuner
 
     - name: Set up Git
       env:

diff --git a/likelihood/VERSION b/likelihood/VERSION
@@ -1 +1 @@
-1.2.12
+1.2.13
diff --git a/likelihood/models/deep/__init__.py b/likelihood/models/deep/__init__.py
@@ -0,0 +1 @@
+from .autoencoders import *
diff --git a/likelihood/models/deep/autoencoders.py b/likelihood/models/deep/autoencoders.py
@@ -0,0 +1,248 @@
+import os
+from functools import partial
+
+import keras_tuner
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from pandas.core.frame import DataFrame
+from tensorflow.keras.models import Model
+
+from likelihood.tools import OneHotEncoder
+
+
+class AutoClassifier(Model):
+    """
+    An auto-classifier model that automatically determines the best classification strategy based on the input data.
+
+    Attributes:
+        - input_shape: The shape of the input data.
+        - num_classes: The number of classes in the dataset.
+        - units: The number of neurons in each hidden layer.
+        - activation: The type of activation function to use for the neural network layers.
+
+    Methods:
+        __init__(self, input_shape, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
+    """
+
+    def __init__(self, input_shape, num_classes, units, activation):
+        """
+        Initializes an AutoClassifier instance with the given parameters.
+
+        Parameters
+        ----------
+        input_shape : `int`
+            The shape of the input data.
+        num_classes : `int`
+            The number of classes in the dataset.
+        units : `int`
+            The number of neurons in each hidden layer.
+        activation : `str`
+            The type of activation function to use for the neural network layers.
+
+        Returns
+        -------
+            `None`
+        """
+        super(AutoClassifier, self).__init__()
+        self.units = units
+        self.shape = input_shape
+
+        self.encoder = tf.keras.Sequential(
+            [
+                tf.keras.layers.Dense(units=units, activation=activation),
+                tf.keras.layers.Dense(units=int(units / 2), activation=activation),
+            ]
+        )
+
+        self.decoder = tf.keras.Sequential(
+            [
+                tf.keras.layers.Dense(units=units, activation=activation),
+                tf.keras.layers.Dense(units=input_shape, activation=activation),
+            ]
+        )
+
+        self.classifier = tf.keras.Sequential(
+            [tf.keras.layers.Dense(num_classes, activation="softmax")]
+        )
+
+    def call(self, x):
+        encoded = self.encoder(x)
+        decoded = self.decoder(encoded)
+        combined = tf.concat([decoded, encoded], axis=1)
+        classifier = self.classifier(combined)
+        return classifier
+
+
+def call_existing_code(
+    units: int,
+    activation: str,
+    threshold: float,
+    optimizer: str,
+    input_shape: None | int = None,
+    num_classes: None | int = None,
+) -> AutoClassifier:
+    """
+    Calls an existing AutoClassifier instance.
+
+    Parameters
+    ----------
+    units : `int`
+        The number of neurons in each hidden layer.
+    activation : `str`
+        The type of activation function to use for the neural network layers.
+    threshold : `float`
+        The threshold for the classifier.
+    optimizer : `str`
+        The type of optimizer to use for the neural network layers.
+    input_shape : `None` | `int`
+        The shape of the input data.
+    num_classes : `int`
+        The number of classes in the dataset.
+
+    Returns
+    -------
+    `AutoClassifier`
+        The AutoClassifier instance.
+    """
+    model = AutoClassifier(
+        input_shape=input_shape, num_classes=num_classes, units=units, activation=activation
+    )
+    model.compile(
+        optimizer=optimizer,
+        loss="categorical_crossentropy",
+        metrics=[tf.keras.metrics.F1Score(threshold=threshold)],
+    )
+    return model
+
+
+def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
+    """Builds a neural network model using Keras Tuner's search algorithm.
+
+    Parameters
+    ----------
+    hp : `keras_tuner.HyperParameters`
+        The hyperparameters to tune.
+    input_shape : `None` | `int`
+        The shape of the input data.
+    num_classes : `int`
+        The number of classes in the dataset.
+
+    Returns
+    -------
+    `keras.Model`
+        The neural network model.
+    """
+    units = hp.Int("units", min_value=int(input_shape * 0.2), max_value=input_shape, step=2)
+    activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
+    optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
+    threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
+
+    model = call_existing_code(
+        units=units,
+        activation=activation,
+        threshold=threshold,
+        optimizer=optimizer,
+        input_shape=input_shape,
+        num_classes=num_classes,
+    )
+    return model
+
+
+def setup_model(
+    data: DataFrame, target: str, epochs: int, train_size: float = 0.7, seed=None, **kwargs
+) -> AutoClassifier:
+    """Setup model for training and tuning.
+
+    Parameters
+    ----------
+    data : `DataFrame`
+        The dataset to train the model on.
+    target : `str`
+        The name of the target column.
+    epochs : `int`
+        The number of epochs to train the model for.
+    train_size : `float`
+        The proportion of the dataset to use for training.
+    seed : `Any` | `int`
+        The random seed to use for reproducibility.
+
+    Keyword Arguments:
+    ----------
+    Additional keyword arguments to pass to the model.
+
+    max_trials : `int`
+        The maximum number of trials to perform.
+    directory : `str`
+        The directory to save the model to.
+    project_name : `str`
+        The name of the project.
+    objective : `str`
+        The objective to optimize.
+    verbose : `bool`
+        Whether to print verbose output.
+
+    Returns
+    -------
+    model : `AutoClassifier`
+        The trained model.
+    """
+    max_trials = kwargs["max_trials"] if "max_trials" in kwargs else 10
+    directory = kwargs["directory"] if "directory" in kwargs else "./my_dir"
+    project_name = kwargs["project_name"] if "project_name" in kwargs else "get_best"
+    objective = kwargs["objective"] if "objective" in kwargs else "val_loss"
+    verbose = kwargs["verbose"] if "verbose" in kwargs else True
+
+    X = data.drop(columns=target)
+    y = data[target]
+    # Verify if there are categorical columns in the dataframe
+    assert (
+        X.select_dtypes(include=["object"]).empty == True
+    ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
+    validation_split = 1.0 - train_size
+    # Create my_dir path if it does not exist
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+        # Create a Classifier instance
+        y_encoder = OneHotEncoder()
+        y = y_encoder.encode(y.to_list())
+        X = X.to_numpy()
+        X = np.asarray(X).astype(np.float32)
+
+        y = pd.DataFrame(y, columns=["class_0", "class_1"])
+        y = y.to_numpy()
+        y = np.asarray(y).astype(np.float32)
+
+        input_shape = X.shape[1]
+        num_classes = y.shape[1]
+        global build_model
+        build_model = partial(build_model, input_shape=input_shape, num_classes=num_classes)
+
+        # Create the AutoKeras model
+        tuner = keras_tuner.RandomSearch(
+            hypermodel=build_model,
+            objective=objective,
+            max_trials=max_trials,
+            directory=directory,
+            project_name=project_name,
+            seed=seed,
+        )
+
+        tuner.search(X, y, epochs=epochs, validation_split=validation_split)
+        models = tuner.get_best_models(num_models=2)
+        best_model = models[0]
+
+        # save model
+        best_model.save("./my_dir/best_model.keras")
+
+        if verbose:
+            tuner.results_summary()
+    else:
+        # Load the best model from the directory
+        best_model = tf.keras.models.load_model("./my_dir/best_model.keras")
+
+    return best_model
+
+
+########################################################################################
diff --git a/likelihood/tools/numeric_tools.py b/likelihood/tools/numeric_tools.py
@@ -8,7 +8,7 @@
 # -------------------------------------------------------------------------
 
 
-def xi_corr(df: DataFrame) -> Dict:
+def xi_corr(df: DataFrame) -> DataFrame:
     """Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.
 
     Parameters
@@ -18,8 +18,8 @@ def xi_corr(df: DataFrame) -> Dict:
 
     Returns
     -------
-    `dict`
-        A dictionary with variable names as keys and their corresponding
+    `DataFrame`
+        A dataframe with variable names as keys and their corresponding
         correlation coefficients as values.
     """
     correlations = {}
@@ -33,7 +33,8 @@ def xi_corr(df: DataFrame) -> Dict:
 
                 correlation = xicor(x, y)
                 correlations[(col1, col2)] = round(correlation, 8)
-
+    # dictionary to dataframe
+    correlations = DataFrame(list(correlations.items()), columns=["Variables", "Xi Correlation"])
     return correlations
 
 
@@ -259,5 +260,5 @@ def gauss_elimination(A: ndarray | list, pr: int = 2) -> ndarray:
     print("coefficient for Y = X * X : ", xicor(X, Y))
 
     print("New correlation coefficient test for pandas DataFrame")
-    values = xi_corr(df)
+    values_df = xi_corr(df)
     breakpoint()