Skip to content

Commit

Permalink
develop (#34)
Browse files Browse the repository at this point in the history
* Create `autoencoders.py`

* Add `AutoClassifier` class

* Update `autoencoders.py`

* Add `concatenate` to `classifier` output

* [FIX] `AutoClassifier` architecture

* Update `tools.py`

* Add normalization to `DataFrameEncoder`

* minor changes

* Update `numeric_tools.py`
* Update `xi_corr` function

* Update `autoencoders.py`

* Add `call_existing_code`
* Add `build_model`
* Add `setup_model`

* [FIX] `autoencoders.py`

* minor changes
* Update `VERSION` and `setup.py`

* Add docstrings

* Update `generate-docs.yml`
  • Loading branch information
jzsmoreno authored Jun 26, 2024
1 parent 5b8a1f6 commit f8e1eb6
Show file tree
Hide file tree
Showing 7 changed files with 294 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/generate-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ jobs:
pip install networkx
pip install pyvis
pip install pdoc3
pip install tensorflow
pip install keras-tuner
- name: Set up Git
env:
Expand Down
2 changes: 1 addition & 1 deletion likelihood/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.12
1.2.13
1 change: 1 addition & 0 deletions likelihood/models/deep/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .autoencoders import *
248 changes: 248 additions & 0 deletions likelihood/models/deep/autoencoders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
import os
from functools import partial

import keras_tuner
import numpy as np
import pandas as pd
import tensorflow as tf
from pandas.core.frame import DataFrame
from tensorflow.keras.models import Model

from likelihood.tools import OneHotEncoder


class AutoClassifier(Model):
    """
    Classifier built on top of an autoencoder.

    The input is passed through an encoder and then a decoder; the decoder output
    and the encoded representation are concatenated and fed to a softmax layer
    that produces the class probabilities.

    Attributes
    ----------
    units : `int`
        The number of neurons in the first encoder layer and first decoder layer.
    shape : `int`
        The value received as `input_shape`.
    encoder : `tf.keras.Sequential`
        Two dense layers with `units` and (roughly) `units / 2` neurons.
    decoder : `tf.keras.Sequential`
        Two dense layers with `units` and `input_shape` neurons.
    classifier : `tf.keras.Sequential`
        A single dense softmax layer with `num_classes` neurons.
    """

    def __init__(self, input_shape, num_classes, units, activation):
        """
        Initializes an AutoClassifier instance with the given parameters.

        Parameters
        ----------
        input_shape : `int`
            The number of input features; also the width of the decoder output.
        num_classes : `int`
            The number of classes in the dataset.
        units : `int`
            The number of neurons in the widest hidden layers.
        activation : `str`
            The activation function used by the encoder and decoder layers.

        Returns
        -------
        `None`
        """
        super().__init__()
        self.units = units
        self.shape = input_shape

        # int(units / 2) evaluates to 0 when units is 1, which is not a valid
        # layer size; keep at least one neuron in the bottleneck.
        bottleneck_units = max(1, int(units / 2))

        self.encoder = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(units=units, activation=activation),
                tf.keras.layers.Dense(units=bottleneck_units, activation=activation),
            ]
        )

        self.decoder = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(units=units, activation=activation),
                tf.keras.layers.Dense(units=input_shape, activation=activation),
            ]
        )

        self.classifier = tf.keras.Sequential(
            [tf.keras.layers.Dense(num_classes, activation="softmax")]
        )

    def call(self, x):
        """
        Runs the forward pass.

        Parameters
        ----------
        x : tensor
            The input batch. It is concatenated along axis 1 below, so it is
            presumably 2-D (batch, features) — TODO confirm against callers.

        Returns
        -------
        tensor
            The output of the softmax classifier layer.
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        # The classifier sees both the reconstruction and the latent code.
        combined = tf.concat([decoded, encoded], axis=1)
        classifier = self.classifier(combined)
        return classifier


def call_existing_code(
    units: int,
    activation: str,
    threshold: float,
    optimizer: str,
    input_shape: None | int = None,
    num_classes: None | int = None,
) -> AutoClassifier:
    """
    Builds a new `AutoClassifier` and compiles it so it is ready to be trained.

    Parameters
    ----------
    units : `int`
        The number of neurons forwarded to `AutoClassifier`.
    activation : `str`
        The activation function forwarded to `AutoClassifier`.
    threshold : `float`
        The threshold passed to the `F1Score` metric.
    optimizer : `str`
        The optimizer passed to `compile`.
    input_shape : `None` | `int`
        The shape of the input data, forwarded to `AutoClassifier`.
    num_classes : `None` | `int`
        The number of classes in the dataset, forwarded to `AutoClassifier`.

    Returns
    -------
    `AutoClassifier`
        The compiled model, using categorical cross-entropy as loss and the
        F1 score as its only metric.
    """
    architecture = dict(
        input_shape=input_shape,
        num_classes=num_classes,
        units=units,
        activation=activation,
    )
    classifier = AutoClassifier(**architecture)

    f1_metric = tf.keras.metrics.F1Score(threshold=threshold)
    classifier.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=[f1_metric],
    )
    return classifier


def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
    """Builds a compiled `AutoClassifier` from a set of tunable hyperparameters.

    The hyperparameters registered on `hp` are `units`, `activation`,
    `optimizer` and `threshold`.

    Parameters
    ----------
    hp : `keras_tuner.HyperParameters`
        The hyperparameters to tune.
    input_shape : `None` | `int`
        The shape of the input data. It is used in arithmetic below, so it must
        be an `int` by the time this function is called.
    num_classes : `None` | `int`
        The number of classes in the dataset.

    Returns
    -------
    `AutoClassifier`
        The compiled model returned by `call_existing_code`.
    """
    # int(input_shape * 0.2) is 0 when input_shape < 5, which would allow the
    # tuner to sample a layer with zero neurons; keep the lower bound at 1.
    min_units = max(1, int(input_shape * 0.2))
    units = hp.Int("units", min_value=min_units, max_value=input_shape, step=2)
    activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
    optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
    threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")

    model = call_existing_code(
        units=units,
        activation=activation,
        threshold=threshold,
        optimizer=optimizer,
        input_shape=input_shape,
        num_classes=num_classes,
    )
    return model


def setup_model(
    data: DataFrame, target: str, epochs: int, train_size: float = 0.7, seed=None, **kwargs
) -> AutoClassifier:
    """Runs a random hyperparameter search and returns the best model found.

    Parameters
    ----------
    data : `DataFrame`
        The dataset to train the model on. Every column except `target` is used
        as a feature and must already be numeric.
    target : `str`
        The name of the target column.
    epochs : `int`
        The number of epochs to train each trial for.
    train_size : `float`
        The proportion of the dataset to use for training; the remainder
        (`1.0 - train_size`) is used as the validation split.
    seed : `Any` | `int`
        The random seed passed to the tuner.

    Keyword Arguments
    -----------------
    max_trials : `int`
        The maximum number of trials to perform. Defaults to 10.
    directory : `str`
        The directory used by the tuner and where the best model is saved.
        Defaults to "./my_dir".
    project_name : `str`
        The name of the project. Defaults to "get_best".
    objective : `str`
        The objective to optimize. Defaults to "val_loss".
    verbose : `bool`
        Whether to print the tuner's results summary. Defaults to `True`.

    Returns
    -------
    model : `AutoClassifier`
        The best model found by the search.

    Raises
    ------
    ValueError
        If the feature columns contain `object` (categorical) columns.
    """
    max_trials = kwargs.get("max_trials", 10)
    directory = kwargs.get("directory", "./my_dir")
    project_name = kwargs.get("project_name", "get_best")
    objective = kwargs.get("objective", "val_loss")
    verbose = kwargs.get("verbose", True)

    X = data.drop(columns=target)
    y = data[target]
    # Categorical columns must be encoded beforehand. An explicit raise is used
    # instead of `assert` so the check is not stripped when running with -O.
    if not X.select_dtypes(include=["object"]).empty:
        raise ValueError(
            "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
        )
    validation_split = 1.0 - train_size
    # Make sure the working directory exists before the tuner and save() use it
    os.makedirs(directory, exist_ok=True)
    model_path = os.path.join(directory, "best_model.keras")

    # One-hot encode the target. The encoded labels are converted directly to a
    # float32 array so that the number of classes is not limited to two.
    y_encoder = OneHotEncoder()
    y = y_encoder.encode(y.to_list())
    y = np.asarray(y).astype(np.float32)
    X = np.asarray(X.to_numpy()).astype(np.float32)

    input_shape = X.shape[1]
    num_classes = y.shape[1]
    # Bind the data-dependent arguments locally rather than rebinding the
    # module-level `build_model`, so repeated calls do not alter global state.
    hypermodel = partial(build_model, input_shape=input_shape, num_classes=num_classes)

    # Create the Keras Tuner random search
    tuner = keras_tuner.RandomSearch(
        hypermodel=hypermodel,
        objective=objective,
        max_trials=max_trials,
        directory=directory,
        project_name=project_name,
        seed=seed,
    )

    tuner.search(X, y, epochs=epochs, validation_split=validation_split)
    best_model = tuner.get_best_models(num_models=1)[0]

    # Save the best model inside the configured directory
    best_model.save(model_path)

    if verbose:
        tuner.results_summary()
    else:
        # NOTE(review): the original reloads the model that was just saved when
        # verbose is disabled; kept as-is — confirm whether this is intended.
        best_model = tf.keras.models.load_model(model_path)

    return best_model


########################################################################################
11 changes: 6 additions & 5 deletions likelihood/tools/numeric_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# -------------------------------------------------------------------------


def xi_corr(df: DataFrame) -> Dict:
def xi_corr(df: DataFrame) -> DataFrame:
"""Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.
Parameters
Expand All @@ -18,8 +18,8 @@ def xi_corr(df: DataFrame) -> Dict:
Returns
-------
`dict`
A dictionary with variable names as keys and their corresponding
`DataFrame`
A dataframe with variable names as keys and their corresponding
correlation coefficients as values.
"""
correlations = {}
Expand All @@ -33,7 +33,8 @@ def xi_corr(df: DataFrame) -> Dict:

correlation = xicor(x, y)
correlations[(col1, col2)] = round(correlation, 8)

# dictionary to dataframe
correlations = DataFrame(list(correlations.items()), columns=["Variables", "Xi Correlation"])
return correlations


Expand Down Expand Up @@ -259,5 +260,5 @@ def gauss_elimination(A: ndarray | list, pr: int = 2) -> ndarray:
print("coefficient for Y = X * X : ", xicor(X, Y))

print("New correlation coefficient test for pandas DataFrame")
values = xi_corr(df)
values_df = xi_corr(df)
breakpoint()
Loading

0 comments on commit f8e1eb6

Please sign in to comment.