From 8b6bf04fcc15dd293743eebbb4f9d813ffb2dde4 Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 10:25:29 -0500 Subject: [PATCH 1/9] model cache for faster evaluation --- dev-requirements.txt | 2 +- pytest.ini | 2 ++ sklearn_genetic/genetic_search.py | 44 +++++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 pytest.ini diff --git a/dev-requirements.txt b/dev-requirements.txt index 53fced5..a9e964e 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,4 +1,4 @@ -scikit-learn>=1.1.0 +scikit-learn>=1.3.0 deap>=1.3.3 numpy>=1.19.0 pytest==7.4.0 diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..11d9f4c --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --verbose --color=yes --assert=plain --cov-fail-under=95 --cov-config=.coveragerc --cov=./ -p no:warnings --tb=short --cov-report=term-missing:skip-covered diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py index 2ea6e14..e2f8da8 100644 --- a/sklearn_genetic/genetic_search.py +++ b/sklearn_genetic/genetic_search.py @@ -259,6 +259,7 @@ def __init__( self.return_train_score = return_train_score self.creator = creator self.log_config = log_config + self.fitness_cache = {} # Check that the estimator is compatible with scikit-learn if not is_classifier(self.estimator) and not is_regressor(self.estimator): @@ -392,6 +393,17 @@ def evaluate(self, individual): key: individual[n] for n, key in enumerate(self.space.parameters) } + # Convert hyperparameters to a tuple to use as a key in the cache + individual_key = tuple(sorted(current_generation_params.items())) + + # Check if the individual has already been evaluated + if individual_key in self.fitness_cache: + # Retrieve cached result + cached_result = self.fitness_cache[individual_key] + # Ensure the logbook is updated even if the individual is cached + 
self.logbook.record(parameters=cached_result["current_generation_params"]) + return cached_result["fitness"] + local_estimator = clone(self.estimator) local_estimator.set_params(**current_generation_params) @@ -437,7 +449,15 @@ def evaluate(self, individual): # Log the hyperparameters and the cv-score self.logbook.record(parameters=current_generation_params) - return [score] + fitness_result = [score] + + # Store the fitness result and the current generation parameters in the cache + self.fitness_cache[individual_key] = { + "fitness": fitness_result, + "current_generation_params": current_generation_params + } + + return fitness_result def fit(self, X, y, callbacks=None): """ @@ -880,6 +900,7 @@ def __init__( self.return_train_score = return_train_score self.creator = creator self.log_config = log_config + self.fitness_cache = {} # Check that the estimator is compatible with scikit-learn if not is_classifier(self.estimator) and not is_regressor(self.estimator): @@ -965,6 +986,16 @@ def evaluate(self, individual): local_estimator = clone(self.estimator) n_selected_features = np.sum(individual) + # Convert the individual to a tuple to use as a key in the cache + individual_key = tuple(individual) + + # Check if the individual has already been evaluated + if individual_key in self.fitness_cache: + cached_result = self.fitness_cache[individual_key] + # Ensure the logbook is updated even if the individual is cached + self.logbook.record(parameters=cached_result["current_generation_features"]) + return cached_result["fitness"] + # Compute the cv-metrics using only the selected features cv_results = cross_validate( local_estimator, @@ -1014,7 +1045,16 @@ def evaluate(self, individual): ): score = -self.criteria_sign * 100000 - return [score, n_selected_features] + # Prepare the fitness result + fitness_result = [score, n_selected_features] + + # Store the fitness result and the current generation features in the cache + self.fitness_cache[individual_key] = { + "fitness": 
fitness_result, + "current_generation_features": current_generation_features + } + + return fitness_result def fit(self, X, y, callbacks=None): """ From 741ef000a5b178d341021b7386888178329dc172 Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 14:44:26 -0500 Subject: [PATCH 2/9] model cache for faster evaluation --- docs/index.rst | 2 +- docs/release_notes.rst | 11 +++ setup.py | 2 +- sklearn_genetic/_version.py | 2 +- sklearn_genetic/genetic_search.py | 124 +++++++++++++++++------------- 5 files changed, 83 insertions(+), 58 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 10c0edb..b730171 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,7 +27,7 @@ inside the env use:: pip install sklearn-genetic-opt .. |PythonMinVersion| replace:: 3.8 -.. |ScikitLearnMinVersion| replace:: 1.1.0 +.. |ScikitLearnMinVersion| replace:: 1.3.0 .. |NumPyMinVersion| replace:: 1.19.0 .. |SeabornMinVersion| replace:: 0.11.2 .. 
|DEAPMinVersion| replace:: 1.3.3 diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 886a097..3261d2e 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -3,6 +3,17 @@ Release Notes Some notes on new features in various releases +What's new in 0.11.0dev0 +------------------------ + +^^^^^^^^^ +Features: +^^^^^^^^^ + +* Added a parameter named `use_cache`, defaults to `True`, If set to true it will avoid to re-evaluating solutions that have already seen, + otherwise it will always evaluate the solutions to get the performance metrics + + What's new in 0.10.1 -------------------- diff --git a/setup.py b/setup.py index 9a438a2..e75729e 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ include=["sklearn_genetic", "sklearn_genetic.*"], exclude=["*tests*"] ), install_requires=[ - "scikit-learn>=1.1.0", + "scikit-learn>=1.3.0", "numpy>=1.19.0", "deap>=1.3.3", "tqdm>=4.61.1", diff --git a/sklearn_genetic/_version.py b/sklearn_genetic/_version.py index 018bfb0..fcf3da7 100644 --- a/sklearn_genetic/_version.py +++ b/sklearn_genetic/_version.py @@ -1 +1 @@ -__version__ = "0.10.2dev0" +__version__ = "0.11.0dev0" diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py index e2f8da8..6077430 100644 --- a/sklearn_genetic/genetic_search.py +++ b/sklearn_genetic/genetic_search.py @@ -172,6 +172,10 @@ class GASearchCV(BaseSearchCV): Configuration to log metrics and models to mlflow, of None, no mlflow logging will be performed + use_cache: bool, default=True + If set to true, it will avoid re-evaluating solutions that have already been seen, + otherwise it will always evaluate the solutions to get the performance metrics + Attributes ---------- @@ -214,27 +218,28 @@ class GASearchCV(BaseSearchCV): """ def __init__( - self, - estimator, - cv=3, - param_grid=None, - scoring=None, - population_size=50, - generations=80, - crossover_probability=0.2, - mutation_probability=0.8, - tournament_size=3, - elitism=True, - 
verbose=True, - keep_top_k=1, - criteria="max", - algorithm="eaMuPlusLambda", - refit=True, - n_jobs=1, - pre_dispatch="2*n_jobs", - error_score=np.nan, - return_train_score=False, - log_config=None, + self, + estimator, + cv=3, + param_grid=None, + scoring=None, + population_size=50, + generations=80, + crossover_probability=0.2, + mutation_probability=0.8, + tournament_size=3, + elitism=True, + verbose=True, + keep_top_k=1, + criteria="max", + algorithm="eaMuPlusLambda", + refit=True, + n_jobs=1, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=False, + log_config=None, + use_cache=True, ): self.estimator = estimator self.cv = cv @@ -259,6 +264,7 @@ def __init__( self.return_train_score = return_train_score self.creator = creator self.log_config = log_config + self.use_cache = use_cache self.fitness_cache = {} # Check that the estimator is compatible with scikit-learn @@ -397,7 +403,7 @@ def evaluate(self, individual): individual_key = tuple(sorted(current_generation_params.items())) # Check if the individual has already been evaluated - if individual_key in self.fitness_cache: + if individual_key in self.fitness_cache and self.use_cache: # Retrieve cached result cached_result = self.fitness_cache[individual_key] # Ensure the logbook is updated even if the individual is cached @@ -451,11 +457,12 @@ def evaluate(self, individual): fitness_result = [score] - # Store the fitness result and the current generation parameters in the cache - self.fitness_cache[individual_key] = { - "fitness": fitness_result, - "current_generation_params": current_generation_params - } + if self.use_cache: + # Store the fitness result and the current generation parameters in the cache + self.fitness_cache[individual_key] = { + "fitness": fitness_result, + "current_generation_params": current_generation_params + } return fitness_result @@ -814,6 +821,10 @@ class GAFeatureSelectionCV(MetaEstimatorMixin, SelectorMixin, BaseEstimator): Configuration to log metrics and 
models to mlflow, of None, no mlflow logging will be performed + use_cache: bool, default=True + If set to true, it will avoid re-evaluating solutions that have already been seen, + otherwise it will always evaluate the solutions to get the performance metrics + Attributes ---------- @@ -855,27 +866,28 @@ class GAFeatureSelectionCV(MetaEstimatorMixin, SelectorMixin, BaseEstimator): """ def __init__( - self, - estimator, - cv=3, - scoring=None, - population_size=50, - generations=80, - crossover_probability=0.2, - mutation_probability=0.8, - tournament_size=3, - elitism=True, - max_features=None, - verbose=True, - keep_top_k=1, - criteria="max", - algorithm="eaMuPlusLambda", - refit=True, - n_jobs=1, - pre_dispatch="2*n_jobs", - error_score=np.nan, - return_train_score=False, - log_config=None, + self, + estimator, + cv=3, + scoring=None, + population_size=50, + generations=80, + crossover_probability=0.2, + mutation_probability=0.8, + tournament_size=3, + elitism=True, + max_features=None, + verbose=True, + keep_top_k=1, + criteria="max", + algorithm="eaMuPlusLambda", + refit=True, + n_jobs=1, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=False, + log_config=None, + use_cache=True, ): self.estimator = estimator self.cv = cv @@ -900,6 +912,7 @@ def __init__( self.return_train_score = return_train_score self.creator = creator self.log_config = log_config + self.use_cache = use_cache self.fitness_cache = {} # Check that the estimator is compatible with scikit-learn @@ -990,7 +1003,7 @@ def evaluate(self, individual): individual_key = tuple(individual) # Check if the individual has already been evaluated - if individual_key in self.fitness_cache: + if individual_key in self.fitness_cache and self.use_cache: cached_result = self.fitness_cache[individual_key] # Ensure the logbook is updated even if the individual is cached self.logbook.record(parameters=cached_result["current_generation_features"]) @@ -1041,18 +1054,19 @@ def evaluate(self, 
individual): # Penalize individuals with more features than the max_features parameter if self.max_features and ( - n_selected_features > self.max_features or n_selected_features == 0 + n_selected_features > self.max_features or n_selected_features == 0 ): score = -self.criteria_sign * 100000 # Prepare the fitness result fitness_result = [score, n_selected_features] - # Store the fitness result and the current generation features in the cache - self.fitness_cache[individual_key] = { - "fitness": fitness_result, - "current_generation_features": current_generation_features - } + if self.use_cache: + # Store the fitness result and the current generation features in the cache + self.fitness_cache[individual_key] = { + "fitness": fitness_result, + "current_generation_features": current_generation_features + } return fitness_result From 5bdb85d5575ca317a0e23108a16c802993a2e0c8 Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 15:33:52 -0500 Subject: [PATCH 3/9] warm_start_configs for model initialization --- README.rst | 9 +++++++++ dev-requirements.txt | 2 ++ docs/release_notes.rst | 27 ++++++++++++++++++++++++--- sklearn_genetic/genetic_search.py | 27 ++++++++++++++++++++++++++- sklearn_genetic/space/space.py | 21 +++++++++++++++++++++ 5 files changed, 82 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 7654b19..d065656 100644 --- a/README.rst +++ b/README.rst @@ -102,12 +102,19 @@ Example: Hyperparameters Tuning clf = RandomForestClassifier() + # Defines the possible values to search param_grid = {'min_weight_fraction_leaf': Continuous(0.01, 0.5, distribution='log-uniform'), 'bootstrap': Categorical([True, False]), 'max_depth': Integer(2, 30), 'max_leaf_nodes': Integer(2, 35), 'n_estimators': Integer(100, 300)} + # Seed solutions + warm_start_configs = [ + {"min_weight_fraction_leaf": 0.02, "bootstrap": True, "max_depth": None, "n_estimators": 100}, + 
{"min_weight_fraction_leaf": 0.4, "bootstrap": True, "max_depth": 5, "n_estimators": 200}, + ] + cv = StratifiedKFold(n_splits=3, shuffle=True) evolved_estimator = GASearchCV(estimator=clf, @@ -118,6 +125,8 @@ Example: Hyperparameters Tuning param_grid=param_grid, n_jobs=-1, verbose=True, + use_cache=True, + warm_start_configs=warm_start_configs, keep_top_k=4) # Train and optimize the estimator diff --git a/dev-requirements.txt b/dev-requirements.txt index a9e964e..c556fd0 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -15,6 +15,8 @@ sphinx_rtd_theme sphinx-copybutton numpydoc nbsphinx +ipython>=8.27.0 +Pygments>=2.18.0 tensorflow>=2.4.0 tqdm>=4.61.1 tk diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 3261d2e..4541930 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -10,9 +10,30 @@ What's new in 0.11.0dev0 Features: ^^^^^^^^^ -* Added a parameter named `use_cache`, defaults to `True`, If set to true it will avoid to re-evaluating solutions that have already seen, - otherwise it will always evaluate the solutions to get the performance metrics - +* Added a parameter `use_cache`, which defaults to ``True``. When enabled, the algorithm will skip re-evaluating solutions that have already been evaluated, retrieving the performance metrics from the cache instead. + If use_cache is set to ``False``, the algorithm will always re-evaluate solutions, even if they have been seen before, to obtain fresh performance metrics. +* Added a parameter in `GASearchCV` named `warm_start_configs`, defaults to ``None``, a list of predefined hyperparameter configurations to seed the initial population. + Each element in the list is a dictionary where the keys are the names of the hyperparameters, + and the values are the corresponding hyperparameter values to be used for the individual. + + Example: + + .. 
code-block:: python + :linenos: + + warm_start_configs = [ + {"min_weight_fraction_leaf": 0.02, "bootstrap": True, "max_depth": None, "n_estimators": 100}, + {"min_weight_fraction_leaf": 0.4, "bootstrap": True, "max_depth": 5, "n_estimators": 200}, + ] + + The genetic algorithm will initialize part of the population with these configurations to + warm-start the optimization process. The remaining individuals in the population will + be initialized randomly according to the defined hyperparameter space. + + This parameter is useful when prior knowledge of good hyperparameter configurations exists, + allowing the algorithm to focus on refining known good solutions while still exploring new + areas of the hyperparameter space. If set to ``None``, the entire population will be initialized + randomly. What's new in 0.10.1 -------------------- diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py index 6077430..c160879 100644 --- a/sklearn_genetic/genetic_search.py +++ b/sklearn_genetic/genetic_search.py @@ -240,6 +240,7 @@ def __init__( return_train_score=False, log_config=None, use_cache=True, + warm_start_configs=None, ): self.estimator = estimator self.cv = cv @@ -266,6 +267,7 @@ def __init__( self.log_config = log_config self.use_cache = use_cache self.fitness_cache = {} + self.warm_start_configs = warm_start_configs or [] # Check that the estimator is compatible with scikit-learn if not is_classifier(self.estimator) and not is_regressor(self.estimator): @@ -346,7 +348,7 @@ def _register(self): self.toolbox.register("evaluate", self.evaluate) - self._pop = self.toolbox.population(n=self.population_size) + self._pop = self._initialize_population() self._hof = tools.HallOfFame(self.keep_top_k) self._stats = tools.Statistics(lambda ind: ind.fitness.values) @@ -357,6 +359,29 @@ def _register(self): self.logbook = tools.Logbook() + def _initialize_population(self): + """ + Initialize the population, using warm-start configurations if 
provided. + """ + population = [] + # Seed part of the population with warm-start values + num_warm_start = min(len(self.warm_start_configs), self.population_size) + + for config in self.warm_start_configs[:num_warm_start]: + # Sample an individual from the warm-start configuration + individual_values = self.space.sample_warm_start(config) + individual_values_list = list(individual_values.values()) + + # Manually create the individual and assign its fitness + individual = creator.Individual(individual_values_list) + population.append(individual) + + # Fill the remaining population with random individuals + num_random = self.population_size - num_warm_start + population.extend(self.toolbox.population(n=num_random)) + + return population + def mutate(self, individual): """ This function is responsible for change a randomly selected parameter from an individual diff --git a/sklearn_genetic/space/space.py b/sklearn_genetic/space/space.py index cb913db..c0a8754 100644 --- a/sklearn_genetic/space/space.py +++ b/sklearn_genetic/space/space.py @@ -222,6 +222,27 @@ def __init__(self, param_grid: dict = None): self.param_grid = param_grid + def sample_warm_start(self, warm_start_values: dict): + """ + Sample a predefined configuration (warm-start) or fill in random values if missing. + + Parameters + ---------- + warm_start_values: dict + Predefined configuration values for hyperparameters. + + Returns + ------- + A dictionary containing sampled values for each hyperparameter. 
+ """ + sampled_params = {} + for param, dimension in self.param_grid.items(): + if param in warm_start_values: + sampled_params[param] = warm_start_values[param] + else: + sampled_params[param] = dimension.sample() # Random sample if no warm-start value + return sampled_params + @property def dimensions(self): """ From 977aea3dd25ef363575060feebc6138b9b3fbd20 Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 15:38:20 -0500 Subject: [PATCH 4/9] Dropped support for python 3.8 --- .github/workflows/ci-tests.yml | 2 +- README.rst | 2 +- docs/index.rst | 2 +- docs/release_notes.rst | 6 ++++++ setup.py | 2 +- 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 3944f1e..239ed93 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [ '3.8', '3.9', '3.10', '3.11'] + python-version: [ '3.9', '3.10', '3.11'] os: [ubuntu-latest, windows-latest, macOS-latest] include: - os: ubuntu-latest diff --git a/README.rst b/README.rst index d065656..9e31d68 100644 --- a/README.rst +++ b/README.rst @@ -8,7 +8,7 @@ .. |Codecov| image:: https://codecov.io/gh/rodrigo-arenas/Sklearn-genetic-opt/branch/master/graphs/badge.svg?branch=master&service=github .. _Codecov: https://codecov.io/github/rodrigo-arenas/Sklearn-genetic-opt?branch=master -.. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue +.. |PythonVersion| image:: https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11-blue .. _PythonVersion : https://www.python.org/downloads/ .. |PyPi| image:: https://badge.fury.io/py/sklearn-genetic-opt.svg .. 
_PyPi: https://badge.fury.io/py/sklearn-genetic-opt diff --git a/docs/index.rst b/docs/index.rst index b730171..0c821b1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,7 +26,7 @@ inside the env use:: pip install sklearn-genetic-opt -.. |PythonMinVersion| replace:: 3.8 +.. |PythonMinVersion| replace:: 3.9 .. |ScikitLearnMinVersion| replace:: 1.3.0 .. |NumPyMinVersion| replace:: 1.19.0 .. |SeabornMinVersion| replace:: 0.11.2 diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 4541930..66dfa8a 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -35,6 +35,12 @@ Features: areas of the hyperparameter space. If set to ``None``, the entire population will be initialized randomly. +^^^^^^^^^^^^ +API Changes: +^^^^^^^^^^^^ + +* Dropped support for python 3.8 + What's new in 0.10.1 -------------------- diff --git a/setup.py b/setup.py index e75729e..586c89f 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,6 @@ "tensorflow": ["tensorflow>=2.0.0"], "all": ["mlflow>=1.30.0", "seaborn>=0.11.2", "tensorflow>=2.0.0"], }, - python_requires=">=3.8", + python_requires=">=3.9", include_package_data=True, ) From a84e7ba13ae20b9b5a40fa79d60e4d39952a74b3 Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 15:40:03 -0500 Subject: [PATCH 5/9] Dropped support for python 3.8 --- dev-requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index c556fd0..8ac28d2 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -15,8 +15,8 @@ sphinx_rtd_theme sphinx-copybutton numpydoc nbsphinx -ipython>=8.27.0 -Pygments>=2.18.0 +ipython +Pygments tensorflow>=2.4.0 tqdm>=4.61.1 tk From 265cda82f7bad9c1d4ea85e5a526636efb84f7db Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 15:58:19 -0500 Subject: [PATCH 6/9] test warm_start_configs --- 
sklearn_genetic/space/space.py | 2 +- sklearn_genetic/tests/test_genetic_search.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn_genetic/space/space.py b/sklearn_genetic/space/space.py index c0a8754..c28306e 100644 --- a/sklearn_genetic/space/space.py +++ b/sklearn_genetic/space/space.py @@ -240,7 +240,7 @@ def sample_warm_start(self, warm_start_values: dict): if param in warm_start_values: sampled_params[param] = warm_start_values[param] else: - sampled_params[param] = dimension.sample() # Random sample if no warm-start value + sampled_params[param] = dimension.sample() return sampled_params @property diff --git a/sklearn_genetic/tests/test_genetic_search.py b/sklearn_genetic/tests/test_genetic_search.py index af82c0e..4f2bf04 100644 --- a/sklearn_genetic/tests/test_genetic_search.py +++ b/sklearn_genetic/tests/test_genetic_search.py @@ -605,6 +605,8 @@ def test_expected_ga_schedulers(): "average": Categorical([True, False]), "max_iter": Integer(700, 1000), }, + warm_start_configs=[{"l1_ratio": 0.5, "alpha": 0.5, "average": False, "max_iter": 400}, + {"l1_ratio": 0.2, "alpha": 0.8, "average": True, "max_iter": 400}], verbose=False, algorithm="eaSimple", n_jobs=-1, From a5006fa9f81fcd9f3280f8ff1f512da97e9fd05e Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:08:39 -0500 Subject: [PATCH 7/9] upgrade actions versions --- .github/workflows/ci-tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 239ed93..1c320ad 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -19,12 +19,12 @@ jobs: - os: windows-latest path: ~\AppData\Local\pip\Cache steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: 
python-version: ${{ matrix.python-version }} - - uses: actions/cache@v3 + - uses: actions/cache@v4 with: path: ${{ matrix.path }} key: ${{ runner.os }}-pip-${{ hashFiles('dev-requirements.txt') }} @@ -38,7 +38,7 @@ jobs: run: | pytest sklearn_genetic/ --verbose --color=yes --assert=plain --cov-fail-under=95 --cov-config=.coveragerc --cov=./ -p no:warnings - name: "Upload coverage to Codecov" - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true From d8ae6a937b139541ec2db95bc0061db0ef055619 Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 17:47:35 -0500 Subject: [PATCH 8/9] novelty score for GASearchCV --- docs/release_notes.rst | 9 +++++++ sklearn_genetic/genetic_search.py | 16 +++++++------ sklearn_genetic/utils/tools.py | 40 +++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 66dfa8a..e0ba510 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -34,6 +34,15 @@ Features: allowing the algorithm to focus on refining known good solutions while still exploring new areas of the hyperparameter space. If set to ``None``, the entire population will be initialized randomly. +* Introduced a **novelty search strategy** to the `GASearchCV` class. This strategy rewards solutions that are more distinct from others + in the population by incorporating a **novelty score** into the fitness evaluation. The novelty score encourages exploration and promotes diversity, + reducing the risk of premature convergence to local optima. + + - **Novelty Score**: Calculated based on the distance between an individual and its nearest neighbors in the population. + Individuals with higher novelty scores are more distinct from the rest of the population. 
+ - **Fitness Evaluation**: The overall fitness is now a combination of the traditional performance score and the novelty score, + allowing the algorithm to balance between exploiting known good solutions and exploring new, diverse ones. + - **Improved Exploration**: This strategy helps explore new areas of the hyperparameter space, increasing the likelihood of discovering better solutions and avoiding local optima. ^^^^^^^^^^^^ API Changes: diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py index c160879..eee15c7 100644 --- a/sklearn_genetic/genetic_search.py +++ b/sklearn_genetic/genetic_search.py @@ -28,7 +28,7 @@ create_feature_selection_cv_results_, ) from .utils.random import weighted_bool_individual -from .utils.tools import cxUniform, mutFlipBit +from .utils.tools import cxUniform, mutFlipBit, novelty_scorer class GASearchCV(BaseSearchCV): @@ -308,7 +308,7 @@ def _register(self): """ self.toolbox = base.Toolbox() - self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign]) + self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, 1.0]) self.creator.create("Individual", list, fitness=creator.FitnessMax) attributes = [] @@ -352,10 +352,10 @@ def _register(self): self._hof = tools.HallOfFame(self.keep_top_k) self._stats = tools.Statistics(lambda ind: ind.fitness.values) - self._stats.register("fitness", np.mean) - self._stats.register("fitness_std", np.std) - self._stats.register("fitness_max", np.max) - self._stats.register("fitness_min", np.min) + self._stats.register("fitness", np.mean, axis=0) + self._stats.register("fitness_std", np.std, axis=0) + self._stats.register("fitness_max", np.max, axis=0) + self._stats.register("fitness_min", np.min, axis=0) self.logbook = tools.Logbook() @@ -454,6 +454,8 @@ def evaluate(self, individual): cv_scores = cv_results[f"test_{self.refit_metric}"] score = np.mean(cv_scores) + novelty_score = novelty_scorer(individual, self._pop) + # Uses the log 
config to save in remote log server (e.g MLflow) if self.log_config is not None: self.log_config.create_run( @@ -480,7 +482,7 @@ def evaluate(self, individual): # Log the hyperparameters and the cv-score self.logbook.record(parameters=current_generation_params) - fitness_result = [score] + fitness_result = [score, novelty_score] if self.use_cache: # Store the fitness result and the current generation parameters in the cache diff --git a/sklearn_genetic/utils/tools.py b/sklearn_genetic/utils/tools.py index 3fe9005..d1847a5 100644 --- a/sklearn_genetic/utils/tools.py +++ b/sklearn_genetic/utils/tools.py @@ -1,4 +1,5 @@ import random +import numpy as np def mutFlipBit(individual, indpb): @@ -67,3 +68,42 @@ def check_bool_individual(individual): individual[index] = 1 return individual + + +def novelty_scorer(individual, population, k=15): + """ + Calculate novelty score for an individual based on its distance from other individuals in the population. + + Parameters + ---------- + individual: Individual object + The individual (set of hyperparameters) that is being evaluated. + + population: List[Individual] + The current population of individuals. + + k: int, default=15 + The number of nearest neighbors to consider for the novelty calculation. + + Returns + ------- + novelty_score: float + The novelty score for the individual. 
+ """ + distances = [] + + # Calculate distances between the individual and every other individual in the population + for other in population: + if other != individual: + # Here we use Hamming distance to measure difference + distance = sum(i != o for i, o in zip(individual, other)) + distances.append(distance) + + # Sort the distances and take the average of the k nearest neighbors + distances = sorted(distances) + k_min = min(k, len(population)) + nearest_distances = distances[:k_min] + + # Novelty score is the average distance to the k-nearest neighbors + novelty_score = np.mean(nearest_distances) if nearest_distances else 0 + return novelty_score From fba486c5ede488cc0873d6c3bfc7ee48ae0b8865 Mon Sep 17 00:00:00 2001 From: "rodrigo.arenas" <31422766+rodrigo-arenas@users.noreply.github.com> Date: Thu, 12 Sep 2024 18:16:45 -0500 Subject: [PATCH 9/9] 0.11.0 release --- docs/release_notes.rst | 4 ++-- sklearn_genetic/_version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/release_notes.rst b/docs/release_notes.rst index e0ba510..2db6a18 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -3,8 +3,8 @@ Release Notes Some notes on new features in various releases -What's new in 0.11.0dev0 ------------------------- +What's new in 0.11.0 +-------------------- ^^^^^^^^^ Features: diff --git a/sklearn_genetic/_version.py b/sklearn_genetic/_version.py index fcf3da7..ae6db5f 100644 --- a/sklearn_genetic/_version.py +++ b/sklearn_genetic/_version.py @@ -1 +1 @@ -__version__ = "0.11.0dev0" +__version__ = "0.11.0"