feat: Add more river based incremental methods #27

Merged
merged 7 commits on Mar 13, 2024
8 changes: 7 additions & 1 deletion src/models/base_models/__init__.py
@@ -1,4 +1,10 @@
-from .base_models import BaseModel, RiverBatchBaseModel, SKLearnBaseModel, XGBoost
+from .base_models import (
+    BaseModel,
+    RiverBatchBaseModel,
+    RiverStreamBaseModel,
+    SKLearnBaseModel,
+    XGBoost,
+)
 from .ibase_model import IBaseModel
 from . import (
     base_models,
35 changes: 33 additions & 2 deletions src/models/base_models/base_models.py
@@ -2,8 +2,9 @@
 from typing import Optional

 from numpy import ndarray, unique
-from pandas import DataFrame, Series
+from pandas import concat, DataFrame, Series
 from xgboost import Booster, DMatrix, train
+from river.stream import iter_pandas
 from src.models.base_models.ibase_model import IBaseModel


@@ -93,11 +94,41 @@ def fit(self, x, y):

     def incremental_fit(self, ni_x, ni_y):
         if self._model is not None:
-            self._model = self._model.learn_many(ni_x, ni_y)
+            self._model.learn_many(ni_x, ni_y)

     def predict(self, x) -> DataFrame:
         if len(x) == 0:
             return DataFrame(columns=["prediction"])

         ser_prediction: Series = self._model.predict_many(x)
         return ser_prediction.to_frame(name="prediction")
+
+
+class RiverStreamBaseModel(BaseModel, ABC):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def fit(self, x, y):
+        if self._model is not None:
+            for xi, yi in iter_pandas(x, y):
+                self._model.learn_one(xi, yi)
+        else:
+            raise NotImplementedError("Model is Empty. Choose appropriate subclasses")
+
+    def incremental_fit(self, ni_x, ni_y):
+        if self._model is not None:
+            for xi, yi in iter_pandas(ni_x, ni_y):
+                self._model.learn_one(xi, yi)
+
+    def predict(self, x) -> DataFrame:
+        if len(x) == 0:
+            return DataFrame(columns=["prediction"])
+
+        pd_prediction = DataFrame()
+        for index, row in x.iterrows():
+            y = self._model.predict_one(row)
+            pd_prediction = concat(
+                [pd_prediction, DataFrame({"prediction": [y]})], axis=0
+            )
+
+        return pd_prediction
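For context, a rough usage sketch of the new stream base class: fit and incremental_fit push rows through river's learn_one via iter_pandas, and predict collects predict_one outputs into a single-column DataFrame. The toy data below is an assumption for illustration only; the HoeffdingTreeClassifier wrapper is the one added later in this PR.

```python
# Minimal sketch (not part of the diff); the toy feature frame is made up.
from pandas import DataFrame, Series

from src.models.base_models.ensemble import HoeffdingTreeClassifier

x = DataFrame({"f1": [0.1, 0.4, 0.8, 0.3], "f2": [1.0, 0.2, 0.5, 0.9]})
y = Series([0, 1, 1, 0], name="label")

model = HoeffdingTreeClassifier()
model.fit(x, y)                               # row-by-row via learn_one
model.incremental_fit(x.tail(2), y.tail(2))   # later mini-batches, same path
print(model.predict(x))                       # DataFrame with a "prediction" column
```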
17 changes: 17 additions & 0 deletions src/models/base_models/ensemble/__init__.py
@@ -1,4 +1,21 @@
from .adaboost import AdaBoostClassifier
from .decision_tree import DecisionTreeClassifier
from .hoeffding_tree import (
    HoeffdingAdaptiveTreeClassifier,
    HoeffdingTreeClassifier,
    HoeffdingTreeRegressor,
)

# from .random_forest import ARFClassifier, ARFRegressor
from .sgt import SGTRegressor
from .streaming_random_patches import SRPClassifier
from .xgboost import XGBClassifier, XGBRegressor
from . import (
    adaboost,
    decision_tree,
    hoeffding_tree,
    random_forest,
    sgt,
    streaming_random_patches,
    xgboost,
)
15 changes: 15 additions & 0 deletions src/models/base_models/ensemble/adaboost.py
@@ -0,0 +1,15 @@
from river.tree import HoeffdingTreeClassifier
from river.ensemble import AdaBoostClassifier as ExAdaBoostClassifier
from src.models.base_models.base_models import RiverStreamBaseModel


class AdaBoostClassifier(RiverStreamBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExAdaBoostClassifier(
            model=(HoeffdingTreeClassifier(split_criterion="gini", grace_period=2000)),
            n_models=5,
            seed=42,
        )
14 changes: 14 additions & 0 deletions src/models/base_models/ensemble/decision_tree.py
@@ -0,0 +1,14 @@
from river.tree import ExtremelyFastDecisionTreeClassifier
from src.models.base_models.base_models import RiverStreamBaseModel


class DecisionTreeClassifier(RiverStreamBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExtremelyFastDecisionTreeClassifier(
            grace_period=100,
            nominal_attributes=["elevel", "car", "zipcode"],
            min_samples_reevaluate=100,
        )
40 changes: 40 additions & 0 deletions src/models/base_models/ensemble/hoeffding_tree.py
@@ -0,0 +1,40 @@
from river.preprocessing import StandardScaler
from river.tree import (
    HoeffdingAdaptiveTreeClassifier as ExHoeffdingAdaptiveTreeClassifier,
    HoeffdingTreeClassifier as ExHoeffdingTreeClassifier,
    HoeffdingTreeRegressor as ExHoeffdingTreeRegressor,
)
from src.models.base_models.base_models import RiverStreamBaseModel

"""
Tree-based models are popular due to their interpretability. A Hoeffding Tree uses a tree data structure to model
the data. When a sample arrives, it traverses the tree until it reaches a leaf node. Internal nodes define the
path for a data sample based on the values of its features. Leaf nodes are models that provide predictions for
unlabeled samples and can update their internal state using the labels from labeled samples.
"""


class HoeffdingTreeClassifier(RiverStreamBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExHoeffdingTreeClassifier()


class HoeffdingTreeRegressor(RiverStreamBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = False
        self._params = {}
        self._model = StandardScaler() | ExHoeffdingTreeRegressor(
            grace_period=100, model_selector_decay=0.9
        )


class HoeffdingAdaptiveTreeClassifier(RiverStreamBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExHoeffdingAdaptiveTreeClassifier(seed=42)
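The test-then-train behaviour described in the docstring can be shown directly with river; the synthetic SEA stream and the hyperparameters below are illustrative assumptions, not part of the PR.

```python
# Prequential (test-then-train) sketch with river's Hoeffding tree.
from river.datasets import synth
from river.metrics import Accuracy
from river.tree import HoeffdingTreeClassifier

model = HoeffdingTreeClassifier(grace_period=100)
metric = Accuracy()

for x, y in synth.SEA(seed=42).take(1000):
    y_pred = model.predict_one(x)   # traverse to a leaf and predict
    metric.update(y, y_pred)        # score before learning on the label
    model.learn_one(x, y)           # then update the leaf statistics with (x, y)

print(metric)
```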
32 changes: 32 additions & 0 deletions src/models/base_models/ensemble/random_forest.py
@@ -0,0 +1,32 @@
# from river.forest import (ARFClassifier as ExARFClassifier, ARFRegressor as ExARFRegressor)
# from river.preprocessing import StandardScaler
# from src.models.base_models.base_models import RiverStreamBaseModel
#
# """
# The 3 most important aspects of ARF are:
#
# * inducing diversity through re-sampling
# * inducing diversity through randomly selecting subsets of features for node splits
# * drift detectors per base tree, which cause selective resets in response to drifts
#
# It also allows training background trees, which start training if a warning is detected and replace the active tree if the warning escalates to a drift.
# """
#
#
# class ARFClassifier(RiverStreamBaseModel):
#     def __init__(self) -> None:
#         super().__init__()
#         self._is_classifier = True
#         self._params = {}
#         self._model = ExARFClassifier(n_models=10)
#
#
# class ARFRegressor(RiverStreamBaseModel):
#     def __init__(self) -> None:
#         super().__init__()
#         self._is_classifier = False
#         self._params = {}
#         self._model = (
#             StandardScaler() |
#             ExARFRegressor(seed=42)
#         )
16 changes: 16 additions & 0 deletions src/models/base_models/ensemble/sgt.py
@@ -0,0 +1,16 @@
from river.tree import SGTRegressor as ExSGTRegressor
from river.tree.splitter import DynamicQuantizer
from src.models.base_models.base_models import RiverStreamBaseModel


class SGTRegressor(RiverStreamBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = False
        self._params = {}
        self._model = ExSGTRegressor(
            delta=0.01,
            lambda_value=0.01,
            grace_period=20,
            feature_quantizer=DynamicQuantizer(std_prop=0.1),
        )
21 changes: 21 additions & 0 deletions src/models/base_models/ensemble/streaming_random_patches.py
@@ -0,0 +1,21 @@
from river.ensemble import SRPClassifier as ExSRPClassifier
from river.tree import HoeffdingTreeClassifier
from src.models.base_models.base_models import RiverStreamBaseModel

"""
SRP is an ensemble method that simulates bagging or random subspaces.
The default algorithm uses both bagging and random subspaces, namely Random Patches.
The default base estimator is a Hoeffding Tree, but, unlike random-forest variations, other base estimators can be used.
"""


class SRPClassifier(RiverStreamBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExSRPClassifier(
            model=HoeffdingTreeClassifier(),
            n_models=10,
            seed=42,
        )
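To illustrate the docstring's point that the base estimator is swappable, here is a hedged sketch with a scaled logistic regression in place of the Hoeffding tree; the model choice and hyperparameters are illustrative assumptions, not part of this PR.

```python
# Sketch only: SRP with a non-tree base estimator.
from river.ensemble import SRPClassifier
from river.linear_model import LogisticRegression
from river.preprocessing import StandardScaler

srp_lr = SRPClassifier(
    model=StandardScaler() | LogisticRegression(),  # swapped-in base learner
    n_models=10,
    seed=42,
)
# Trained exactly like the default: srp_lr.learn_one(xi, yi) per sample.
```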
1 change: 1 addition & 0 deletions src/models/base_models/naive_bayes/__init__.py
@@ -1,4 +1,5 @@
from .naive_bayes import BernoulliNB, MultinomialNB
from . import (
    naive_bayes,
    naive_bayes_river,
)
30 changes: 30 additions & 0 deletions src/models/base_models/naive_bayes/naive_bayes_river.py
@@ -0,0 +1,30 @@
from river.naive_bayes import (
    ComplementNB as ExComplementNB,
    BernoulliNB as ExBernoulliNB,
    MultinomialNB as ExMultinomialNB,
)
from src.models.base_models.base_models import RiverBatchBaseModel


class BernoulliNB(RiverBatchBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExBernoulliNB()


class MultinomialNB(RiverBatchBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExMultinomialNB()


class ComplementNB(RiverBatchBaseModel):
    def __init__(self) -> None:
        super().__init__()
        self._is_classifier = True
        self._params = {}
        self._model = ExComplementNB()
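Unlike the classes above, these wrappers sit on RiverBatchBaseModel, i.e. river's mini-batch learn_many/predict_many path rather than the per-row stream path. A rough sketch follows; the toy count frame and labels are assumptions for illustration.

```python
# Mini-batch sketch for the RiverBatchBaseModel-backed naive Bayes wrappers.
from pandas import DataFrame, Series

from src.models.base_models.naive_bayes.naive_bayes_river import MultinomialNB

x = DataFrame({"word_a": [3, 0, 1, 0], "word_b": [0, 2, 0, 4]})
y = Series(["spam", "ham", "spam", "ham"])

model = MultinomialNB()
model.fit(x, y)               # initial batch fit (pre-existing base-class method)
model.incremental_fit(x, y)   # further mini-batches via river's learn_many
print(model.predict(x))       # DataFrame with a "prediction" column
```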