From 2c4b26c15042efaa3b1221e05c6217faa7ddd5bf Mon Sep 17 00:00:00 2001 From: Kajanan Selvanesan Date: Sun, 12 May 2024 00:04:34 +0530 Subject: [PATCH] feat: Introduce use case for dwell time --- .gitignore | 1 + README.md | 25 +- examples/pipeline.py | 4 +- src/_pipeline.py | 458 ++++++++++++++++++ src/models/use_cases/__init__.py | 1 + src/models/use_cases/dwell_time/__init__.py | 3 + .../use_cases/dwell_time/bus/__init__.py | 1 + .../use_cases/dwell_time/bus/mme4bdt.py | 73 +++ src/pipeline.py | 96 +--- 9 files changed, 568 insertions(+), 94 deletions(-) create mode 100644 src/_pipeline.py create mode 100644 src/models/use_cases/dwell_time/__init__.py create mode 100644 src/models/use_cases/dwell_time/bus/__init__.py create mode 100644 src/models/use_cases/dwell_time/bus/mme4bdt.py diff --git a/.gitignore b/.gitignore index 1e9f2f7..9f0247a 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,7 @@ Temporary Items ### PyCharm ### # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 +.idea/ # User-specific stuff .idea/**/workspace.xml diff --git a/README.md b/README.md index 0039c6d..f81c7f3 100644 --- a/README.md +++ b/README.md @@ -76,8 +76,10 @@ Output: 3. `pipeline` can be used to run experiments with the all the steps by simply running one function with some parameters. ```py from datetime import datetime -from ibat.pipeline import run_exp + from ibat.concept_drift_detector.strategies import DDM +from ibat.datasets import BUS_654_FEATURES_ENCODED_DWELL_TIMES +from ibat.pipeline import run_dt_exp def datetime_from_string(datetime_string: str) -> datetime: @@ -86,19 +88,24 @@ def datetime_from_string(datetime_string: str) -> datetime: if __name__ == "__main__": cdd_strategy = DDM( - drift_level=2, - min_num_instances=2, + warning_level=0.1, + drift_level=1.5, + min_num_instances=1, ) - run_exp( + run_dt_exp( + dt_df=BUS_654_FEATURES_ENCODED_DWELL_TIMES.dataframe, hist_start=datetime_from_string("2021-10-01"), - hist_end=datetime_from_string("2022-01-01"), - stream_start=datetime_from_string("2022-01-01"), + hist_end=datetime_from_string("2022-02-01"), + stream_start=datetime_from_string("2022-02-01"), stream_end=datetime_from_string("2022-11-01"), - interval_min=60, + interval_min=60 * 2, + chunk_size=100, active_strategy=True, + is_buffer_enabled=False, cdd_strategy=cdd_strategy, - output_parent_dir="../experiments", - label="m-xgb-s-xgb_model", + incremental_learning=True, + output_parent_dir="./demo", + label="demo-dt-exp-for-hbp", ) ``` diff --git a/examples/pipeline.py b/examples/pipeline.py index 8760d8a..80c7ad4 100644 --- a/examples/pipeline.py +++ b/examples/pipeline.py @@ -2,7 +2,7 @@ from src.concept_drift_detector.strategies import DDM from src.datasets import BUS_654_FEATURES_ENCODED_DWELL_TIMES -from src.pipeline import run_exp +from src.pipeline import run_dt_exp def datetime_from_string(datetime_string: str) -> datetime: @@ -28,7 +28,7 @@ def datetime_from_string(datetime_string: str) -> datetime: folder_path_to_save_result = "../experiments" experiment_label = "m-xgb-s-xgb_model-d-batch_r2" - run_exp( + run_dt_exp( dt_df=dwell_time_df, hist_start=historical_data_starting_from, hist_end=historical_data_ending_at, diff --git a/src/_pipeline.py b/src/_pipeline.py new file mode 100644 index 0000000..4c32864 --- /dev/null +++ b/src/_pipeline.py @@ -0,0 +1,458 @@ +import os +import warnings +from datetime import datetime, timedelta +from time import time +from typing import Optional + +import matplotlib.pyplot as plt +from numpy import mean +from pandas import concat, DataFrame, to_datetime +from pandas.errors import SettingWithCopyWarning +from sklearn.metrics import ( + mean_absolute_error, + mean_absolute_percentage_error, + root_mean_squared_error, +) +from src.concept_drift_detector.strategies import IStrategy +from src.datasets import ( + BUS_654_FEATURES_ADDED_RUNNING_TIMES, + BUS_654_FEATURES_ENCODED_DWELL_TIMES, +) +from src.models.use_cases.arrival_time.bus import MME4BAT + + +def run_exp( + dt_df: DataFrame, + hist_start: datetime, + hist_end: datetime, + stream_start: datetime, + stream_end: datetime, + interval_min: float, + chunk_size: int, + active_strategy: Optional[bool] = False, + is_buffer_enabled: Optional[bool] = False, + cdd_strategy: Optional[IStrategy] = None, + incremental_learning: Optional[bool] = True, + output_parent_dir: Optional[str] = "./", + label: Optional[str] = "", +) -> None: + """ + Run the experiment. + + Args: + dt_df: Dwell time dataframe to contact the experiment. + hist_start: Start timestamp (inclusive) for historical data. + hist_end: End timestamp (exclusive) for historical data. + stream_start: Start timestamp (inclusive) for streaming data. + stream_end: End timestamp (exclusive) for streaming data. + interval_min: Minimum waiting time (in minutes) for data processing. + chunk_size: Minimum required amount of data for data processing. + active_strategy: Flag indicating whether to use active strategy (default is False). + is_buffer_enabled: To do (default is False). + cdd_strategy: Strategy to be used for detecting concept drift. Required if active_strategy is True. + incremental_learning: To do (default is True). + output_parent_dir: Parent directory's path to save experiment results. + label: Experiment label (default is an empty string). + + Returns: + None + """ + # To do: warnings.filterwarnings("ignore", category=SettingWithCopyWarning) + warnings.filterwarnings("ignore") + + if interval_min == 0 and chunk_size == 0: + raise ValueError("Both interval_min & chunk_size cannot be set to zero.") + else: + count_not_enough = False + is_end_reached = False + + hybrid_bp = interval_min != 0 and chunk_size != 0 + scheduled_bp = interval_min != 0 and chunk_size == 0 + + if hybrid_bp: + bp_technique = "HYBRID" + elif scheduled_bp: + bp_technique = "SCHEDULED" + else: + bp_technique = "PERIODIC" + + if active_strategy and cdd_strategy is None: + raise ValueError("cdd_strategy must be provided when active_strategy is True.") + else: + strategy = "ACTIVE" if active_strategy else "PASSIVE" + + print(f"BATCH PROCESSING TECHNIQUE: {bp_technique} | CONCEPT DRIFT HANDLING STRATEGY: {strategy}") + + rt_df: DataFrame = BUS_654_FEATURES_ADDED_RUNNING_TIMES.dataframe + # dt_df: DataFrame = BUS_654_FEATURES_ENCODED_DWELL_TIMES.dataframe + + dt_df["arrival_datetime"] = to_datetime( + dt_df["date"] + " " + dt_df["arrival_time"], format="%Y-%m-%d %H:%M:%S" + ) + dt_df.sort_values(by="arrival_datetime", inplace=True) + + base_model: Optional[MME4BAT] = None + model: Optional[MME4BAT] = None + + result_dt_df = DataFrame( + columns=dt_df.columns.tolist() + + ["true_prediction", "base_model_prediction", "model_prediction"] + ) + + true_predictions = [] + base_model_predictions = [] + model_predictions = [] + + processing_times = [] + + dt_x_buffer: Optional[DataFrame] = None + dt_y_buffer: Optional[DataFrame] = None + + from_date_time = hist_start + to_date_time = hist_end + + while from_date_time < stream_end: + if hybrid_bp and count_not_enough: + temp_df = dt_df.loc[ + (dt_df["arrival_datetime"] >= from_date_time), + :, + ].reset_index(drop=True) + if temp_df.shape[0] < chunk_size: + is_end_reached = True + else: + to_date_time = temp_df["arrival_datetime"].iloc[chunk_size - 1] + timedelta(minutes=1) + + print( + f"DATA STREAM: [{from_date_time.strftime('%Y-%m-%d %H:%M:%S')} - {to_date_time.strftime('%Y-%m-%d %H:%M:%S')})", + end="", + flush=True, + ) + + dt_chunk: DataFrame = dt_df.loc[ + (from_date_time <= dt_df["arrival_datetime"]) + & (dt_df["arrival_datetime"] < to_date_time), + :, + ].reset_index(drop=True) + count_not_enough = dt_chunk.shape[0] < chunk_size + + if ( + scheduled_bp + or (hybrid_bp and not count_not_enough) + or is_end_reached + ): + from_date_time = ( + stream_start if from_date_time == hist_start else to_date_time + ) + to_date_time = from_date_time + timedelta(minutes=interval_min) + + if len(dt_chunk) == 0: + print(" | NO INSTANCES") + continue + else: + print(f" | NUMBER OF INSTANCES: {len(dt_chunk):04d}", end="") + + numeric_dt_chunk = dt_chunk.select_dtypes(include="number") + + dt_x: DataFrame = numeric_dt_chunk.drop(columns=["dwell_time_in_seconds"]) + dt_y: DataFrame = numeric_dt_chunk[["dwell_time_in_seconds"]] + + if not model: + base_model = MME4BAT() + model = MME4BAT(cdd_strategy=cdd_strategy) + + base_model.fit(rt_x=None, rt_y=None, dt_x=dt_x, dt_y=dt_y) + model.fit(rt_x=None, rt_y=None, dt_x=dt_x, dt_y=dt_y) + + print(" | MODEL INITIATED") + else: + true_prediction = dt_y["dwell_time_in_seconds"].tolist() + base_model_prediction = base_model.predict(rt_x=None, dt_x=dt_x)[ + "prediction" + ].tolist() + model_prediction = model.predict(rt_x=None, dt_x=dt_x)[ + "prediction" + ].tolist() + + true_predictions.extend(true_prediction) + base_model_predictions.extend(base_model_prediction) + model_predictions.extend(model_prediction) + + dt_chunk["true_prediction"] = true_prediction + dt_chunk["base_model_prediction"] = base_model_prediction + dt_chunk["model_prediction"] = model_prediction + + result_dt_df = concat([result_dt_df, dt_chunk], ignore_index=True) + + start_time = time() + + if active_strategy: + if is_buffer_enabled: + if (dt_x_buffer is None) or (dt_y_buffer is None): + dt_x_buffer = dt_x + dt_y_buffer = dt_y + else: + dt_x_buffer = concat([dt_x_buffer, dt_x], ignore_index=True) + dt_y_buffer = concat([dt_y_buffer, dt_y], ignore_index=True) + ni_dt_x = dt_x_buffer + ni_dt_y = dt_y_buffer + else: + ni_dt_x = dt_x + ni_dt_y = dt_y + + is_detected = model.is_concept_drift_detected( + ni_rt_x=None, ni_rt_y=None, ni_dt_x=dt_x, ni_dt_y=dt_y + ) + if is_detected: + model.incremental_fit( + ni_rt_x=None, + ni_rt_y=None, + ni_dt_x=ni_dt_x, + ni_dt_y=ni_dt_y, + ) + dt_x_buffer = None + dt_y_buffer = None + else: + model.incremental_fit( + ni_rt_x=None, ni_rt_y=None, ni_dt_x=dt_x, ni_dt_y=dt_y + ) + print() + + end_time = time() + processing_time = end_time - start_time + processing_times.append(processing_time) + else: + print(f" | NUMBER OF INSTANCES: {len(dt_chunk):04d} | COUNT IS NOT ENOUGH. WAITING FOR MORE DATA POINTS.") + + print("\rDATA STREAMING ENDED.", flush=True) + print( + "\rGENERATING & EXPORTING THE RESULTS...", + end="", + flush=True, + ) + + cdd_strategy_content = "" + if active_strategy: + cdd_strategy_content = f"- {cdd_strategy.__class__.__name__}:\n\n" + cdd_strategy_content += f"| Attribute | Value |\n|---|---|\n" + + for attr, value in cdd_strategy.get_attributes().items(): + cdd_strategy_content += f"| {attr} | {value} |\n" + + base_model_mae = mean_absolute_error(true_predictions, base_model_predictions) + model_mae = mean_absolute_error(true_predictions, model_predictions) + + base_model_rmse = root_mean_squared_error(true_predictions, base_model_predictions) + model_rmse = root_mean_squared_error(true_predictions, model_predictions) + + md_file_content = f""" +# Experiment: {label} + +## Parameters +- Historical data starting from: {hist_start} +- Historical data ending at: {hist_end} +- Streaming data starting from: {stream_start} +- Streaming data ending at: {stream_end} +- Minimum waiting time: {interval_min} minutes +- Minimum required amount of data: {chunk_size} +- Batch processing technique: {bp_technique} +- Concept drift handling strategy: {strategy} +- Is buffer enabled: {is_buffer_enabled} +- Concept drift detection algorithm: {cdd_strategy.__class__.__name__ if active_strategy else None} +{cdd_strategy_content} + +## Results +- Model performance metrics: + +| Model | MAE (s) | RMSE (s) | +|--------------------------------------|----------------------|-----------------------| +| Base model (XGBoost) | {base_model_mae:.3f} | {base_model_rmse:.3f} | +| Base model with incremental learning | {model_mae:.3f} | {model_rmse:.3f} | + +- Error reduction percentage in terms of MAE: {(base_model_mae - model_mae) * 100 / base_model_mae:.3f} % +- Average processing time after the batch preparation: {mean(processing_times) * 1000:.3f} ms + """ + + output_dir = os.path.join( + output_parent_dir, + f"ex-{label}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}", + ) + os.makedirs(output_dir, exist_ok=True) + + with open(f"{output_dir}/README.md", "w") as f: + f.write(md_file_content) + + print("\rGENERATING & EXPORTING THE RESULTS ENDED.", flush=True) + + # print( + # f"MAE for base model: {mean_absolute_error(true_prediction, base_model_prediction)}" + # ) + # print( + # f"MAPE for base model: {mean_absolute_percentage_error(true_prediction, base_model_prediction) * 100}" + # ) + # print( + # f"RMSE for base model: {mean_squared_error(true_prediction, base_model_prediction, squared=False)}" + # ) + # print("-----------------------------------------------------------------") + # print( + # f"MAE for incremental online learning model: {mean_absolute_error(true_prediction, model_prediction)}" + # ) + # print( + # f"""MAPE for incremental online learning model: { + # mean_absolute_percentage_error(true_prediction, model_prediction) * 100 + # }""" + # ) + # print( + # f"""RMSE for incremental online learning model: { + # mean_squared_error(true_prediction, model_prediction, squared=False) + # }""" + # ) + + # dt_pred_df = dt_df[1000:-100] + # # dt_pred_df = dt_df[1000:-83] + # if ( + # len(dt_pred_df) + # == len(true_prediction) + # == len(model_prediction) + # == len(base_model_prediction) + # ): + # dt_pred_df["true_prediction"] = true_prediction + # dt_pred_df["model_prediction"] = model_prediction + # dt_pred_df["base_model_prediction"] = base_model_prediction + # else: + # print( + # len(true_prediction), + # len(model_prediction), + # len(base_model_prediction), + # dt_pred_df.shape, + # ) + # print("Error: The lengths of the arrays and the DataFrame did not match.") + + # plt.figure(figsize=(100, 20)) + # + # x = list(range(len(true_prediction))) + # plt.plot(x, true_prediction, label="True dwell time") + # plt.plot(x, base_model_prediction, label="Base model prediction") + # plt.plot(x, model_prediction, label="Prediction with incremental online learning") + # + # plt.xlabel("X") + # plt.ylabel("Dwell time (s)") + # plt.title("Multiple Lines on a Common Graph") + # + # plt.legend() + # plt.savefig("dt.png", dpi=150) + + df = result_dt_df + df["date"] = to_datetime(df["date"]) + df["arrival_time"] = to_datetime(df["arrival_time"], format="%H:%M:%S") + + directions = df["direction"].unique() + bus_stops = df["bus_stop"].unique() + + starting_time = datetime.strptime("06:00:00", "%H:%M:%S") + ending_time = datetime.strptime("18:00:00", "%H:%M:%S") + + from_time = starting_time + + while from_time < ending_time: + to_time = from_time + timedelta(minutes=60) + print( + f"\rGENERATING & EXPORTING PLOTS FOR EACH BUS STOP: [{from_time.strftime('%H:%M:%S')} - {to_time.strftime('%H:%M:%S')})", + end="", + flush=True, + ) + + for direction in directions: + for bus_stop in bus_stops: + filtered_df = df[ + (df["direction"] == direction) + & (df["bus_stop"] == bus_stop) + & (df["arrival_time"].dt.time >= from_time.time()) + & (df["arrival_time"].dt.time < to_time.time()) + ] + + export_at = os.path.join( + output_dir, + f"dt-d-{direction}-ti-{from_time.strftime('%H-%M-%S')}_{to_time.strftime('%H-%M-%S')}", + ) + os.makedirs(export_at, exist_ok=True) + + if len(filtered_df) > 0: + export_mean_dt_plot_as_image( + df=filtered_df, + bus_stop=bus_stop, + starting_time=from_time, + ending_time=to_time, + export_at=export_at, + ) + + from_time = to_time + + print("\rGENERATING & EXPORTING PLOTS FOR EACH BUS STOP ENDED.", flush=True) + + +def export_mean_dt_plot_as_image( + df: DataFrame, + bus_stop: int, + starting_time, + ending_time, + export_at: str, +) -> None: + dt_in_seconds_df = ( + df.groupby("date")["dwell_time_in_seconds"] + .mean() + .reset_index() + .sort_values(by="date") + ) + + base_model_prediction_df = ( + df.groupby("date")["base_model_prediction"] + .mean() + .reset_index() + .sort_values(by="date") + ) + + model_prediction_df = ( + df.groupby("date")["model_prediction"] + .mean() + .reset_index() + .sort_values(by="date") + ) + + x = model_prediction_df["date"] + + plt.figure(figsize=(13, 5)) + plt.plot( + x, + dt_in_seconds_df["dwell_time_in_seconds"], + label="True dwell time", + marker="o", + linestyle="-", + color="blue", + ) + plt.plot( + x, + base_model_prediction_df["base_model_prediction"], + label="Base model prediction", + marker="o", + linestyle="--", + color="green", + ) + plt.plot( + x, + model_prediction_df["model_prediction"], + label="Prediction with incremental online learning", + marker="o", + linestyle="--", + color="orange", + ) + plt.xlabel("Date") + plt.ylabel("Mean dwell time (s)") + plt.title( + f"The mean dwell time at bus stop {bus_stop} for each day between {starting_time.time()} and {ending_time.time()}." + ) + plt.xticks(rotation=45) + plt.grid(True) + plt.legend() + plt.tight_layout() + plt.savefig(f"{export_at}/{bus_stop}.png", dpi=200) + plt.close() diff --git a/src/models/use_cases/__init__.py b/src/models/use_cases/__init__.py index b0869e2..a86ab28 100644 --- a/src/models/use_cases/__init__.py +++ b/src/models/use_cases/__init__.py @@ -1,3 +1,4 @@ from . import ( arrival_time, + dwell_time, ) diff --git a/src/models/use_cases/dwell_time/__init__.py b/src/models/use_cases/dwell_time/__init__.py new file mode 100644 index 0000000..aa24235 --- /dev/null +++ b/src/models/use_cases/dwell_time/__init__.py @@ -0,0 +1,3 @@ +from . import ( + bus, +) diff --git a/src/models/use_cases/dwell_time/bus/__init__.py b/src/models/use_cases/dwell_time/bus/__init__.py new file mode 100644 index 0000000..8ac1cd3 --- /dev/null +++ b/src/models/use_cases/dwell_time/bus/__init__.py @@ -0,0 +1 @@ +from .mme4bdt import MME4BDT diff --git a/src/models/use_cases/dwell_time/bus/mme4bdt.py b/src/models/use_cases/dwell_time/bus/mme4bdt.py new file mode 100644 index 0000000..c089636 --- /dev/null +++ b/src/models/use_cases/dwell_time/bus/mme4bdt.py @@ -0,0 +1,73 @@ +from copy import deepcopy +from typing import Optional + +from src.concept_drift_detector import CDD +from src.concept_drift_detector.strategies import IStrategy +from src.models.base_models.ensemble.xgboost import XGBClassifier, XGBRegressor + + +class MME4BDT: + def __init__(self, cdd_strategy: Optional[IStrategy] = None) -> None: + self.xgb_dt_classifier = XGBClassifier() + self.xgb_dt_regressor = XGBRegressor() + + if cdd_strategy: + self._cdd_of_xgb_dt_classifier = CDD( + strategy=deepcopy(cdd_strategy), + ) + self._cdd_of_xgb_dt_regressor = CDD( + strategy=deepcopy(cdd_strategy), + ) + + def fit(self, dt_x, dt_y) -> None: + is_dt_gt_0 = dt_y.iloc[:, 0].apply(lambda dt: 1 if dt > 0 else 0) + self.xgb_dt_classifier.fit(dt_x, is_dt_gt_0) + + dt_x_gt_0 = dt_x[dt_y.iloc[:, 0] > 0] + dt_y_gt_0 = dt_y[dt_y.iloc[:, 0] > 0] + self.xgb_dt_regressor.fit(dt_x_gt_0, dt_y_gt_0) + + def incremental_fit(self, ni_dt_x, ni_dt_y) -> None: + is_ni_dt_gt_0 = ni_dt_y.iloc[:, 0].apply(lambda dt: 1 if dt > 0 else 0) + self.xgb_dt_classifier.incremental_fit(ni_dt_x, is_ni_dt_gt_0) + + ni_dt_x_gt_0 = ni_dt_x[ni_dt_y.iloc[:, 0] > 0] + ni_dt_y_gt_0 = ni_dt_y[ni_dt_y.iloc[:, 0] > 0] + self.xgb_dt_regressor.incremental_fit(ni_dt_x_gt_0, ni_dt_y_gt_0) + + def predict(self, dt_x): + is_dt_gt_0 = self.xgb_dt_classifier.predict(dt_x) + + dt_x_gt_0 = dt_x[is_dt_gt_0.iloc[:, 0] == 1] + dt_y_gt_0 = self.xgb_dt_regressor.predict(dt_x_gt_0) + + dt_y = is_dt_gt_0.copy() + dt_y.loc[dt_y["prediction"] > 0, "prediction"] = dt_y_gt_0[ + "prediction" + ].to_numpy() + dt_y.loc[dt_y["prediction"] < 0, "prediction"] = 0 + + return dt_y + + def is_concept_drift_detected(self, ni_dt_x, ni_dt_y) -> bool: + try: + is_detected_1 = self._cdd_of_xgb_dt_classifier.is_concept_drift_detected( + model=self.xgb_dt_classifier, + ni_x=ni_dt_x, + ni_y=ni_dt_y, + ) + + ni_dt_x_gt_0 = ni_dt_x[ni_dt_y.iloc[:, 0] > 0] + ni_dt_y_gt_0 = ni_dt_y[ni_dt_y.iloc[:, 0] > 0] + + is_detected_2 = self._cdd_of_xgb_dt_regressor.is_concept_drift_detected( + model=self.xgb_dt_regressor, + ni_x=ni_dt_x_gt_0, + ni_y=ni_dt_y_gt_0, + ) + print( + f" | CDD at xgb_dt_classifier: {is_detected_1} | CDD at xgb_dt_regressor: {is_detected_2}" + ) + return is_detected_1 or is_detected_2 + except NameError as e: + raise e diff --git a/src/pipeline.py b/src/pipeline.py index 4c32864..5175bce 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -7,21 +7,15 @@ import matplotlib.pyplot as plt from numpy import mean from pandas import concat, DataFrame, to_datetime -from pandas.errors import SettingWithCopyWarning from sklearn.metrics import ( mean_absolute_error, - mean_absolute_percentage_error, root_mean_squared_error, ) from src.concept_drift_detector.strategies import IStrategy -from src.datasets import ( - BUS_654_FEATURES_ADDED_RUNNING_TIMES, - BUS_654_FEATURES_ENCODED_DWELL_TIMES, -) -from src.models.use_cases.arrival_time.bus import MME4BAT +from src.models.use_cases.dwell_time.bus import MME4BDT -def run_exp( +def run_dt_exp( dt_df: DataFrame, hist_start: datetime, hist_end: datetime, @@ -37,7 +31,7 @@ def run_exp( label: Optional[str] = "", ) -> None: """ - Run the experiment. + Run a dwell time experiment. Args: dt_df: Dwell time dataframe to contact the experiment. @@ -57,7 +51,6 @@ def run_exp( Returns: None """ - # To do: warnings.filterwarnings("ignore", category=SettingWithCopyWarning) warnings.filterwarnings("ignore") if interval_min == 0 and chunk_size == 0: @@ -83,16 +76,13 @@ def run_exp( print(f"BATCH PROCESSING TECHNIQUE: {bp_technique} | CONCEPT DRIFT HANDLING STRATEGY: {strategy}") - rt_df: DataFrame = BUS_654_FEATURES_ADDED_RUNNING_TIMES.dataframe - # dt_df: DataFrame = BUS_654_FEATURES_ENCODED_DWELL_TIMES.dataframe - dt_df["arrival_datetime"] = to_datetime( dt_df["date"] + " " + dt_df["arrival_time"], format="%Y-%m-%d %H:%M:%S" ) dt_df.sort_values(by="arrival_datetime", inplace=True) - base_model: Optional[MME4BAT] = None - model: Optional[MME4BAT] = None + base_model: Optional[MME4BDT] = None + model: Optional[MME4BDT] = None result_dt_df = DataFrame( columns=dt_df.columns.tolist() @@ -157,19 +147,19 @@ def run_exp( dt_y: DataFrame = numeric_dt_chunk[["dwell_time_in_seconds"]] if not model: - base_model = MME4BAT() - model = MME4BAT(cdd_strategy=cdd_strategy) + base_model = MME4BDT() + model = MME4BDT(cdd_strategy=cdd_strategy) - base_model.fit(rt_x=None, rt_y=None, dt_x=dt_x, dt_y=dt_y) - model.fit(rt_x=None, rt_y=None, dt_x=dt_x, dt_y=dt_y) + base_model.fit(dt_x=dt_x, dt_y=dt_y) + model.fit(dt_x=dt_x, dt_y=dt_y) print(" | MODEL INITIATED") else: true_prediction = dt_y["dwell_time_in_seconds"].tolist() - base_model_prediction = base_model.predict(rt_x=None, dt_x=dt_x)[ + base_model_prediction = base_model.predict(dt_x=dt_x)[ "prediction" ].tolist() - model_prediction = model.predict(rt_x=None, dt_x=dt_x)[ + model_prediction = model.predict(dt_x=dt_x)[ "prediction" ].tolist() @@ -200,12 +190,10 @@ def run_exp( ni_dt_y = dt_y is_detected = model.is_concept_drift_detected( - ni_rt_x=None, ni_rt_y=None, ni_dt_x=dt_x, ni_dt_y=dt_y + ni_dt_x=dt_x, ni_dt_y=dt_y ) if is_detected: model.incremental_fit( - ni_rt_x=None, - ni_rt_y=None, ni_dt_x=ni_dt_x, ni_dt_y=ni_dt_y, ) @@ -213,7 +201,7 @@ def run_exp( dt_y_buffer = None else: model.incremental_fit( - ni_rt_x=None, ni_rt_y=None, ni_dt_x=dt_x, ni_dt_y=dt_y + ni_dt_x=dt_x, ni_dt_y=dt_y ) print() @@ -283,64 +271,6 @@ def run_exp( print("\rGENERATING & EXPORTING THE RESULTS ENDED.", flush=True) - # print( - # f"MAE for base model: {mean_absolute_error(true_prediction, base_model_prediction)}" - # ) - # print( - # f"MAPE for base model: {mean_absolute_percentage_error(true_prediction, base_model_prediction) * 100}" - # ) - # print( - # f"RMSE for base model: {mean_squared_error(true_prediction, base_model_prediction, squared=False)}" - # ) - # print("-----------------------------------------------------------------") - # print( - # f"MAE for incremental online learning model: {mean_absolute_error(true_prediction, model_prediction)}" - # ) - # print( - # f"""MAPE for incremental online learning model: { - # mean_absolute_percentage_error(true_prediction, model_prediction) * 100 - # }""" - # ) - # print( - # f"""RMSE for incremental online learning model: { - # mean_squared_error(true_prediction, model_prediction, squared=False) - # }""" - # ) - - # dt_pred_df = dt_df[1000:-100] - # # dt_pred_df = dt_df[1000:-83] - # if ( - # len(dt_pred_df) - # == len(true_prediction) - # == len(model_prediction) - # == len(base_model_prediction) - # ): - # dt_pred_df["true_prediction"] = true_prediction - # dt_pred_df["model_prediction"] = model_prediction - # dt_pred_df["base_model_prediction"] = base_model_prediction - # else: - # print( - # len(true_prediction), - # len(model_prediction), - # len(base_model_prediction), - # dt_pred_df.shape, - # ) - # print("Error: The lengths of the arrays and the DataFrame did not match.") - - # plt.figure(figsize=(100, 20)) - # - # x = list(range(len(true_prediction))) - # plt.plot(x, true_prediction, label="True dwell time") - # plt.plot(x, base_model_prediction, label="Base model prediction") - # plt.plot(x, model_prediction, label="Prediction with incremental online learning") - # - # plt.xlabel("X") - # plt.ylabel("Dwell time (s)") - # plt.title("Multiple Lines on a Common Graph") - # - # plt.legend() - # plt.savefig("dt.png", dpi=150) - df = result_dt_df df["date"] = to_datetime(df["date"]) df["arrival_time"] = to_datetime(df["arrival_time"], format="%H:%M:%S")