From a7992adb3c62dffbd6a5b07db2c99117f3a7283f Mon Sep 17 00:00:00 2001 From: Kelly Castro Date: Thu, 17 Oct 2024 01:27:50 -0300 Subject: [PATCH 1/2] Added functions for detection and treatment of outliers --- bibmon/_outlier_handling.py | 60 ++++++++++++ bibmon/_preprocess.py | 174 ++++++++-------------------------- test/test_outlier_handling.py | 52 ++++++++++ 3 files changed, 150 insertions(+), 136 deletions(-) create mode 100644 bibmon/_outlier_handling.py create mode 100644 test/test_outlier_handling.py diff --git a/bibmon/_outlier_handling.py b/bibmon/_outlier_handling.py new file mode 100644 index 0000000..7007a2a --- /dev/null +++ b/bibmon/_outlier_handling.py @@ -0,0 +1,60 @@ +import pandas as pd +import numpy as np +from scipy.stats.mstats import winsorize + +def detect_outliers_iqr(df: pd.DataFrame, cols: list) -> pd.DataFrame: + """ + Detects outliers in a DataFrame using the IQR (Interquartile Range) method. + + Args: + df (pd.DataFrame): DataFrame with the data. + cols (list): List of columns for which outliers will be detected. + + Returns: + pd.DataFrame: DataFrame with outliers flagged as 1 and other points as 0. + """ + + df_outliers = df.copy() + for col in cols: + Q1 = df_outliers[col].quantile(0.25) + Q3 = df_outliers[col].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + df_outliers[col] = ((df_outliers[col] < lower_bound) | (df_outliers[col] > upper_bound)).astype(int) + return df_outliers + +def remove_outliers(df: pd.DataFrame, cols: list, method: str = 'remove') -> pd.DataFrame: + """ + Removes or handles outliers in a DataFrame using the IQR (Interquartile Range) method. + + Args: + df (pd.DataFrame): DataFrame with the data. + cols (list): List of columns for which outliers will be removed or handled. + method (str): Method for handling outliers. Can be 'remove' (removes outliers), + 'median' (replaces outliers with the median), or 'winsorize' (applies winsorization). + Default: 'remove'. 
+ + Returns: + pd.DataFrame: DataFrame with outliers removed or handled. + """ + + df_outliers = df.copy() + for col in cols: + Q1 = df_outliers[col].quantile(0.25) + Q3 = df_outliers[col].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + if method == 'remove': + df_outliers = df_outliers[(df_outliers[col] >= lower_bound) & (df_outliers[col] <= upper_bound)] + elif method == 'median': + median = df_outliers[col].median() + df_outliers.loc[(df_outliers[col] < lower_bound) | (df_outliers[col] > upper_bound), col] = median + elif method == 'winsorize': + df_outliers[col] = winsorize(df_outliers[col], limits=[0.05, 0.05]) + else: + raise ValueError("Invalid method. Choose between 'remove', 'median', or 'winsorize'.") + + return df_outliers \ No newline at end of file diff --git a/bibmon/_preprocess.py b/bibmon/_preprocess.py index 0a85adb..7755d22 100644 --- a/bibmon/_preprocess.py +++ b/bibmon/_preprocess.py @@ -1,6 +1,7 @@ import copy import pandas as pd import statsmodels.tsa.tsatools +from ._outlier_handling import detect_outliers_iqr, remove_outliers ############################################################################### @@ -10,7 +11,7 @@ class PreProcess (): Parameters ---------- - + f_pp: list, optional List containing strings with names of methods to be used in the preprocessing of the train data. 
The list of methods @@ -42,19 +43,23 @@ class PreProcess (): add_moving_average() * Noise treatment: - moving_average_filter() + moving_average_filter() + + * Outlier handling: + detect_outliers_iqr(); + remove_outliers() """ - + ########################################################################### def __init__(self, f_pp = None, a_pp = None, is_Y = False): - + self.is_Y = is_Y - self.f_pp = f_pp - self._a_pp = a_pp + self.f_pp = f_pp + self._a_pp = a_pp if self.f_pp is not None: - self.params_per_func = {f: {} for f in f_pp} + self.params_per_func = {f: {} for f in f_pp} ########################################################################### @@ -74,7 +79,7 @@ def a_pp(self, a_pp): self.params_per_func = {f: {} for f in self.f_pp} if a_pp is not None: - + for pname, pval in a_pp.items(): func, param = pname.split('__',1) self.params_per_func[func][param] = pval @@ -93,10 +98,10 @@ def apply(self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ + Processed data. + """ df_processed = df @@ -105,7 +110,7 @@ def apply(self, df, train_or_test = 'train'): df_processed = getattr(self, f)(df_processed, train_or_test, **self.params_per_func[f]) - + return df_processed ######################### @@ -125,10 +130,10 @@ def remove_empty_variables (self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ + Processed data. 
+ """ if train_or_test == 'train': return df.dropna(axis=1, how='all') elif train_or_test == 'test': @@ -137,7 +142,7 @@ def remove_empty_variables (self, df, train_or_test = 'train'): ########################################################################### def remove_frozen_variables (self, df, train_or_test = 'train', - threshold = 1e-6): + threshold = 1e-6): """ Removes variables whose variation falls below a given limit. @@ -150,10 +155,10 @@ def remove_frozen_variables (self, df, train_or_test = 'train', threshold: float, optional Variance limit to consider a variable as frozen. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ + Processed data. + """ if not self.is_Y: if train_or_test == 'train': return df.loc[:, df.var(ddof=1) > threshold] @@ -180,9 +185,9 @@ def ffill_nan (self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. """ return df.ffill().bfill() @@ -199,9 +204,9 @@ def remove_observations_with_nan (self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. """ return df.dropna(axis=0, how='any') @@ -220,11 +225,10 @@ def replace_nan_with_values (self, df, train_or_test = 'train', val = 0): val: int or float Value to be used in the replacement. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ - + Processed data. + """ return df.fillna(val) ############### @@ -243,9 +247,9 @@ def back_to_units (self, df): df: pandas.DataFrame Data to be processed. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. 
""" return df*self.SD + self.Mu @@ -264,9 +268,9 @@ def normalize (self, df, train_or_test = 'train', mode = 'standard'): mode: string, optional Type of normalization (standard, robust, m-robust or s-robust). Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. """ if train_or_test == 'train': @@ -275,13 +279,13 @@ def normalize (self, df, train_or_test = 'train', mode = 'standard'): self.SD = df.std(ddof=1) elif mode == 'robust': self.Mu = df.median() - self.SD = df.mad() + self.SD = df.mad() elif mode == 'm-robust': self.Mu = df.median() self.SD = df.std(ddof=1) elif mode == 's-robust': self.Mu = df.mean() - self.SD = df.mad() + self.SD = df.mad() return (df - self.Mu)/self.SD @@ -289,106 +293,4 @@ def normalize (self, df, train_or_test = 'train', mode = 'standard'): return (df - self.Mu)/self.SD - ############################## - # ADDING DYNAMICS - ############################## - - ########################################################################### - - def apply_lag (self, df, train_or_test = 'train', lag = 1): - """ - Generation of time-delayed variables. - - Parameters - ---------- - df: pandas.DataFrame - Data to be processed. - train_or_test: string, optional - Indicates which step the data corresponds to. - lag: int, optional - Number of delays to be considered. - Returns - ---------- - : pandas.DataFrame - Processed data. 
- """ - - if self.is_Y: - return df.iloc[lag:,:] - else: - array_lagged = statsmodels.tsa.tsatools.lagmat(df, maxlag = lag, - trim = "forward", - original = 'in')[lag:,:] - new_columns = [] - for l in range(lag): - new_columns.append(df.columns+' - lag '+str(l+1)) - columns_lagged = df.columns.append(new_columns) - index_lagged = df.index[lag:] - df_lagged = pd.DataFrame(array_lagged, index = index_lagged, - columns = columns_lagged) - - return df_lagged - - ########################################################################### - - def add_moving_average (self, df, train_or_test = 'train', WS = 10): - """ - Adding variables filtered by moving average. - Attention! Do not confuse with moving_average_filter, in which - the original variables are not kept in the dataset. - - Parameters - ---------- - df: pandas.DataFrame - Data to be processed. - train_or_test: string, optional - Indicates which step the data corresponds to. - WS: int, optional - Window size of the filter. - Returns - ---------- - : pandas.DataFrame - Processed data. - """ - if self.is_Y: - return df - - new_df = copy.deepcopy(df) - - for column in df: - new_df[column+' MA'] = new_df[column].rolling(WS).mean() - - return new_df.drop(df.index[:WS]) - - ############################## - # NOISE TREATMENT - ############################## - - ########################################################################### - - def moving_average_filter (self, df, train_or_test = 'train', WS = 10): - """ - Moving average noise filter. - - Parameters - ---------- - df: pandas.DataFrame - Data to be processed. - train_or_test: string, optional - Indicates which step the data corresponds to. - WS: int, optional - Window size of the filter. - Returns - ---------- - : pandas.DataFrame - Processed data. 
- """ - new_df = copy.deepcopy(df) - - for column in df: - new_df[column] = new_df[column].rolling(WS).mean() - - if hasattr(df,'name'): - new_df.name = df.name - - return new_df.drop(df.index[:WS]) \ No newline at end of file + ############################## \ No newline at end of file diff --git a/test/test_outlier_handling.py b/test/test_outlier_handling.py new file mode 100644 index 0000000..e862140 --- /dev/null +++ b/test/test_outlier_handling.py @@ -0,0 +1,52 @@ +import unittest +import pandas as pd +from bibmon._outlier_handling import detect_outliers_iqr, remove_outliers + +class TestOutlierHandling(unittest.TestCase): + + def test_detect_outliers_iqr(self): + # Create a sample DataFrame with outliers + data = {'col1': [1, 2, 3, 4, 5, 100]} + df = pd.DataFrame(data) + + # Run the detect_outliers_iqr function + df_outliers = detect_outliers_iqr(df, ['col1']) + + # Check if the outlier was detected correctly + self.assertEqual(df_outliers['col1'].tolist(), [0, 0, 0, 0, 0, 1]) + + def test_remove_outliers_remove(self): + # Create a sample DataFrame with outliers + data = {'col1': [1, 2, 3, 4, 5, 100]} + df = pd.DataFrame(data) + + # Run the remove_outliers function with method='remove' + df_outliers = remove_outliers(df, ['col1'], method='remove') + + # Check if the outlier was removed correctly + self.assertEqual(df_outliers['col1'].tolist(), [1, 2, 3, 4, 5]) + + def test_remove_outliers_median(self): + # Create a sample DataFrame with outliers + data = {'col1': [1, 2, 3, 4, 5, 100]} + df = pd.DataFrame(data) + + # Run the remove_outliers function with method='median' + df_outliers = remove_outliers(df, ['col1'], method='median') + + # Check if the outlier was replaced by the median + self.assertEqual(df_outliers['col1'].tolist(), [1, 2, 3, 4, 5, 3]) + + def test_remove_outliers_winsorize(self): + # Create a sample DataFrame with outliers + data = {'col1': [1, 2, 3, 4, 5, 100]} + df = pd.DataFrame(data) + + # Run the remove_outliers function with 
def detect_outliers_iqr(df: pd.DataFrame,
                        cols: list = None,
                        factor: float = 1.5) -> pd.DataFrame:
    """
    Detects outliers in a DataFrame using the IQR (Interquartile Range)
    method.

    A value is flagged when it lies below Q1 - factor*IQR or above
    Q3 + factor*IQR, where Q1 and Q3 are the 25% and 75% quantiles of
    its column.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be processed. It is not modified.
    cols: list, optional
        List of columns for which outliers will be detected.
        Default: None (which results in considering all cols)
    factor: float, optional
        Multiplier applied to the IQR to build the lower and upper
        bounds. Default: 1.5
    Returns
    ----------
    : pandas.DataFrame
        Copy of df in which each column listed in cols holds 1 for
        outliers and 0 for the other points. Columns that are not
        listed in cols keep their original values.
    """

    df_outliers = df.copy()

    if cols is None:
        cols = list(df.columns)

    for col in cols:
        # Always read from the untouched input: if a column shows up
        # twice in cols, the second pass must not compute quantiles
        # on the 0/1 flags written by the first pass.
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        margin = factor * (Q3 - Q1)
        is_outlier = (values < Q1 - margin) | (values > Q3 + margin)
        df_outliers[col] = is_outlier.astype(int)

    return df_outliers
- - Returns: - pd.DataFrame: DataFrame with outliers removed or handled. - """ - - df_outliers = df.copy() - for col in cols: - Q1 = df_outliers[col].quantile(0.25) - Q3 = df_outliers[col].quantile(0.75) - IQR = Q3 - Q1 - lower_bound = Q1 - 1.5 * IQR - upper_bound = Q3 + 1.5 * IQR - - if method == 'remove': - df_outliers = df_outliers[(df_outliers[col] >= lower_bound) & (df_outliers[col] <= upper_bound)] - elif method == 'median': - median = df_outliers[col].median() - df_outliers.loc[(df_outliers[col] < lower_bound) | (df_outliers[col] > upper_bound), col] = median - elif method == 'winsorize': - df_outliers[col] = winsorize(df_outliers[col], limits=[0.05, 0.05]) - else: - raise ValueError("Invalid method. Choose between 'remove', 'median', or 'winsorize'.") - - return df_outliers \ No newline at end of file diff --git a/bibmon/_preprocess.py b/bibmon/_preprocess.py index 7755d22..8a40de4 100644 --- a/bibmon/_preprocess.py +++ b/bibmon/_preprocess.py @@ -1,7 +1,7 @@ import copy import pandas as pd import statsmodels.tsa.tsatools -from ._outlier_handling import detect_outliers_iqr, remove_outliers +import scipy.stats.mstats ############################################################################### @@ -11,7 +11,7 @@ class PreProcess (): Parameters ---------- - + f_pp: list, optional List containing strings with names of methods to be used in the preprocessing of the train data. 
The list of methods @@ -43,23 +43,22 @@ class PreProcess (): add_moving_average() * Noise treatment: - moving_average_filter() - + moving_average_filter() + * Outlier handling: - detect_outliers_iqr(); - remove_outliers() + process_outliers_iqr() """ - + ########################################################################### def __init__(self, f_pp = None, a_pp = None, is_Y = False): - + self.is_Y = is_Y - self.f_pp = f_pp - self._a_pp = a_pp + self.f_pp = f_pp + self._a_pp = a_pp if self.f_pp is not None: - self.params_per_func = {f: {} for f in f_pp} + self.params_per_func = {f: {} for f in f_pp} ########################################################################### @@ -79,7 +78,7 @@ def a_pp(self, a_pp): self.params_per_func = {f: {} for f in self.f_pp} if a_pp is not None: - + for pname, pval in a_pp.items(): func, param = pname.split('__',1) self.params_per_func[func][param] = pval @@ -98,10 +97,10 @@ def apply(self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ + Processed data. + """ df_processed = df @@ -110,7 +109,7 @@ def apply(self, df, train_or_test = 'train'): df_processed = getattr(self, f)(df_processed, train_or_test, **self.params_per_func[f]) - + return df_processed ######################### @@ -130,10 +129,10 @@ def remove_empty_variables (self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ + Processed data. 
+ """ if train_or_test == 'train': return df.dropna(axis=1, how='all') elif train_or_test == 'test': @@ -142,7 +141,7 @@ def remove_empty_variables (self, df, train_or_test = 'train'): ########################################################################### def remove_frozen_variables (self, df, train_or_test = 'train', - threshold = 1e-6): + threshold = 1e-6): """ Removes variables whose variation falls below a given limit. @@ -155,10 +154,10 @@ def remove_frozen_variables (self, df, train_or_test = 'train', threshold: float, optional Variance limit to consider a variable as frozen. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ + Processed data. + """ if not self.is_Y: if train_or_test == 'train': return df.loc[:, df.var(ddof=1) > threshold] @@ -185,9 +184,9 @@ def ffill_nan (self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. """ return df.ffill().bfill() @@ -204,9 +203,9 @@ def remove_observations_with_nan (self, df, train_or_test = 'train'): train_or_test: string, optional Indicates which step the data corresponds to. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. """ return df.dropna(axis=0, how='any') @@ -225,10 +224,11 @@ def replace_nan_with_values (self, df, train_or_test = 'train', val = 0): val: int or float Value to be used in the replacement. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. - """ + Processed data. + """ + return df.fillna(val) ############### @@ -247,9 +247,9 @@ def back_to_units (self, df): df: pandas.DataFrame Data to be processed. Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. 
""" return df*self.SD + self.Mu @@ -268,9 +268,9 @@ def normalize (self, df, train_or_test = 'train', mode = 'standard'): mode: string, optional Type of normalization (standard, robust, m-robust or s-robust). Returns - ---------- + ---------- : pandas.DataFrame - Processed data. + Processed data. """ if train_or_test == 'train': @@ -279,13 +279,13 @@ def normalize (self, df, train_or_test = 'train', mode = 'standard'): self.SD = df.std(ddof=1) elif mode == 'robust': self.Mu = df.median() - self.SD = df.mad() + self.SD = df.mad() elif mode == 'm-robust': self.Mu = df.median() self.SD = df.std(ddof=1) elif mode == 's-robust': self.Mu = df.mean() - self.SD = df.mad() + self.SD = df.mad() return (df - self.Mu)/self.SD @@ -293,4 +293,173 @@ def normalize (self, df, train_or_test = 'train', mode = 'standard'): return (df - self.Mu)/self.SD - ############################## \ No newline at end of file + ############################## + # ADDING DYNAMICS + ############################## + + ########################################################################### + + def apply_lag (self, df, train_or_test = 'train', lag = 1): + """ + Generation of time-delayed variables. + + Parameters + ---------- + df: pandas.DataFrame + Data to be processed. + train_or_test: string, optional + Indicates which step the data corresponds to. + lag: int, optional + Number of delays to be considered. + Returns + ---------- + : pandas.DataFrame + Processed data. 
+ """ + + if self.is_Y: + return df.iloc[lag:,:] + else: + array_lagged = statsmodels.tsa.tsatools.lagmat(df, maxlag = lag, + trim = "forward", + original = 'in')[lag:,:] + new_columns = [] + for l in range(lag): + new_columns.append(df.columns+' - lag '+str(l+1)) + columns_lagged = df.columns.append(new_columns) + index_lagged = df.index[lag:] + df_lagged = pd.DataFrame(array_lagged, index = index_lagged, + columns = columns_lagged) + + return df_lagged + + ########################################################################### + + def add_moving_average (self, df, train_or_test = 'train', WS = 10): + """ + Adding variables filtered by moving average. + Attention! Do not confuse with moving_average_filter, in which + the original variables are not kept in the dataset. + + Parameters + ---------- + df: pandas.DataFrame + Data to be processed. + train_or_test: string, optional + Indicates which step the data corresponds to. + WS: int, optional + Window size of the filter. + Returns + ---------- + : pandas.DataFrame + Processed data. + """ + if self.is_Y: + return df + + new_df = copy.deepcopy(df) + + for column in df: + new_df[column+' MA'] = new_df[column].rolling(WS).mean() + + return new_df.drop(df.index[:WS]) + + ############################## + # NOISE TREATMENT + ############################## + + ########################################################################### + + def moving_average_filter (self, df, train_or_test = 'train', WS = 10): + """ + Moving average noise filter. + + Parameters + ---------- + df: pandas.DataFrame + Data to be processed. + train_or_test: string, optional + Indicates which step the data corresponds to. + WS: int, optional + Window size of the filter. + Returns + ---------- + : pandas.DataFrame + Processed data. 
def process_outliers_iqr(self, df: pd.DataFrame,
                         train_or_test: str = 'train',
                         cols: list = None,
                         method: str = 'remove') -> pd.DataFrame:
    """
    Removes or handles univariate outliers in a DataFrame using
    the IQR (Interquartile Range) method.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be processed. It is not modified.
    train_or_test: string, optional
        Indicates which step the data corresponds to.
        When equal to 'test', df is returned unchanged.
    cols: list, optional
        List of columns for which outliers will be removed or handled.
        Default: None (which results in considering all cols)
    method: str, optional
        Method for handling outliers. Can be 'remove' (removes the
        rows containing outliers), 'median' (replaces outliers with
        the median of the column), or 'winsorize' (applies
        winsorization).
        Default: 'remove'.
    Returns
    ----------
    : pandas.DataFrame
        DataFrame with outliers removed or handled.
    Raises
    ----------
    ValueError
        If method is not 'remove', 'median' or 'winsorize'.
    """

    # compare the argument itself (not the literal 'train_or_test'):
    if train_or_test == 'test':
        # it doesn't make sense to process outliers in the test data
        # returning unchanged df:
        return df

    # validated once, up front, so that an invalid method is reported
    # even when there are no columns to iterate over
    if method not in ('remove', 'median', 'winsorize'):
        raise ValueError("Invalid method. Choose between 'remove', "
                         "'median', or 'winsorize'.")

    df_outliers = df.copy()

    if cols is None:
        cols = list(df.columns)

    for col in cols:

        if method == 'winsorize':
            # winsorization does not use the IQR bounds: the values
            # are clipped according to the 5% limit given for each tail
            df_outliers[col] = scipy.stats.mstats.winsorize(
                df_outliers[col], limits=[0.05, 0.05])
            continue

        # the bounds are computed on the current state of df_outliers,
        # so with 'remove' the rows dropped for a previous column are
        # no longer taken into account here
        Q1 = df_outliers[col].quantile(0.25)
        Q3 = df_outliers[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        if method == 'remove':
            df_outliers = df_outliers[(df_outliers[col] >= lower_bound) &
                                      (df_outliers[col] <= upper_bound)]
        else:
            # method == 'median'
            median = df_outliers[col].median()
            df_outliers.loc[(df_outliers[col] < lower_bound) |
                            (df_outliers[col] > upper_bound), col] = median

    return df_outliers
- self.assertEqual(df_outliers['col1'].tolist(), [1, 2, 3, 4, 5]) - - def test_remove_outliers_median(self): - # Create a sample DataFrame with outliers - data = {'col1': [1, 2, 3, 4, 5, 100]} - df = pd.DataFrame(data) - - # Run the remove_outliers function with method='median' - df_outliers = remove_outliers(df, ['col1'], method='median') - - # Check if the outlier was replaced by the median - self.assertEqual(df_outliers['col1'].tolist(), [1, 2, 3, 4, 5, 3]) - - def test_remove_outliers_winsorize(self): - # Create a sample DataFrame with outliers - data = {'col1': [1, 2, 3, 4, 5, 100]} - df = pd.DataFrame(data) - - # Run the remove_outliers function with method='winsorize' - df_outliers = remove_outliers(df, ['col1'], method='winsorize') - - # Check if the outlier was winsorized - self.assertTrue(df_outliers['col1'].tolist()[-1] < 100) # Check if the value was limited - -if __name__ == '__main__': - unittest.main() \ No newline at end of file