Skip to content

Commit

Permalink
Merge branch 'pr53'
Browse files Browse the repository at this point in the history
  • Loading branch information
afraniomelo committed Jan 10, 2025
2 parents 7391c5c + 8b78bdd commit 5cff43a
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 4 deletions.
37 changes: 37 additions & 0 deletions bibmon/_bibmon_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,43 @@ def create_df_with_noise (array,
return df

###############################################################################
def detect_outliers_iqr(df: pd.DataFrame,
                        cols: list = None) -> pd.DataFrame:
    """
    Flags univariate outliers according to the IQR (Interquartile Range)
    rule.

    A value is flagged when it lies strictly below Q1 - 1.5*IQR or
    strictly above Q3 + 1.5*IQR of its own column.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be inspected. It is not modified; a copy is returned.
    cols: list, optional
        Columns in which outliers will be flagged.
        Default: None (all columns of df are considered).
        Columns not listed keep their original values in the result.

    Returns
    ----------
    : pandas.DataFrame
        Copy of df in which each selected column holds 1 for
        outliers and 0 for every other point.
    """

    flagged = df.copy()

    targets = list(df.columns) if cols is None else cols

    for name in targets:
        series = flagged[name]

        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        whisker = 1.5 * (third_quartile - first_quartile)

        too_low = series < first_quartile - whisker
        too_high = series > third_quartile + whisker

        # boolean mask -> 0/1 integer flags
        flagged[name] = (too_low | too_high).astype(int)

    return flagged

###############################################################################


def align_dfs_by_rows (df1, df2):
"""
Expand Down
79 changes: 75 additions & 4 deletions bibmon/_preprocess.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import copy
import pandas as pd
import statsmodels.tsa.tsatools
import scipy.stats.mstats

###############################################################################

Expand Down Expand Up @@ -42,7 +43,10 @@ class PreProcess ():
add_moving_average()
* Noise treatment:
moving_average_filter()
moving_average_filter()
* Outlier handling:
process_outliers_iqr()
"""

Expand Down Expand Up @@ -275,13 +279,13 @@ def normalize (self, df, train_or_test = 'train', mode = 'standard'):
self.SD = df.std(ddof=1)
elif mode == 'robust':
self.Mu = df.median()
self.SD = df.mad()
self.SD = df.mad()
elif mode == 'm-robust':
self.Mu = df.median()
self.SD = df.std(ddof=1)
elif mode == 's-robust':
self.Mu = df.mean()
self.SD = df.mad()
self.SD = df.mad()

return (df - self.Mu)/self.SD

Expand Down Expand Up @@ -391,4 +395,71 @@ def moving_average_filter (self, df, train_or_test = 'train', WS = 10):
if hasattr(df,'name'):
new_df.name = df.name

return new_df.drop(df.index[:WS])
return new_df.drop(df.index[:WS])

####################
# OUTLIER HANDLING
####################

###########################################################################

def process_outliers_iqr(self, df: pd.DataFrame,
                         train_or_test: str = 'train',
                         cols: list = None,
                         method: str = 'remove') -> pd.DataFrame:
    """
    Removes or handles univariate outliers in a DataFrame using
    the IQR (Interquartile Range) method.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be processed. It is not modified in place.
    train_or_test: string, optional
        Indicates which step the data corresponds to.
        When equal to 'test', df is returned unchanged.
    cols: list, optional
        List of columns for which outliers will be removed or handled.
        Default: None (which results in considering all cols)
    method: str
        Method for handling outliers. Can be 'remove' (removes the rows
        containing outliers), 'median' (replaces outliers with the
        column median), or 'winsorize' (applies winsorization with
        fixed 5% limits on each tail, independent of the IQR bounds).
        Default: 'remove'.

    Returns
    ----------
    : pandas.DataFrame:
        DataFrame with outliers removed or handled.

    Raises
    ----------
    ValueError
        If method is not one of 'remove', 'median' or 'winsorize'.

    Notes
    ----------
    Columns are processed one after another on the same working copy,
    so with method = 'remove' the quartiles of a column are computed
    on the rows that survived the filtering of the previous columns.
    """

    # validating up front so that an invalid method is reported even
    # when there are no columns to iterate over:
    if method not in ('remove', 'median', 'winsorize'):
        raise ValueError("Invalid method. Choose between 'remove', "
                         "'median', or 'winsorize'.")

    if train_or_test == 'test':
        # it doesn't make sense to process outliers in the test data
        # returning unchanged df:
        return df

    df_outliers = df.copy()

    if cols is None:
        cols = list(df.columns)

    for col in cols:
        Q1 = df_outliers[col].quantile(0.25)
        Q3 = df_outliers[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        if method == 'remove':
            # keeping only the rows inside [lower_bound, upper_bound]
            df_outliers = df_outliers[(df_outliers[col] >= lower_bound) &
                                      (df_outliers[col] <= upper_bound)]
        elif method == 'median':
            median = df_outliers[col].median()
            df_outliers.loc[(df_outliers[col] < lower_bound) |
                            (df_outliers[col] > upper_bound), col] = median
        else:
            # method == 'winsorize'
            df_outliers[col] = scipy.stats.mstats.winsorize(
                                            df_outliers[col],
                                            limits=[0.05, 0.05])

    return df_outliers

0 comments on commit 5cff43a

Please sign in to comment.