Skip to content
This repository has been archived by the owner on Aug 25, 2024. It is now read-only.

Adding feature selection operations #1398

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions operations/data/dffml_operations_data/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@

# Definition objects shared by the data operations. Each entry pairs the name
# an operation input/output is known by with the primitive type it carries.
definitions = [
    # Data passed into and returned from the operations.
    Definition(name="input_data", primitive="List[List[int]]"),
    Definition(name="target_data", primitive="List[int]"),
    Definition(name="output_data", primitive="List[List[int]]"),
    # Scalar / configuration parameters.
    Definition(name="n_components", primitive="int"),
    Definition(name="n_iter", primitive="int"),
    Definition(name="random_state", primitive="int"),
    Definition(name="missing_values", primitive="Any"),
    Definition(name="strategy", primitive="str"),
    Definition(name="categories", primitive="List[List[Any]]"),
    # Parameters introduced for the feature selection operations.
    Definition(name="percentile", primitive="int"),
    Definition(name="k", primitive="int"),
    Definition(name="score_func", primitive="function"),
]

for definition in definitions:
Expand Down
75 changes: 75 additions & 0 deletions operations/data/dffml_operations_data/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import f_classif, SelectKBest, SelectPercentile

from dffml.df.base import op

Expand All @@ -14,6 +15,10 @@
random_state,
n_components,
missing_values,
target_data,
k,
percentile,
score_func
)


Expand Down Expand Up @@ -206,8 +211,78 @@ async def ordinal_encoder(data):
Returns
-------
result: Encoded data for categorical values

References:

- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

"""
enc = OneHotEncoder()
enc.fit(data)
new_data = enc.transform(data).toarray()
return {"result": new_data}

@op(
    inputs={
        "data": input_data,
        "target_data": target_data,
        "k": k,
        "score_func": score_func,
    },
    outputs={"result": output_data},
)
async def select_k_best(data, target_data, score_func=f_classif, k=10):
    """
    Select the top k features, based on the score function.

    Parameters
    ----------
    data : List[List[int]]
        Input data, excluding the target column
    target_data : List[int]
        1D list containing values for the target column.
    score_func : function
        Function that takes in data and target_data, and returns
        a pair of arrays (scores, pvalues) or a single array with
        scores. Defaults to ``f_classif``.
    k : int
        Number of top features to select. Defaults to 10.

    Returns
    -------
    result: Input data reduced to the k highest scoring features

    References:

    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

    """
    # Scoring and column selection are delegated to scikit-learn; the
    # selector is fitted on the very data it then transforms.
    selector = SelectKBest(score_func=score_func, k=k)
    new_data = selector.fit_transform(data, target_data)
    return {"result": new_data}

@op(
    inputs={
        "data": input_data,
        "target_data": target_data,
        "percentile": percentile,
        "score_func": score_func,
    },
    outputs={"result": output_data},
)
async def select_percentile(
    data, target_data, score_func=f_classif, percentile=10
):
    """
    Select a certain top percentile of features, based on the score function.

    Parameters
    ----------
    data : List[List[int]]
        Input data, excluding the target column
    target_data : List[int]
        1D list containing values for the target column.
    score_func : function
        Function that takes in data and target_data, and returns
        a pair of arrays (scores, pvalues) or a single array with
        scores. Defaults to ``f_classif``.
    percentile : int
        Percentile of top features to select. Defaults to 10.

    Returns
    -------
    result: Input data reduced to the features within the top percentile

    References:

    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html

    """
    # Scoring and column selection are delegated to scikit-learn; the
    # selector is fitted on the very data it then transforms.
    selector = SelectPercentile(score_func=score_func, percentile=percentile)
    new_data = selector.fit_transform(data, target_data)
    return {"result": new_data}
75 changes: 74 additions & 1 deletion operations/data/tests/test_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dffml.operation.output import GetSingle
from dffml.df.memory import MemoryOrchestrator
from dffml.util.asynctestcase import AsyncTestCase

from sklearn.feature_selection import f_classif
from dffml_operations_data.operations import *


Expand Down Expand Up @@ -225,3 +225,76 @@ async def test_ordinal_encoder(self):
== output_data
).all()
)

async def test_select_k_best(self):
    """Run select_k_best in a dataflow with k=1 and check its output."""
    # Local names deliberately differ from the star-imported
    # input_data / target_data / output_data definitions.
    features = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
    labels = [1, 2, 1, 2, 1, 2]
    # Equal to the second column of ``features``.
    expected = [[1], [2], [1], [2], [1], [1]]

    dataflow = DataFlow.auto(select_k_best, GetSingle)
    result_name = select_k_best.op.outputs["result"].name
    op_inputs = select_k_best.op.inputs
    seed = [
        # Ask GetSingle to hand back the operation's result.
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=features, definition=op_inputs["data"]),
        Input(value=labels, definition=op_inputs["target_data"]),
        Input(value=f_classif, definition=op_inputs["score_func"]),
        Input(value=1, definition=op_inputs["k"]),
    ]

    async for _ctx, results in MemoryOrchestrator.run(dataflow, seed):
        matches = results[result_name] == expected
        self.assertTrue(matches.all())
async def test_select_percentile(self):
    """Run select_percentile in a dataflow at 50% and check its output."""
    # Local names deliberately differ from the star-imported
    # input_data / target_data / output_data definitions.
    features = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
    labels = [1, 2, 1, 2, 1, 2]
    # Equal to the second column of ``features``.
    expected = [[1], [2], [1], [2], [1], [1]]

    dataflow = DataFlow.auto(select_percentile, GetSingle)
    result_name = select_percentile.op.outputs["result"].name
    op_inputs = select_percentile.op.inputs
    seed = [
        # Ask GetSingle to hand back the operation's result.
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=features, definition=op_inputs["data"]),
        Input(value=labels, definition=op_inputs["target_data"]),
        Input(value=f_classif, definition=op_inputs["score_func"]),
        Input(value=50, definition=op_inputs["percentile"]),
    ]

    async for _ctx, results in MemoryOrchestrator.run(dataflow, seed):
        matches = results[result_name] == expected
        self.assertTrue(matches.all())