intel · seraphimstreets · Jun 22, 2022 · Jul 29, 2022 · johnandersen777 · Jul 24, 2022
diff --git a/operations/data/dffml_operations_data/definitions.py b/operations/data/dffml_operations_data/definitions.py
@@ -3,13 +3,17 @@
 
 definitions = [
     Definition(name="input_data", primitive="List[List[int]]"),
+    # Target column values consumed by select_k_best / select_percentile
+    Definition(name="target_data", primitive="List[int]"),
     Definition(name="output_data", primitive="List[List[int]]"),
     Definition(name="n_components", primitive="int"),
     Definition(name="n_iter", primitive="int"),
     Definition(name="random_state", primitive="int"),
     Definition(name="missing_values", primitive="Any"),
     Definition(name="strategy", primitive="str"),
     Definition(name="categories", primitive="List[List[Any]]"),
+    # Percentile of top features kept by select_percentile
+    Definition(name="percentile",  primitive="int"),
+    # Number of top features kept by select_k_best
+    Definition(name="k",  primitive="int"),
+    # Scoring callable accepted by both feature-selection operations
+    Definition(name="score_func", primitive="function")
 ]
 
 for definition in definitions:

diff --git a/operations/data/dffml_operations_data/operations.py b/operations/data/dffml_operations_data/operations.py
@@ -2,6 +2,7 @@
 from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.impute import SimpleImputer
+from sklearn.feature_selection import f_classif, SelectKBest, SelectPercentile
 
 from dffml.df.base import op
 
@@ -14,6 +15,10 @@
     random_state,
     n_components,
     missing_values,
+    target_data,
+    k,
+    percentile,
+    score_func
 )
 
 
@@ -206,8 +211,78 @@ async def ordinal_encoder(data):
     Returns
     -------
     result: Encoded data for categorical values
+
+    References:
+
+        - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
+
     """
     enc = OneHotEncoder()
     enc.fit(data)
     new_data = enc.transform(data).toarray()
     return {"result": new_data}
+
+@op(
+    inputs={"data": input_data, "target_data": target_data, "k": k, "score_func": score_func},
+    outputs={"result": output_data}
+)
+async def select_k_best(data, target_data, score_func=f_classif, k=10):
+    """
+    Select the top k features, based on the score function.
+
+    Parameters
+    ----------
+    data : List[List[int]]
+        Input data, excluding the target column
+    target_data : List[int]
+        1D list containing values for the target column.
+    score_func : function
+        Function that takes in data and target_data, and returns
+        a pair of arrays (scores, pvalues) or a single array with
+        scores. Defaults to ``f_classif``.
+    k : int
+        Number of top features to select. Defaults to 10.
+
+    Returns
+    -------
+    result: Input data reduced to the k selected features
+
+    References:
+
+        - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
+
+    """
+
+    selector = SelectKBest(score_func, k=k)
+    # Score the features against target_data and keep only the selected ones
+    new_data = selector.fit_transform(data, target_data)
+    return {"result": new_data}
+
+@op(
+    inputs={"data": input_data, "target_data": target_data, "percentile": percentile, "score_func": score_func},
+    outputs={"result": output_data}
+)
+async def select_percentile(data, target_data, score_func=f_classif, percentile=10):
+    """
+    Select a certain top percentile of features, based on the score function.
+
+    Parameters
+    ----------
+    data : List[List[int]]
+        Input data, excluding the target column
+    target_data : List[int]
+        1D list containing values for the target column.
+    score_func : function
+        Function that takes in data and target_data, and returns
+        a pair of arrays (scores, pvalues) or a single array with
+        scores. Defaults to ``f_classif``.
+    percentile : int
+        Percentile of top features to select. Defaults to 10.
+
+    Returns
+    -------
+    result: Input data reduced to the features in the selected percentile
+
+    References:
+
+        - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
+
+    """
+
+    selector = SelectPercentile(score_func, percentile=percentile)
+    # Score the features against target_data and keep only the selected ones
+    new_data = selector.fit_transform(data, target_data)
+    return {"result": new_data}
diff --git a/operations/data/tests/test_operations.py b/operations/data/tests/test_operations.py
@@ -5,7 +5,7 @@
 from dffml.operation.output import GetSingle
 from dffml.df.memory import MemoryOrchestrator
 from dffml.util.asynctestcase import AsyncTestCase
-
+from sklearn.feature_selection import f_classif
 from dffml_operations_data.operations import *
 
 
@@ -225,3 +225,76 @@ async def test_ordinal_encoder(self):
                     == output_data
                 ).all()
             )
+
+    async def test_select_k_best(self):
+        """
+        Run select_k_best in a dataflow with f_classif and k=1, and check
+        that the result equals the expected single-column data.
+        """
+        input_data = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
+        target_data = [1,2,1,2,1,2]
+        # Expected result is the second column of input_data
+        output_data = [[1], [2], [1], [2], [1], [1]]
+
+        async for ctx, results in MemoryOrchestrator.run(
+            DataFlow.auto(select_k_best, GetSingle),
+            [
+                # Spec for GetSingle: the name of the operation's "result"
+                # output definition, which is the key read from results below
+                Input(
+                    value=[select_k_best.op.outputs["result"].name],
+                    definition=GetSingle.op.inputs["spec"],
+                ),
+                Input(
+                    value=input_data,
+                    definition=select_k_best.op.inputs["data"],
+                ),
+                Input(
+                    value=target_data,
+                    definition=select_k_best.op.inputs["target_data"],
+                ),
+                Input(
+                    value=f_classif,
+                    definition=select_k_best.op.inputs["score_func"],
+                ),
+                # Keep a single feature
+                Input(
+                    value=1,
+                    definition=select_k_best.op.inputs["k"],
+                ),
+            ],
+        ):
+            # Elementwise comparison, every entry must match
+            # (assumes the result is a NumPy array - TODO confirm)
+            self.assertTrue(
+                (
+                    results[select_k_best.op.outputs["result"].name]
+                    == output_data
+                ).all()
+            )
+    async def test_select_percentile(self):
+        """
+        Run select_percentile in a dataflow with f_classif and percentile=50,
+        and check that the result equals the expected single-column data.
+        """
+        input_data = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
+        target_data = [1,2,1,2,1,2]
+        # Expected result is the second column of input_data
+        output_data = [[1], [2], [1], [2], [1], [1]]
+
+        async for ctx, results in MemoryOrchestrator.run(
+            DataFlow.auto(select_percentile, GetSingle),
+            [
+                # Spec for GetSingle: the name of the operation's "result"
+                # output definition, which is the key read from results below
+                Input(
+                    value=[select_percentile.op.outputs["result"].name],
+                    definition=GetSingle.op.inputs["spec"],
+                ),
+                Input(
+                    value=input_data,
+                    definition=select_percentile.op.inputs["data"],
+                ),
+                Input(
+                    value=target_data,
+                    definition=select_percentile.op.inputs["target_data"],
+                ),
+                Input(
+                    value=f_classif,
+                    definition=select_percentile.op.inputs["score_func"],
+                ),
+                # input_data has two feature columns; the expected output has one
+                Input(
+                    value=50,
+                    definition=select_percentile.op.inputs["percentile"],
+                ),
+            ],
+        ):
+            # Elementwise comparison, every entry must match
+            # (assumes the result is a NumPy array - TODO confirm)
+            self.assertTrue(
+                (
+                    results[select_percentile.op.outputs["result"].name]
+                    == output_data
+                ).all()
+            )