From 7942cd09f66762d2c40c7a7fbf6db3e7ea091e2a Mon Sep 17 00:00:00 2001 From: Eden Wu Date: Sat, 31 Aug 2024 20:28:15 -0400 Subject: [PATCH] update to regressor --- alpha_automl/hyperparameter_tuning/smac.py | 11 +- .../smac_parameters.json | 229 ++- examples/s4e9.ipynb | 1297 +++++++---------- 3 files changed, 786 insertions(+), 751 deletions(-) diff --git a/alpha_automl/hyperparameter_tuning/smac.py b/alpha_automl/hyperparameter_tuning/smac.py index 715781f..b9f667a 100644 --- a/alpha_automl/hyperparameter_tuning/smac.py +++ b/alpha_automl/hyperparameter_tuning/smac.py @@ -41,11 +41,14 @@ def gen_pipeline(config, pipeline): if step_type == 'COLUMN_TRANSFORMER': transformers = [] - for trans_name, _, trans_index in step_obj.__dict__['transformers']: + for trans_name, trans_obj_ori, trans_index in step_obj.__dict__['transformers']: trans_prim_name = trans_name.split('-')[0] - trans_obj = create_object(trans_prim_name, get_primitive_params(config, trans_prim_name)) - transformers.append((trans_name, trans_obj, trans_index)) - step_obj.__dict__['transformers'] = transformers + if "alpha_automl.builtin_primitives.math_features" in trans_prim_name: + transformers.append((trans_name, trans_obj_ori, trans_index)) + else: + trans_obj = create_object(trans_prim_name, get_primitive_params(config, trans_prim_name)) + transformers.append((trans_name, trans_obj, trans_index)) + step_obj.__dict__['transformers'] = transformers new_pipeline.steps.append([step_name, create_object(step_name, step_obj.__dict__)]) else: new_pipeline.steps.append([step_name, create_object(step_name, get_primitive_params(config, step_name))]) diff --git a/alpha_automl/hyperparameter_tuning/smac_parameters.json b/alpha_automl/hyperparameter_tuning/smac_parameters.json index 82da4b3..d994052 100644 --- a/alpha_automl/hyperparameter_tuning/smac_parameters.json +++ b/alpha_automl/hyperparameter_tuning/smac_parameters.json @@ -274,8 +274,229 @@ }, "sklearn.preprocessing.PolynomialFeatures": {}, "alpha_automl.wrapper_primitives.llm_feature_engine.LLMFeatureGenerator": {}, - "feature_engine.creation.math_features.MathFeatures-sum": {}, - "feature_engine.creation.math_features.MathFeatures-mean": {}, - "feature_engine.creation.math_features.MathFeatures-prod": {}, - "feature_engine.creation.math_features.MathFeatures-std": {} + "alpha_automl.builtin_primitives.math_features.MathFeaturesProd": {}, + "alpha_automl.builtin_primitives.math_features.MathFeaturesMean": {}, + "alpha_automl.builtin_primitives.math_features.MathFeaturesSum": {}, + "alpha_automl.builtin_primitives.math_features.MathFeaturesStd": {}, + "sklearn.ensemble.ExtraTreesRegressor": { + "n_estimators": { + "type": "Integer", + "value": [ + 200, + 2000 + ], + "default": 400 + }, + "max_features": { + "type": "Float", + "value": [ + 0.1, + 1.0 + ], + "default": 1.0 + }, + "max_leaf_nodes": { + "type": "Integer", + "value": [ + 80, + 10000 + ], + "default": 80 + }, + "criterion": { + "type": "Categorical", + "value": [ + "entropy", + "gini" + ], + "default": "entropy" + } + }, + "sklearn.ensemble.RandomForestRegressor": { + "n_estimators": { + "type": "Integer", + "value": [ + 200, + 2000 + ], + "default": 400 + }, + "max_features": { + "type": "Float", + "value": [ + 0.1, + 1.0 + ], + "default": 1.0 + }, + "criterion": { + "type": "Categorical", + "value": [ + "entropy", + "gini" + ], + "default": "entropy" + } + }, + "xgboost.XGBRegressor": { + "n_estimators": { + "type": "Integer", + "value": [ + 200, + 2000 + ], + "default": 400 + }, + "max_leaves": { + "type": "Integer", + "value": [ + 80, + 10000 + ], + "default": 80 + }, + "min_child_weight": { + "type": "Float", + "value": [ + 0.001, + 0.1 + ], + "default": 0.1 + }, + "learning_rate": { + "type": "Float", + "value": [ + 0.01, + 0.3 + ], + "default": 0.1 + }, + "subsample": { + "type": "Float", + "value": [ + 0.5, + 1.0 + ], + "default": 1.0 + }, + "colsample_bylevel": { + "type": "Float", + "value": [ + 0.5, + 1.0 + ], + "default": 1.0 + }, + "colsample_bytree": { + "type": "Float", + "value": [ + 0.5, + 1.0 + ], + "default": 1.0 + } + }, + "lightgbm.LGBMRegressor": { + "n_estimators": { + "type": "Integer", + "value": [ + 200, + 2000 + ], + "default": 400 + }, + "num_leaves": { + "type": "Integer", + "value": [ + 80, + 10000 + ], + "default": 80 + }, + "min_child_samples": { + "type": "Integer", + "value": [ + 20, + 100 + ], + "default": 20 + }, + "learning_rate": { + "type": "Float", + "value": [ + 0.001, + 0.3 + ], + "default": 0.1 + }, + "log_max_bin": { + "type": "Integer", + "value": [ + 6, + 10 + ], + "default": 8 + }, + "colsample_bytree": { + "type": "Float", + "value": [ + 0.3, + 1.0 + ], + "default": 1.0 + }, + "verbose": { + "type": "Constant", + "value": -1, + "default": -1 + } + }, + "catboost.CatBoostRegressor": { + "logging_level": { + "type": "Constant", + "value": "Silent", + "default": "Silent" + }, + "learning_rate": { + "type": "Float", + "value": [ + 0.01, + 0.3 + ], + "default": 0.1 + }, + "depth": { + "type": "Integer", + "value": [ + 1, + 6 + ], + "default": 6 + }, + "l2_leaf_reg": { + "type": "Float", + "value": [ + 1, + 10 + ], + "default": 3 + }, + "border_count": { + "type": "Integer", + "value": [ + 32, + 255 + ], + "default": 254 + }, + "auto_class_weights": { + "type": "Categorical", + "value": [ + "None", + "Balanced", + "SqrtBalanced" + ], + "default": "None" + } + } } diff --git a/examples/s4e9.ipynb b/examples/s4e9.ipynb index 94f70b4..529615a 100644 --- a/examples/s4e9.ipynb +++ b/examples/s4e9.ipynb @@ -12,8 +12,8 @@ "text": [ "/ext3/miniconda3/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", - "2024-08-31 18:44:55,024\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", - "2024-08-31 18:44:55,574\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + "2024-08-31 20:27:25,109\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", + "2024-08-31 20:27:25,695\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" ] }, { @@ -25,14 +25,14 @@ } ], "source": [ - "from alpha_automl import AutoMLClassifier\n", + "from alpha_automl import AutoMLRegressor\n", "import pandas as pd\n", "import numpy as np\n", "\n", "%env OPENAI_API_KEY=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\n", "\n", - "train_dataset = pd.read_csv('datasets/s4e8/train.csv').sample(200000)\n", - "test_dataset = pd.read_csv('datasets/s4e8/test.csv')" + "train_dataset = pd.read_csv('datasets/s4e9/train.csv').sample(20000)\n", + "test_dataset = pd.read_csv('datasets/s4e9/test.csv')" ] }, { @@ -62,143 +62,89 @@ " \n", " \n", " \n", - " cap-diameter\n", - " cap-shape\n", - " cap-surface\n", - " cap-color\n", - " does-bruise-or-bleed\n", - " gill-attachment\n", - " gill-spacing\n", - " gill-color\n", - " stem-height\n", - " stem-width\n", - " stem-root\n", - " stem-surface\n", - " stem-color\n", - " veil-type\n", - " veil-color\n", - " has-ring\n", - " ring-type\n", - " spore-print-color\n", - " habitat\n", - " season\n", + " brand\n", + " model\n", + " model_year\n", + " milage\n", + " fuel_type\n", + " engine\n", + " transmission\n", + " ext_col\n", + " int_col\n", + " accident\n", + " clean_title\n", " \n", " \n", " \n", " \n", - " 2006170\n", - " 0.76\n", - " x\n", - " g\n", - " o\n", - " f\n", - " NaN\n", - " NaN\n", - " y\n", - " 3.59\n", - " 0.82\n", - " NaN\n", - " NaN\n", - " w\n", - " NaN\n", - " NaN\n", - " f\n", - " f\n", - " NaN\n", - " d\n", - " a\n", + " 71793\n", + " Toyota\n", + " Tacoma TRD Sport\n", + " 2013\n", + " 135606\n", + " Gasoline\n", + " 159.0HP 2.7L 4 Cylinder Engine Gasoline Fuel\n", + " 6-Speed M/T\n", + " Red\n", + " Black\n", + " None reported\n", + " Yes\n", " \n", " \n", - " 207690\n", - " 5.72\n", - " p\n", - " NaN\n", - " g\n", - " f\n", - " f\n", - " f\n", - " f\n", - " 5.37\n", - " 18.09\n", - " NaN\n", - " NaN\n", - " n\n", - " NaN\n", - " NaN\n", - " f\n", - " f\n", - " NaN\n", - " l\n", - " s\n", + " 33214\n", + " Ford\n", + " Mustang GT Premium\n", + " 2012\n", + " 90000\n", + " Gasoline\n", + " 412.0HP 5.0L 8 Cylinder Engine Gasoline Fuel\n", + " 6-Speed M/T\n", + " Black\n", + " Black\n", + " At least 1 accident or damage reported\n", + " Yes\n", " \n", " \n", - " 2396937\n", - " 2.24\n", - " x\n", - " i\n", - " l\n", - " f\n", - " a\n", - " NaN\n", - " p\n", - " 4.11\n", - " 2.57\n", - " NaN\n", - " NaN\n", - " w\n", - " NaN\n", - " NaN\n", - " f\n", - " f\n", - " p\n", - " g\n", - " u\n", + " 4936\n", + " INFINITI\n", + " G35 Base\n", + " 2000\n", + " 96000\n", + " Gasoline\n", + " 298.0HP 3.5L V6 Cylinder Engine Gasoline Fuel\n", + " 6-Speed M/T\n", + " Black\n", + " Black\n", + " None reported\n", + " Yes\n", " \n", " \n", - " 890751\n", - " 1.52\n", - " b\n", - " h\n", - " n\n", - " f\n", - " a\n", - " NaN\n", - " n\n", - " 5.01\n", - " 1.66\n", - " NaN\n", - " t\n", - " n\n", - " NaN\n", + " 121116\n", + " Acura\n", + " TLX A-Spec\n", + " 2023\n", + " 14381\n", + " Gasoline\n", + " 2.0L I4 16V GDI DOHC Turbo\n", + " 9-Speed Automatic\n", + " Gray\n", + " Black/Gun Metal\n", + " None reported\n", " NaN\n", - " f\n", - " f\n", - " NaN\n", - " g\n", - " a\n", " \n", " \n", - " 530679\n", - " 7.67\n", - " f\n", - " NaN\n", - " y\n", - " f\n", - " a\n", - " c\n", - " w\n", - " 6.54\n", - " 17.52\n", - " NaN\n", - " NaN\n", - " w\n", - " NaN\n", - " NaN\n", - " f\n", - " f\n", - " NaN\n", - " d\n", - " a\n", + " 36972\n", + " Chrysler\n", + " 300 Touring\n", + " 2009\n", + " 185000\n", + " Gasoline\n", + " 250.0HP 3.5L V6 Cylinder Engine Gasoline Fuel\n", + " A/T\n", + " Gray\n", + " Gray\n", + " At least 1 accident or damage reported\n", + " Yes\n", " \n", " \n", " ...\n", @@ -213,190 +159,136 @@ " ...\n", " ...\n", " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", - " 2537501\n", - " 5.67\n", - " x\n", - " h\n", - " y\n", - " t\n", - " x\n", - " c\n", - " y\n", - " 6.71\n", - " 14.65\n", - " NaN\n", - " NaN\n", - " w\n", - " NaN\n", - " NaN\n", - " f\n", - " f\n", - " NaN\n", - " d\n", - " u\n", + " 3948\n", + " RAM\n", + " 1500 SLT\n", + " 2017\n", + " 97700\n", + " Gasoline\n", + " 395.0HP 5.7L 8 Cylinder Engine Gasoline Fuel\n", + " A/T\n", + " Gray\n", + " Gray\n", + " At least 1 accident or damage reported\n", + " Yes\n", " \n", " \n", - " 1498074\n", - " 5.85\n", - " x\n", - " s\n", - " b\n", - " f\n", - " x\n", - " c\n", - " w\n", - " 6.47\n", - " 9.63\n", - " NaN\n", - " s\n", - " n\n", + " 84168\n", + " Tesla\n", + " Model X Long Range Plus\n", + " 2020\n", + " 46000\n", " NaN\n", - " NaN\n", - " f\n", - " f\n", - " NaN\n", - " d\n", - " w\n", + " 557.0HP Electric Motor Electric Fuel System\n", + " A/T\n", + " Gray\n", + " Black\n", + " None reported\n", + " Yes\n", " \n", " \n", - " 1399239\n", - " 3.59\n", - " f\n", - " l\n", - " y\n", - " f\n", - " x\n", - " d\n", - " y\n", - " 5.16\n", - " 4.47\n", - " NaN\n", - " NaN\n", - " n\n", - " NaN\n", - " NaN\n", - " f\n", - " f\n", - " NaN\n", - " l\n", - " u\n", + " 186205\n", + " Mercedes-Benz\n", + " GLS 450 Base 4MATIC\n", + " 2018\n", + " 7100\n", + " Gasoline\n", + " 362.0HP 3.0L V6 Cylinder Engine Gasoline Fuel\n", + " 7-Speed A/T\n", + " White\n", + " Beige\n", + " None reported\n", + " Yes\n", " \n", " \n", - " 275415\n", - " 9.80\n", - " f\n", - " NaN\n", - " n\n", - " f\n", - " s\n", - " c\n", - " g\n", - " 7.09\n", - " 21.54\n", - " b\n", - " NaN\n", - " u\n", - " NaN\n", + " 38335\n", + " Mercedes-Benz\n", + " GLC 300 GLC 300\n", + " 2022\n", + " 2900\n", + " Gasoline\n", + " 2.0 Liter Turbo\n", + " Automatic\n", + " Graphite Grey Metallic\n", + " –\n", + " None reported\n", " NaN\n", - " f\n", - " f\n", - " NaN\n", - " g\n", - " a\n", " \n", " \n", - " 252426\n", - " 4.61\n", - " o\n", - " NaN\n", - " n\n", - " f\n", - " f\n", - " f\n", - " f\n", - " 5.27\n", - " 19.09\n", - " NaN\n", - " NaN\n", - " n\n", - " NaN\n", - " NaN\n", - " f\n", - " f\n", - " NaN\n", - " d\n", - " s\n", + " 93996\n", + " Porsche\n", + " 911 Carrera\n", + " 2020\n", + " 8755\n", + " Gasoline\n", + " 379.0HP 3.0L Flat 6 Cylinder Engine Gasoline Fuel\n", + " 8-Speed A/T\n", + " Silver\n", + " Black\n", + " None reported\n", + " Yes\n", " \n", " \n", "\n", - "

100000 rows × 20 columns

\n", + "

20000 rows × 11 columns

\n", "" ], "text/plain": [ - " cap-diameter cap-shape cap-surface cap-color does-bruise-or-bleed \\\n", - "2006170 0.76 x g o f \n", - "207690 5.72 p NaN g f \n", - "2396937 2.24 x i l f \n", - "890751 1.52 b h n f \n", - "530679 7.67 f NaN y f \n", - "... ... ... ... ... ... \n", - "2537501 5.67 x h y t \n", - "1498074 5.85 x s b f \n", - "1399239 3.59 f l y f \n", - "275415 9.80 f NaN n f \n", - "252426 4.61 o NaN n f \n", + " brand model model_year milage fuel_type \\\n", + "71793 Toyota Tacoma TRD Sport 2013 135606 Gasoline \n", + "33214 Ford Mustang GT Premium 2012 90000 Gasoline \n", + "4936 INFINITI G35 Base 2000 96000 Gasoline \n", + "121116 Acura TLX A-Spec 2023 14381 Gasoline \n", + "36972 Chrysler 300 Touring 2009 185000 Gasoline \n", + "... ... ... ... ... ... \n", + "3948 RAM 1500 SLT 2017 97700 Gasoline \n", + "84168 Tesla Model X Long Range Plus 2020 46000 NaN \n", + "186205 Mercedes-Benz GLS 450 Base 4MATIC 2018 7100 Gasoline \n", + "38335 Mercedes-Benz GLC 300 GLC 300 2022 2900 Gasoline \n", + "93996 Porsche 911 Carrera 2020 8755 Gasoline \n", "\n", - " gill-attachment gill-spacing gill-color stem-height stem-width \\\n", - "2006170 NaN NaN y 3.59 0.82 \n", - "207690 f f f 5.37 18.09 \n", - "2396937 a NaN p 4.11 2.57 \n", - "890751 a NaN n 5.01 1.66 \n", - "530679 a c w 6.54 17.52 \n", - "... ... ... ... ... ... \n", - "2537501 x c y 6.71 14.65 \n", - "1498074 x c w 6.47 9.63 \n", - "1399239 x d y 5.16 4.47 \n", - "275415 s c g 7.09 21.54 \n", - "252426 f f f 5.27 19.09 \n", + " engine transmission \\\n", + "71793 159.0HP 2.7L 4 Cylinder Engine Gasoline Fuel 6-Speed M/T \n", + "33214 412.0HP 5.0L 8 Cylinder Engine Gasoline Fuel 6-Speed M/T \n", + "4936 298.0HP 3.5L V6 Cylinder Engine Gasoline Fuel 6-Speed M/T \n", + "121116 2.0L I4 16V GDI DOHC Turbo 9-Speed Automatic \n", + "36972 250.0HP 3.5L V6 Cylinder Engine Gasoline Fuel A/T \n", + "... ... ... \n", + "3948 395.0HP 5.7L 8 Cylinder Engine Gasoline Fuel A/T \n", + "84168 557.0HP Electric Motor Electric Fuel System A/T \n", + "186205 362.0HP 3.0L V6 Cylinder Engine Gasoline Fuel 7-Speed A/T \n", + "38335 2.0 Liter Turbo Automatic \n", + "93996 379.0HP 3.0L Flat 6 Cylinder Engine Gasoline Fuel 8-Speed A/T \n", "\n", - " stem-root stem-surface stem-color veil-type veil-color has-ring \\\n", - "2006170 NaN NaN w NaN NaN f \n", - "207690 NaN NaN n NaN NaN f \n", - "2396937 NaN NaN w NaN NaN f \n", - "890751 NaN t n NaN NaN f \n", - "530679 NaN NaN w NaN NaN f \n", - "... ... ... ... ... ... ... \n", - "2537501 NaN NaN w NaN NaN f \n", - "1498074 NaN s n NaN NaN f \n", - "1399239 NaN NaN n NaN NaN f \n", - "275415 b NaN u NaN NaN f \n", - "252426 NaN NaN n NaN NaN f \n", + " ext_col int_col \\\n", + "71793 Red Black \n", + "33214 Black Black \n", + "4936 Black Black \n", + "121116 Gray Black/Gun Metal \n", + "36972 Gray Gray \n", + "... ... ... \n", + "3948 Gray Gray \n", + "84168 Gray Black \n", + "186205 White Beige \n", + "38335 Graphite Grey Metallic – \n", + "93996 Silver Black \n", "\n", - " ring-type spore-print-color habitat season \n", - "2006170 f NaN d a \n", - "207690 f NaN l s \n", - "2396937 f p g u \n", - "890751 f NaN g a \n", - "530679 f NaN d a \n", - "... ... ... ... ... \n", - "2537501 f NaN d u \n", - "1498074 f NaN d w \n", - "1399239 f NaN l u \n", - "275415 f NaN g a \n", - "252426 f NaN d s \n", + " accident clean_title \n", + "71793 None reported Yes \n", + "33214 At least 1 accident or damage reported Yes \n", + "4936 None reported Yes \n", + "121116 None reported NaN \n", + "36972 At least 1 accident or damage reported Yes \n", + "... ... ... \n", + "3948 At least 1 accident or damage reported Yes \n", + "84168 None reported Yes \n", + "186205 None reported Yes \n", + "38335 None reported NaN \n", + "93996 None reported Yes \n", "\n", - "[100000 rows x 20 columns]" + "[20000 rows x 11 columns]" ] }, "execution_count": 2, @@ -405,7 +297,7 @@ } ], "source": [ - "target_column = 'class'\n", + "target_column = 'price'\n", "X_train = train_dataset.drop(columns=[target_column, 'id'])\n", "y_train = train_dataset[[target_column]]\n", "X_train" @@ -413,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "2e13738e-0446-4557-9a94-f26ff18c3d10", "metadata": {}, "outputs": [], @@ -424,141 +316,31 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "036e9c02-0cd8-4ab3-884f-93a2782e0c42", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 0 / 100000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 2024 / 100000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 37581 / 100000\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", " y = column_or_1d(y, warn=True)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/datamart_profiler/core.py:199: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", - " data = data.astype(object).fillna('').astype(str)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training, score: 0.9919818399939008...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 1490 / 100000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 5572 / 100000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n", - ":6: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + ":5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", - ":7: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - ":8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + ":6: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", - ":14: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + ":7: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", @@ -567,78 +349,16 @@ "/ext3/miniconda3/lib/python3.9/site-packages/datamart_profiler/core.py:199: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " data = data.astype(object).fillna('').astype(str)\n", "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " return fit_method(estimator, *args, **kwargs)\n", - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", " return fit_method(estimator, *args, **kwargs)\n" ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training, score: 0.9903666194552061...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 3513 / 100000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 2263 / 100000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/ext3/miniconda3/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LLMFeatureGenerator generate data with bad quality: 2024 / 100000\n" - ] } ], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", - "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.preprocessing import LabelEncoder\n", - "from sklearn.metrics import f1_score, roc_auc_score\n", + "from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error\n", "from sklearn.metrics import make_scorer\n", "from sklearn.model_selection import cross_val_score\n", "\n", @@ -688,7 +408,19 @@ " y_gen = label_encoder.fit_transform(y_train)\n", " \n", " generator = LLMFeatureGenerator(\n", - " description=\"Here's the information of the target dataset: This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like ``leaflets three, let it be'' for Poisonous Oak and Ivy.\"\n", + " description=\"\"\"\n", + "Used Car Price Prediction Dataset is a comprehensive collection of automotive information extracted from the popular automotive marketplace website, https://www.cars.com. This dataset comprises 4,009 data points, each representing a unique vehicle listing, and includes nine distinct features providing valuable insights into the world of automobiles.\n", + " Brand & Model: Identify the brand or company name along with the specific model of each vehicle.\n", + " Model Year: Discover the manufacturing year of the vehicles, crucial for assessing depreciation and technology advancements.\n", + " Mileage: Obtain the mileage of each vehicle, a key indicator of wear and tear and potential maintenance requirements.\n", + " Fuel Type: Learn about the type of fuel the vehicles run on, whether it's gasoline, diesel, electric, or hybrid.\n", + " Engine Type: Understand the engine specifications, shedding light on performance and efficiency.\n", + " Transmission: Determine the transmission type, whether automatic, manual, or another variant.\n", + " Exterior & Interior Colors: Explore the aesthetic aspects of the vehicles, including exterior and interior color options.\n", + " Accident History: Discover whether a vehicle has a prior history of accidents or damage, crucial for informed decision-making.\n", + " Clean Title: Evaluate the availability of a clean title, which can impact the vehicle's resale value and legal status.\n", + " Price: Access the listed prices for each vehicle, aiding in price comparison and budgeting.\n", + "This dataset is a valuable resource for automotive enthusiasts, buyers, and researchers interested in analyzing trends, making informed purchasing decisions or conducting studies related to the automotive industry and consumer preferences. Whether you are a data analyst, car buyer, or researcher, this dataset offers a wealth of information to explore and analyze.\"\"\"\n", " )\n", " X_gen = generator.fit_transform(X_train, y_train)\n", " X_gen[target_column] = y_gen\n", @@ -705,13 +437,13 @@ " pipeline = Pipeline(steps=[\n", " ('sklearn.impute.SimpleImputer', SimpleImputer(strategy='most_frequent', keep_empty_features=True)),\n", " ('sklearn.compose.ColumnTransformer', generate_column_transformer(metadata)),\n", - " ('sklearn.ensemble.RandomForestClassifier', RandomForestClassifier())\n", + " ('sklearn.ensemble.RandomForestRegressor', RandomForestRegressor())\n", " ])\n", " \n", " \n", " pipeline.fit(X_gen, y_gen)\n", " \n", - " scores = cross_val_score(pipeline, X_gen, y_gen, scoring=make_scorer(f1_score), cv=5)\n", + " scores = cross_val_score(pipeline, X_gen, y_gen, scoring=make_scorer(mean_squared_error), cv=5)\n", " score = sum(scores) / len(scores)\n", " if score > best_score:\n", " best_score = score\n", @@ -729,10 +461,45 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, + "id": "cfde5200-86d2-44e9-9b0e-6cc8a302ebfe", + "metadata": {}, + "outputs": [], + "source": [ + "code = llm_featgen.code\n", + "code" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "a294fef2-465b-4610-b099-5edf96a1fa4c", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":6: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + ":7: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + ":8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n" + ] + }, { "data": { "text/html": [ @@ -773,130 +540,130 @@ " veil_type_missing\n", " veil_color_missing\n", " spore_print_color_missing\n", - " cap_diameter_to_stem_height_ratio\n", - " stem_width_to_stem_height_ratio\n", + " cap_shape_surface_interaction\n", + " stem_dimensions_ratio\n", " \n", " \n", " \n", " \n", - " 2006170\n", - " 0.76\n", - " x\n", - " g\n", - " o\n", + " 857763\n", + " 5.99\n", + " p\n", + " s\n", + " w\n", " f\n", - " missing\n", - " missing\n", - " y\n", - " 3.59\n", - " 0.82\n", + " e\n", + " c\n", + " p\n", + " 18.48\n", + " 12.66\n", " ...\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 0.211699\n", - " 0.228412\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " p_s\n", + " 1.459716\n", " \n", " \n", - " 207690\n", - " 5.72\n", - " p\n", - " missing\n", - " g\n", - " f\n", - " f\n", - " f\n", + " 360364\n", + " 8.58\n", + " x\n", + " s\n", + " w\n", " f\n", - " 5.37\n", - " 18.09\n", + " s\n", + " c\n", + " w\n", + " 7.22\n", + " 31.27\n", " ...\n", - " True\n", - " False\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 1.065177\n", - " 3.368715\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " x_s\n", + " 0.230892\n", " \n", " \n", - " 2396937\n", - " 2.24\n", - " x\n", - " i\n", - " l\n", + " 839097\n", + " 8.25\n", " f\n", - " a\n", - " missing\n", - " p\n", - " 4.11\n", - " 2.57\n", + " t\n", + " e\n", + " f\n", + " x\n", + " d\n", + " w\n", + " 6.46\n", + " 17.42\n", " ...\n", - " False\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " False\n", - " 0.545012\n", - " 0.625304\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " f_t\n", + " 0.370838\n", " \n", " \n", - " 890751\n", - " 1.52\n", - " b\n", - " h\n", - " n\n", - " f\n", - " a\n", - " missing\n", + " 3013331\n", + " 10.58\n", + " x\n", + " e\n", " n\n", - " 5.01\n", - " 1.66\n", + " t\n", + " p\n", + " unknown\n", + " o\n", + " 6.63\n", + " 23.48\n", " ...\n", - " False\n", - " False\n", - " True\n", - " True\n", - " False\n", - " True\n", - " True\n", - " True\n", - " 0.303393\n", - " 0.331337\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " x_e\n", + " 0.282368\n", " \n", " \n", - " 530679\n", - " 7.67\n", - " f\n", - " missing\n", - " y\n", + " 2834831\n", + " 15.45\n", + " x\n", + " s\n", + " n\n", " f\n", - " a\n", + " s\n", " c\n", - " w\n", - " 6.54\n", - " 17.52\n", + " g\n", + " 9.50\n", + " 19.29\n", " ...\n", - " True\n", - " False\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 1.172783\n", - " 2.678899\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " x_s\n", + " 0.492483\n", " \n", " \n", " ...\n", @@ -923,213 +690,213 @@ " ...\n", " \n", " \n", - " 2537501\n", - " 5.67\n", - " x\n", - " h\n", - " y\n", - " t\n", - " x\n", - " c\n", - " y\n", - " 6.71\n", - " 14.65\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 0.845007\n", - " 2.183308\n", - " \n", - " \n", - " 1498074\n", - " 5.85\n", + " 2247836\n", + " 4.34\n", " x\n", - " s\n", - " b\n", + " unknown\n", + " w\n", " f\n", " x\n", " c\n", " w\n", - " 6.47\n", - " 9.63\n", + " 7.93\n", + " 8.71\n", " ...\n", - " False\n", - " False\n", - " False\n", - " True\n", - " False\n", - " True\n", - " True\n", - " True\n", - " 0.904173\n", - " 1.488408\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " 0\n", + " 1\n", + " x_unknown\n", + " 0.910448\n", " \n", " \n", - " 1399239\n", - " 3.59\n", + " 703166\n", + " 3.09\n", " f\n", - " l\n", " y\n", + " n\n", " f\n", - " x\n", - " d\n", - " y\n", - " 5.16\n", - " 4.47\n", + " e\n", + " c\n", + " w\n", + " 5.21\n", + " 3.50\n", " ...\n", - " False\n", - " False\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 0.695736\n", - " 0.866279\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " f_y\n", + " 1.488571\n", " \n", " \n", - " 275415\n", - " 9.80\n", + " 2159902\n", + " 2.27\n", " f\n", - " missing\n", - " n\n", + " unknown\n", + " u\n", " f\n", - " s\n", + " a\n", " c\n", - " g\n", - " 7.09\n", - " 21.54\n", + " n\n", + " 3.42\n", + " 3.37\n", " ...\n", - " True\n", - " False\n", - " False\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 1.382228\n", - " 3.038082\n", + " 1\n", + " 0\n", + " 0\n", + " 1\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " f_unknown\n", + " 1.014837\n", " \n", " \n", - " 252426\n", - " 4.61\n", - " o\n", - " missing\n", + " 273217\n", + " 26.70\n", + " x\n", + " s\n", " n\n", " f\n", - " f\n", - " f\n", - " f\n", - " 5.27\n", - " 19.09\n", + " p\n", + " unknown\n", + " y\n", + " 5.91\n", + " 40.87\n", " ...\n", - " True\n", - " False\n", - " False\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 0.874763\n", - " 3.622391\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " x_s\n", + " 0.144605\n", + " \n", + " \n", + " 323296\n", + " 8.04\n", + " s\n", + " t\n", + " p\n", + " t\n", + " d\n", + " c\n", + " p\n", + " 7.16\n", + " 14.78\n", + " ...\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " s_t\n", + " 0.484438\n", " \n", " \n", "\n", - "

99982 rows × 26 columns

\n", + "

66988 rows × 27 columns

\n", "" ], "text/plain": [ " cap-diameter cap-shape cap-surface cap-color does-bruise-or-bleed \\\n", - "2006170 0.76 x g o f \n", - "207690 5.72 p missing g f \n", - "2396937 2.24 x i l f \n", - "890751 1.52 b h n f \n", - "530679 7.67 f missing y f \n", + "857763 5.99 p s w f \n", + "360364 8.58 x s w f \n", + "839097 8.25 f t e f \n", + "3013331 10.58 x e n t \n", + "2834831 15.45 x s n f \n", "... ... ... ... ... ... \n", - "2537501 5.67 x h y t \n", - "1498074 5.85 x s b f \n", - "1399239 3.59 f l y f \n", - "275415 9.80 f missing n f \n", - "252426 4.61 o missing n f \n", + "2247836 4.34 x unknown w f \n", + "703166 3.09 f y n f \n", + "2159902 2.27 f unknown u f \n", + "273217 26.70 x s n f \n", + "323296 8.04 s t p t \n", "\n", " gill-attachment gill-spacing gill-color stem-height stem-width ... \\\n", - "2006170 missing missing y 3.59 0.82 ... \n", - "207690 f f f 5.37 18.09 ... \n", - "2396937 a missing p 4.11 2.57 ... \n", - "890751 a missing n 5.01 1.66 ... \n", - "530679 a c w 6.54 17.52 ... \n", + "857763 e c p 18.48 12.66 ... \n", + "360364 s c w 7.22 31.27 ... \n", + "839097 x d w 6.46 17.42 ... \n", + "3013331 p unknown o 6.63 23.48 ... \n", + "2834831 s c g 9.50 19.29 ... \n", "... ... ... ... ... ... ... \n", - "2537501 x c y 6.71 14.65 ... \n", - "1498074 x c w 6.47 9.63 ... \n", - "1399239 x d y 5.16 4.47 ... \n", - "275415 s c g 7.09 21.54 ... \n", - "252426 f f f 5.27 19.09 ... \n", + "2247836 x c w 7.93 8.71 ... \n", + "703166 e c w 5.21 3.50 ... \n", + "2159902 a c n 3.42 3.37 ... \n", + "273217 p unknown y 5.91 40.87 ... \n", + "323296 d c p 7.16 14.78 ... \n", "\n", " cap_surface_missing gill_attachment_missing gill_spacing_missing \\\n", - "2006170 False True True \n", - "207690 True False False \n", - "2396937 False False True \n", - "890751 False False True \n", - "530679 True False False \n", + "857763 0 0 0 \n", + "360364 0 0 0 \n", + "839097 0 0 0 \n", + "3013331 0 0 1 \n", + "2834831 0 0 0 \n", "... ... ... ... \n", - "2537501 False False False \n", - "1498074 False False False \n", - "1399239 False False False \n", - "275415 True False False \n", - "252426 True False False \n", + "2247836 1 0 0 \n", + "703166 0 0 0 \n", + "2159902 1 0 0 \n", + "273217 0 0 1 \n", + "323296 0 0 0 \n", "\n", " stem_root_missing stem_surface_missing veil_type_missing \\\n", - "2006170 True True True \n", - "207690 True True True \n", - "2396937 True True True \n", - "890751 True False True \n", - "530679 True True True \n", + "857763 1 0 1 \n", + "360364 1 1 1 \n", + "839097 0 1 1 \n", + "3013331 1 1 1 \n", + "2834831 1 1 1 \n", "... ... ... ... \n", - "2537501 True True True \n", - "1498074 True False True \n", - "1399239 True True True \n", - "275415 False True True \n", - "252426 True True True \n", + "2247836 0 1 1 \n", + "703166 1 0 1 \n", + "2159902 1 0 1 \n", + "273217 1 1 1 \n", + "323296 1 1 1 \n", "\n", - " veil_color_missing spore_print_color_missing \\\n", - "2006170 True True \n", - "207690 True True \n", - "2396937 True False \n", - "890751 True True \n", - "530679 True True \n", - "... ... ... \n", - "2537501 True True \n", - "1498074 True True \n", - "1399239 True True \n", - "275415 True True \n", - "252426 True True \n", + " veil_color_missing spore_print_color_missing \\\n", + "857763 1 1 \n", + "360364 1 1 \n", + "839097 1 1 \n", + "3013331 1 1 \n", + "2834831 1 1 \n", + "... ... ... \n", + "2247836 0 1 \n", + "703166 1 1 \n", + "2159902 1 1 \n", + "273217 1 1 \n", + "323296 1 1 \n", "\n", - " cap_diameter_to_stem_height_ratio stem_width_to_stem_height_ratio \n", - "2006170 0.211699 0.228412 \n", - "207690 1.065177 3.368715 \n", - "2396937 0.545012 0.625304 \n", - "890751 0.303393 0.331337 \n", - "530679 1.172783 2.678899 \n", - "... ... ... \n", - "2537501 0.845007 2.183308 \n", - "1498074 0.904173 1.488408 \n", - "1399239 0.695736 0.866279 \n", - "275415 1.382228 3.038082 \n", - "252426 0.874763 3.622391 \n", + " cap_shape_surface_interaction stem_dimensions_ratio \n", + "857763 p_s 1.459716 \n", + "360364 x_s 0.230892 \n", + "839097 f_t 0.370838 \n", + "3013331 x_e 0.282368 \n", + "2834831 x_s 0.492483 \n", + "... ... ... \n", + "2247836 x_unknown 0.910448 \n", + "703166 f_y 1.488571 \n", + "2159902 f_unknown 1.014837 \n", + "273217 x_s 0.144605 \n", + "323296 s_t 0.484438 \n", "\n", - "[99982 rows x 26 columns]" + "[66988 rows x 27 columns]" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1145,13 +912,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "e6e2832f-fb1d-408d-8dac-080738643b3d", "metadata": {}, "outputs": [], "source": [ "import logging\n", - "automl = AutoMLClassifier(time_bound=20,\n", + "automl = AutoMLRegressor(time_bound=1,\n", " output_folder=\"tmp/\",\n", " checkpoints_folder=\"tmp/\",\n", " verbose=logging.INFO,\n", @@ -1161,7 +928,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "2f91a087-849c-4bc5-97a7-30fa9927511b", "metadata": {}, "outputs": [], @@ -1197,7 +964,7 @@ "text": [ ":job_id:01000000\n", ":actor_name:RolloutWorker\n", - "2024-08-31 19:06:56,196\tWARNING env.py:162 -- Your env doesn't have a .spec.max_episode_steps attribute. Your horizon will default to infinity, and your environment will not be reset.\n" + "2024-08-31 20:00:18,501\tWARNING env.py:162 -- Your env doesn't have a .spec.max_episode_steps attribute. Your horizon will default to infinity, and your environment will not be reset.\n" ] }, { @@ -1212,10 +979,34 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-08-31 19:07:13,298\tWARNING deprecation.py:50 -- DeprecationWarning: `_get_slice_indices` has been deprecated. This will raise an error in the future!\n", + "2024-08-31 20:00:35,317\tWARNING deprecation.py:50 -- DeprecationWarning: `_get_slice_indices` has been deprecated. This will raise an error in the future!\n", "TBB Warning: The number of workers is currently limited to 3. The request for 95 workers is ignored. Further requests for more workers will be silently ignored until the limit changes.\n", "\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO|2024-08-31 20:01:33|Scored pipeline, score=0.984\n", + "INFO|2024-08-31 20:01:34|Scored pipeline, score=0.98\n", + "INFO|2024-08-31 20:01:34|Scored pipeline, score=0.986\n", + "INFO|2024-08-31 20:01:35|Scored pipeline, score=0.986\n", + "INFO|2024-08-31 20:01:35|Scored pipeline, score=0.988\n", + "INFO|2024-08-31 20:01:35|Scored pipeline, score=0.98\n", + "INFO|2024-08-31 20:01:38|Scored pipeline, score=0.986\n", + "INFO|2024-08-31 20:01:40|Scored pipeline, score=0.986\n", + "INFO|2024-08-31 20:01:40|Scored pipeline, score=0.976\n", + "INFO|2024-08-31 20:01:41|Scored pipeline, score=0.978\n", + "INFO|2024-08-31 20:01:41|Scored pipeline, score=0.976\n", + "INFO|2024-08-31 20:01:41|Scored pipeline, score=0.98\n", + "INFO|2024-08-31 20:01:41|Scored pipeline, score=0.98\n", + "INFO|2024-08-31 20:01:41|Found 13 pipelines\n", + "[INFO][abstract_initial_design.py:147] Using 40 initial design configurations and 0 additional configurations.\n", + "[INFO][smbo.py:497] Continuing from previous run.\n", + "[INFO][abstract_intensifier.py:287] Added existing seed 209652396 from runhistory to the intensifier.\n", + "[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.\n" + ] } ], "source": [ @@ -1242,6 +1033,26 @@ "automl.score(X_val, y_val)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "2224ca93-6dc0-4d8a-9b31-307e4fcd9f18", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "def parse_by_llm(x):\n", + " loc = {}\n", + " access_scope = {\"df\": x, \"pd\": pd, \"np\": np}\n", + " parsed = ast.parse(code)\n", + " exec(compile(parsed, filename=\"\", mode=\"exec\"), access_scope, loc)\n", + "\n", + "X_test = test_dataset.drop(columns=['id'])\n", + "parse_by_llm(X_test)\n", + "X_test" + ] + }, { "cell_type": "code", "execution_count": null,