From c437bd731cb808fa44c6ec1e85cd787b844d15bc Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 26 Nov 2024 09:56:05 +0000 Subject: [PATCH 001/304] Draft new data science RDLoop --- rdagent/app/data_mining/conf.py | 2 +- rdagent/app/data_science/conf.py | 88 ++++++++++ rdagent/app/data_science/loop.py | 161 ++++++++++++++++++ rdagent/app/kaggle/conf.py | 2 +- rdagent/app/kaggle/loop.py | 4 +- rdagent/app/qlib_rd_loop/conf.py | 4 +- .../data_science/raw_data_loader/__init__.py | 12 ++ .../components/coder/factor_coder/factor.py | 8 +- rdagent/components/coder/model_coder/model.py | 3 +- rdagent/components/proposal/__init__.py | 4 +- rdagent/components/workflow/rd_loop.py | 4 +- rdagent/core/experiment.py | 14 +- rdagent/core/proposal.py | 22 ++- .../data_mining/developer/feedback.py | 7 +- .../data_mining/proposal/model_proposal.py | 4 +- rdagent/scenarios/data_science/dev/coder.py | 0 rdagent/scenarios/data_science/dev/runner.py | 0 .../data_science/proposal/__init__.py | 0 .../data_science/proposal/task_gen.py | 21 +++ .../scenarios/kaggle/developer/feedback.py | 7 +- rdagent/scenarios/kaggle/proposal/proposal.py | 12 +- rdagent/scenarios/qlib/developer/feedback.py | 13 +- .../qlib/proposal/factor_proposal.py | 4 +- .../scenarios/qlib/proposal/model_proposal.py | 4 +- 24 files changed, 359 insertions(+), 41 deletions(-) create mode 100644 rdagent/app/data_science/conf.py create mode 100644 rdagent/app/data_science/loop.py create mode 100644 rdagent/scenarios/data_science/dev/coder.py create mode 100644 rdagent/scenarios/data_science/dev/runner.py create mode 100644 rdagent/scenarios/data_science/proposal/__init__.py create mode 100644 rdagent/scenarios/data_science/proposal/task_gen.py diff --git a/rdagent/app/data_mining/conf.py b/rdagent/app/data_mining/conf.py index 45b1ef355..e8b6ab8f9 100644 --- a/rdagent/app/data_mining/conf.py +++ b/rdagent/app/data_mining/conf.py @@ -23,7 +23,7 @@ class MedBasePropSetting(BasePropSetting): runner: str = "rdagent.scenarios.data_mining.developer.model_runner.DMModelRunner" """Runner class""" - summarizer: str = "rdagent.scenarios.data_mining.developer.feedback.DMModelHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.data_mining.developer.feedback.DMModelExperiment2Feedback" """Summarizer class""" evolving_n: int = 10 diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py new file mode 100644 index 000000000..fa9ba290b --- /dev/null +++ b/rdagent/app/data_science/conf.py @@ -0,0 +1,88 @@ +from rdagent.components.workflow.conf import BasePropSetting +from rdagent.core.conf import ExtendedSettingsConfigDict + + +class KaggleBasePropSetting(BasePropSetting): + model_config = ExtendedSettingsConfigDict(env_prefix="DS_", protected_namespaces=()) + + # Main components + ## Scen + scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario" + """Scenario class for data mining model""" + + ## proposal + hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen" + """Hypothesis generation class""" + + hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment" + """Hypothesis to experiment class""" + + ## dev/coder + feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER" + """Feature Coder class""" + + model_feature_selection_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelFeatureSelectionCoder" + """Model Feature Selection Coder class""" + + model_coder: str = 
"rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER" + """Model Coder class""" + + ## dev/runner + feature_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGFactorRunner" + """Feature Runner class""" + + model_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGModelRunner" + """Model Runner class""" + + ## feedback + summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGExperiment2Feedback" + """Summarizer class""" + + # Configs + ## Base + competition: str = "" + """Kaggle competition name, e.g., 'sf-crime'""" + + template_path: str = "rdagent/scenarios/kaggle/experiment/templates" # TODO: we may not need this + """Kaggle competition base templates path""" + + local_data_path: str = "" + """Folder storing Kaggle competition data""" + + if_using_mle_data: bool = False + + ## Workflow + evolving_n: int = 10 + """Number of evolutions""" + + auto_submit: bool = False + """Automatically upload and submit each experiment result to Kaggle platform""" + + ### shared components in the workflow + # Conditionally set the knowledge_base based on the use of graph RAG + knowledge_base: str = "" + """Knowledge base class, uses 'KGKnowledgeGraph' when advanced graph-based RAG is enabled, otherwise empty.""" + + domain_knowledge_path: str = "/data/userdata/share/kaggle/domain_knowledge" # TODO: It should be sth like knowledge_base_kwargs + """Folder storing domain knowledge files in .case format""" + + knowledge_base_path: str = "kg_graph.pkl" + """Advanced version of graph-based RAG""" + + rag_path: str = "git_ignore_folder/kaggle_vector_base.pkl" + """Base version of vector-based RAG""" + + ## proposal + # (TODO: should goto sub config of proposal) + # Move to hypothesis_gen as a sub config instead of global config + if_action_choosing_based_on_UCB: bool = False + """Enable decision mechanism based on UCB algorithm""" + + if_using_vector_rag: bool = False + """Enable basic vector-based RAG""" + + if_using_graph_rag: bool = False + """Enable advanced graph-based RAG""" + + +KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting() diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py new file mode 100644 index 000000000..449a1e651 --- /dev/null +++ b/rdagent/app/data_science/loop.py @@ -0,0 +1,161 @@ + +import subprocess +from typing import Any, Literal + +import fire + +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.components.workflow.conf import BasePropSetting +from rdagent.components.workflow.dummy import DummyHypothesisGen +from rdagent.components.workflow.rd_loop import RDLoop +from rdagent.core.developer import Developer +from rdagent.core.exception import FactorEmptyError, ModelEmptyError +from rdagent.core.proposal import ( + Hypothesis2Experiment, + Experiment2Feedback, + HypothesisGen, +) +from rdagent.core.scenario import Scenario +from rdagent.core.utils import import_class +from rdagent.log import rdagent_logger as logger +from rdagent.log.time import measure_time +from rdagent.scenarios.kaggle.experiment.scenario import ( + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + KG_ACTION_MODEL_FEATURE_SELECTION, +) +from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook +from rdagent.scenarios.kaggle.kaggle_crawler import download_data +from rdagent.scenarios.kaggle.proposal.proposal import KGTrace + + +class KaggleRDLoop(RDLoop): + @measure_time + def __init__(self, PROP_SETTING: BasePropSetting): + + with logger.tag("init"): + scen: Scenario = 
import_class(PROP_SETTING.scen)(PROP_SETTING.competition) + logger.log_object(scen, tag="scenario") + + ### shared components in the workflow # TODO: check if + knowledge_base = ( + import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen) + if PROP_SETTING.knowledge_base != "" + else None + ) + logger.log_object(knowledge_base, tag="knowledge_base") + + # 1) task generation from scratch + # self.scratch_gen: tuple[HypothesisGen, Hypothesis2Experiment] = DummyHypothesisGen(scen), + + # 2) task generation from a complete solution + self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) + + + self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) + logger.log_object(self.hypothesis_gen, tag="hypothesis generator") + self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() + logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment") + self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen) + logger.log_object(self.feature_coder, tag="feature coder") + self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)( + scen + ) + logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder") + self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) + logger.log_object(self.model_coder, tag="model coder") + self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) + logger.log_object(self.feature_runner, tag="feature runner") + self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) + logger.log_object(self.model_runner, tag="model runner") + self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + logger.log_object(self.summarizer, tag="summarizer") + + self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) + super(RDLoop, self).__init__() + + @measure_time + def coding(self, prev_out: dict[str, Any]): + with logger.tag("d"): # develop + if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + exp = self.feature_coder.develop(prev_out["exp_gen"]) + elif prev_out["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: + exp = self.model_feature_selection_coder.develop(prev_out["exp_gen"]) + else: + exp = self.model_coder.develop(prev_out["exp_gen"]) + logger.log_object(exp.sub_workspace_list, tag="coder result") + return exp + + @measure_time + def running(self, prev_out: dict[str, Any]): + if not self.exp_gen.is_complete(): + raise NextLoopExcpetion() + + with logger.tag("ef"): # evaluate and feedback + if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + exp = self.feature_runner.develop(prev_out["coding"]) + else: + exp = self.model_runner.develop(prev_out["coding"]) + logger.log_object(exp, tag="runner result") + if KAGGLE_IMPLEMENT_SETTING.competition in [ + "optiver-realized-volatility-prediction", + "covid19-global-forecasting-week-1", + ]: + try: + python_files_to_notebook( + KAGGLE_IMPLEMENT_SETTING.competition, exp.experiment_workspace.workspace_path + ) + except Exception as e: + logger.error(f"Merge python files to one file failed: {e}") + if KAGGLE_IMPLEMENT_SETTING.auto_submit: + csv_path = exp.experiment_workspace.workspace_path / "submission.csv" + try: + subprocess.run( + [ + "kaggle", + "competitions", + "submit", + "-f", + str(csv_path.absolute()), + "-m", + 
str(csv_path.parent.absolute()), + KAGGLE_IMPLEMENT_SETTING.competition, + ], + check=True, + ) + except subprocess.CalledProcessError as e: + logger.error(f"Auto submission failed: \n{e}") + except Exception as e: + logger.error(f"Other exception when use kaggle api:\n{e}") + + return exp + + skip_loop_error = (ModelEmptyError, FactorEmptyError) + + +def main(path=None, step_n=None, competition=None): + """ + Auto R&D Evolving loop for models in a kaggle{} scenario. + You can continue running session by + .. code-block:: bash + dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional parameter + rdagent kaggle --competition playground-series-s4e8 # You are encouraged to use this one. + """ + if competition: + KAGGLE_IMPLEMENT_SETTING.competition = competition + download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path) + if KAGGLE_IMPLEMENT_SETTING.if_using_graph_rag: + KAGGLE_IMPLEMENT_SETTING.knowledge_base = ( + "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph" + ) + else: + logger.error("Please specify competition name.") + if path is None: + kaggle_loop = KaggleRDLoop(KAGGLE_IMPLEMENT_SETTING) + else: + kaggle_loop = KaggleRDLoop.load(path) + kaggle_loop.run(step_n=step_n) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py index b52047308..0d45d35a8 100644 --- a/rdagent/app/kaggle/conf.py +++ b/rdagent/app/kaggle/conf.py @@ -30,7 +30,7 @@ class KaggleBasePropSetting(BasePropSetting): model_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGModelRunner" """Model Runner class""" - summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGExperiment2Feedback" """Summarizer class""" evolving_n: int = 10 diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index 2c66d668a..d48988a7e 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -10,7 +10,7 @@ from rdagent.core.exception import FactorEmptyError, ModelEmptyError from rdagent.core.proposal import ( Hypothesis2Experiment, - HypothesisExperiment2Feedback, + Experiment2Feedback, HypothesisGen, ) from rdagent.core.scenario import Scenario @@ -55,7 +55,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): logger.log_object(self.feature_runner, tag="feature runner") self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) logger.log_object(self.model_runner, tag="model runner") - self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) logger.log_object(self.summarizer, tag="summarizer") self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) super(RDLoop, self).__init__() diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py index e6a91351a..da1a98c56 100644 --- a/rdagent/app/qlib_rd_loop/conf.py +++ b/rdagent/app/qlib_rd_loop/conf.py @@ -21,7 +21,7 @@ class ModelBasePropSetting(BasePropSetting): runner: str = "rdagent.scenarios.qlib.developer.model_runner.QlibModelRunner" """Runner class""" - summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibModelHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibModelExperiment2Feedback" """Summarizer class""" 
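The configs above now point at summarizer classes renamed from `*HypothesisExperiment2Feedback` to `*Experiment2Feedback`. As later hunks in this patch show, the renamed interface no longer receives the hypothesis as an argument and instead reads it from the experiment. A minimal sketch of the expected shape (illustrative only; `MyExperiment2Feedback` is a placeholder name):

    from rdagent.core.experiment import Experiment
    from rdagent.core.proposal import Experiment2Feedback, HypothesisFeedback, Trace

    class MyExperiment2Feedback(Experiment2Feedback):
        def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback:
            hypothesis = exp.hypothesis  # the hypothesis now travels with the experiment
            # compare exp.result against earlier results stored in trace.hist,
            # then build and return a HypothesisFeedback
            ...
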
evolving_n: int = 10 @@ -47,7 +47,7 @@ class FactorBasePropSetting(BasePropSetting): runner: str = "rdagent.scenarios.qlib.developer.factor_runner.QlibFactorRunner" """Runner class""" - summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibFactorHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibFactorExperiment2Feedback" """Summarizer class""" evolving_n: int = 10 diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 22ed405df..358046de1 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -1,3 +1,15 @@ +""" + +Loop should not large change excclude +- Action Choice[current data loader & spec] +- other should share + - Propose[choice] => Task[Choice] => CoSTEER => + - + +Extra feature: +- cache +""" + # from rdagent.components.coder.CoSTEER import CoSTEER # from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS # from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index edb7a9ce8..df41a2937 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -34,12 +34,16 @@ def __init__( self.factor_name = ( factor_name # TODO: remove it in the later version. Keep it only for pickle version compatibility ) - self.factor_description = factor_description self.factor_formulation = factor_formulation self.variables = variables self.factor_resources = resource self.factor_implementation = factor_implementation - super().__init__(name=factor_name, *args, **kwargs) + super().__init__(name=factor_name, description=factor_description, *args, **kwargs) + + @property + def factor_description(self): + """for compatibility""" + return self.description def get_task_information(self): return f"""factor_name: {self.factor_name} diff --git a/rdagent/components/coder/model_coder/model.py b/rdagent/components/coder/model_coder/model.py index b5a3e3ac5..ce842459f 100644 --- a/rdagent/components/coder/model_coder/model.py +++ b/rdagent/components/coder/model_coder/model.py @@ -24,7 +24,6 @@ def __init__( model_type: Optional[str] = None, **kwargs, ) -> None: - self.description: str = description self.formulation: str = formulation self.architecture: str = architecture self.variables: str = variables @@ -32,7 +31,7 @@ def __init__( self.model_type: str = ( model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model ) - super().__init__(name=name, *args, **kwargs) + super().__init__(name=name, description=description, *args, **kwargs) def get_task_information(self): task_desc = f"""name: {self.name} diff --git a/rdagent/components/proposal/__init__.py b/rdagent/components/proposal/__init__.py index 305df2196..51980766f 100644 --- a/rdagent/components/proposal/__init__.py +++ b/rdagent/components/proposal/__init__.py @@ -82,7 +82,7 @@ class LLMHypothesis2Experiment(Hypothesis2Experiment[Experiment]): def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]: ... @abstractmethod - def convert_response(self, response: str, trace: Trace) -> Experiment: ... + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> Experiment: ... 
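The extra `hypothesis` parameter added to `convert_response` above exists so that each scenario can attach the originating hypothesis to the experiment it constructs; the concrete qlib/kaggle/data-mining implementations later in this patch all follow this pattern. A minimal sketch under that assumption (`DummyHypothesis2Experiment` is a placeholder; real scenarios return their own `Experiment` subclasses such as `QlibFactorExperiment`):

    from typing import Tuple

    from rdagent.components.proposal import LLMHypothesis2Experiment
    from rdagent.core.experiment import Experiment
    from rdagent.core.proposal import Hypothesis, Trace

    class DummyHypothesis2Experiment(LLMHypothesis2Experiment):
        def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]:
            return {"target_hypothesis": str(hypothesis)}, True

        def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> Experiment:
            tasks = []  # parse the LLM response (e.g. json.loads(response)) into Task objects
            exp = Experiment(tasks, hypothesis=hypothesis)  # keep the hypothesis on the experiment
            exp.based_experiments = [t[1] for t in trace.hist if t[2]]  # reuse accepted experiments
            return exp
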
def convert(self, hypothesis: Hypothesis, trace: Trace) -> Experiment: context, json_flag = self.prepare_context(hypothesis, trace) @@ -109,7 +109,7 @@ def convert(self, hypothesis: Hypothesis, trace: Trace) -> Experiment: resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=json_flag) - return self.convert_response(resp, trace) + return self.convert_response(resp, hypothesis, trace) class FactorHypothesis2Experiment(LLMHypothesis2Experiment): diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py index a331afd7d..9d6c4fd18 100644 --- a/rdagent/components/workflow/rd_loop.py +++ b/rdagent/components/workflow/rd_loop.py @@ -10,7 +10,7 @@ from rdagent.core.developer import Developer from rdagent.core.proposal import ( Hypothesis2Experiment, - HypothesisExperiment2Feedback, + Experiment2Feedback, HypothesisGen, Trace, ) @@ -39,7 +39,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): self.runner: Developer = import_class(PROP_SETTING.runner)(scen) logger.log_object(self.runner, tag="runner") - self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) logger.log_object(self.summarizer, tag="summarizer") self.trace = Trace(scen=scen) super().__init__() diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 2fda6af97..b80b59e36 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -8,16 +8,17 @@ from collections.abc import Sequence from copy import deepcopy from pathlib import Path -from typing import Any, Generic, TypeVar +from typing import Any, Generic, Optional, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.proposal import Hypothesis """ This file contains the all the class about organizing the task in RD-Agent. """ -class Task(ABC): +class AbsTask(ABC): def __init__(self, name: str, version: int = 1) -> None: """ The version of the task, default is 1 @@ -33,6 +34,13 @@ def get_task_information(self) -> str: Get the task information string to build the unique key """ +class Task(AbsTask): + def __init__(self, name: str, version: int = 1, desc: str = "") -> None: + super().__init__(name, version) + self.description = desc + + def get_task_information(self) -> str: + return f"{self.name}_{self.version}: {self.desc}" ASpecificTask = TypeVar("ASpecificTask", bound=Task) @@ -205,7 +213,9 @@ def __init__( self, sub_tasks: Sequence[ASpecificTask], based_experiments: Sequence[ASpecificWSForExperiment] = [], + hypothesis: Optional[Hypothesis] = None, ) -> None: + self.hypothesis: Optional[Hypothesis] = hypothesis # Experiment is opptionally generated by hypothesis self.sub_tasks: Sequence[ASpecificTask] = sub_tasks self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks) self.based_experiments: Sequence[ASpecificWSForExperiment] = based_experiments diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index a420bb311..c2bd2d970 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -103,6 +103,23 @@ def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experim return None, None +class ExpGen(ABC): + @abstractmethod + def gen(self, trace: Trace) -> Experiment: + """ + Generate the experiment based on the trace. + + `ExpGen().gen()` play a role like + + .. 
code-block:: python + + # ExpGen().gen() == + Hypothesis2Experiment().convert( + HypothesisGen().gen(trace) + ) + """ + + class HypothesisGen(ABC): # NOTE: the design is a little wierd # - Sometimes we want accurate access the prompts in a specific level @@ -141,7 +158,7 @@ def convert(self, hypothesis: Hypothesis, trace: Trace) -> ASpecificExp: # Boolean, Reason, Confidence, etc. -class HypothesisExperiment2Feedback(ABC): +class Experiment2Feedback(ABC): """ "Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances""" @@ -149,11 +166,12 @@ def __init__(self, scen: Scenario) -> None: self.scen = scen @abstractmethod - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ The `exp` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. """ + # TODO: return a hypothesis feedback seems wierd now. Maybe we should return an ExerimentFeedback? error_message = "generate_feedback method is not implemented." raise NotImplementedError(error_message) diff --git a/rdagent/scenarios/data_mining/developer/feedback.py b/rdagent/scenarios/data_mining/developer/feedback.py index 5a96609be..ec278c6f7 100644 --- a/rdagent/scenarios/data_mining/developer/feedback.py +++ b/rdagent/scenarios/data_mining/developer/feedback.py @@ -10,7 +10,7 @@ from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( Hypothesis, - HypothesisExperiment2Feedback, + Experiment2Feedback, HypothesisFeedback, Trace, ) @@ -22,14 +22,15 @@ DIRNAME = Path(__file__).absolute().resolve().parent -class DMModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): +class DMModelExperiment2Feedback(Experiment2Feedback): """Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances""" - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. 
""" + hypothesis = exp.hypothesis logger.info("Generating feedback...") # Define the system prompt for hypothesis feedback diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py index 37f36e2f4..547978edc 100644 --- a/rdagent/scenarios/data_mining/proposal/model_proposal.py +++ b/rdagent/scenarios/data_mining/proposal/model_proposal.py @@ -95,7 +95,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b "RAG": None, }, True - def convert_response(self, response: str, trace: Trace) -> ModelExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: response_dict = json.loads(response) tasks = [] for model_name in response_dict: @@ -116,6 +116,6 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment: model_type=model_type, ) ) - exp = DMModelExperiment(tasks) + exp = DMModelExperiment(tasks, hypothesis=hypothesis) exp.based_experiments = [t[1] for t in trace.hist if t[2]] return exp diff --git a/rdagent/scenarios/data_science/dev/coder.py b/rdagent/scenarios/data_science/dev/coder.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/proposal/__init__.py b/rdagent/scenarios/data_science/proposal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/proposal/task_gen.py b/rdagent/scenarios/data_science/proposal/task_gen.py new file mode 100644 index 000000000..227c84c34 --- /dev/null +++ b/rdagent/scenarios/data_science/proposal/task_gen.py @@ -0,0 +1,21 @@ +from typing import Literal +from rdagent.core.experiment import Experiment +from rdagent.core.proposal import ExpGen, Trace + +COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Workflow", "Ensemble"] +MAX_NUM = COMPONENT.__args__ + +class DSExpGen(ExpGen): + """Data Science Task Generator.""" + def __init__(self) -> None: + self.complete_component: set[COMPONENT] = set() # Initialize as an empty set + + def _is_complete(self): + """is all components complete""" + # TODO: place it into ExpGen + return self.complete_component == set(COMPONENT.__args__) + + def gen(self, trace: Trace) -> Experiment: + + return super().gen(trace) + diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index 708c74e77..37077a6e9 100644 --- a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -9,7 +9,7 @@ from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( Hypothesis, - HypothesisExperiment2Feedback, + Experiment2Feedback, HypothesisFeedback, Trace, ) @@ -22,7 +22,7 @@ DIRNAME = Path(__file__).absolute().resolve().parent -class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): +class KGExperiment2Feedback(Experiment2Feedback): def process_results(self, current_result, sota_result): # Convert the results to dataframes current_df = pd.DataFrame(current_result) @@ -46,7 +46,7 @@ def process_results(self, current_result, sota_result): return combined_df, evaluation_description - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ The `ti` should be 
executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. @@ -60,6 +60,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac Returns: Any: The feedback generated for the given experiment and hypothesis. """ + hypothesis = exp.hypothesis logger.info("Generating feedback...") current_result = exp.result diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 273860bb5..1e1cb4845 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -362,7 +362,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b ), }, True - def convert_feature_experiment(self, response: str, trace: Trace) -> KGFactorExperiment: + def convert_feature_experiment(self, response: str, hypothesis: Hypothesis, trace: Trace) -> KGFactorExperiment: response_dict = json.loads(response) tasks = [] @@ -386,10 +386,11 @@ def convert_feature_experiment(self, response: str, trace: Trace) -> KGFactorExp [KGFactorExperiment(sub_tasks=[], source_feature_size=trace.scen.input_shape[-1])] + [t[1] for t in trace.hist if t[2]] ), + hypothesis=hypothesis, ) return exp - def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperiment: + def convert_model_experiment(self, response: str, hypothesis: Hypothesis, trace: Trace) -> KGModelExperiment: response_dict = json.loads(response) tasks = [] model_type = response_dict.get("model_type", "Model type not provided") @@ -421,14 +422,15 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperi exp = KGModelExperiment( sub_tasks=tasks, based_experiments=based_experiments, + hypothesis=hypothesis, ) return exp - def convert_response(self, response: str, trace: Trace) -> ModelExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: if self.current_action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: - return self.convert_feature_experiment(response, trace) + return self.convert_feature_experiment(response, hypothesis, trace) elif self.current_action in [KG_ACTION_MODEL_FEATURE_SELECTION, KG_ACTION_MODEL_TUNING]: - return self.convert_model_experiment(response, trace) + return self.convert_model_experiment(response, hypothesis, trace) class KGTrace(Trace[KGScenario, KGKnowledgeGraph]): diff --git a/rdagent/scenarios/qlib/developer/feedback.py b/rdagent/scenarios/qlib/developer/feedback.py index c34e0e89b..240c67d0f 100644 --- a/rdagent/scenarios/qlib/developer/feedback.py +++ b/rdagent/scenarios/qlib/developer/feedback.py @@ -8,7 +8,7 @@ from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( Hypothesis, - HypothesisExperiment2Feedback, + Experiment2Feedback, HypothesisFeedback, Trace, ) @@ -56,8 +56,8 @@ def process_results(current_result, sota_result): return filtered_combined_df.to_string() -class QlibFactorHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: +class QlibFactorExperiment2Feedback(Experiment2Feedback): + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ Generate feedback for the given experiment and hypothesis. 
@@ -69,6 +69,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac Returns: Any: The feedback generated for the given experiment and hypothesis. """ + hypothesis = exp.hypothesis logger.info("Generating feedback...") hypothesis_text = hypothesis.hypothesis current_result = exp.result @@ -122,15 +123,15 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac ) -class QlibModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): +class QlibModelExperiment2Feedback(Experiment2Feedback): """Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances""" - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. """ - + hypothesis = exp.hypothesis logger.info("Generating feedback...") # Define the system prompt for hypothesis feedback system_prompt = feedback_prompts["model_feedback_generation"]["system"] diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py index 204e9bcbb..a1928ccd8 100644 --- a/rdagent/scenarios/qlib/proposal/factor_proposal.py +++ b/rdagent/scenarios/qlib/proposal/factor_proposal.py @@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict | "RAG": None, }, True - def convert_response(self, response: str, trace: Trace) -> FactorExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> FactorExperiment: response_dict = json.loads(response) tasks = [] @@ -97,7 +97,7 @@ def convert_response(self, response: str, trace: Trace) -> FactorExperiment: ) ) - exp = QlibFactorExperiment(tasks) + exp = QlibFactorExperiment(tasks, hypothesis=hypothesis) exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]] unique_tasks = [] diff --git a/rdagent/scenarios/qlib/proposal/model_proposal.py b/rdagent/scenarios/qlib/proposal/model_proposal.py index 51d92b032..98ec33b65 100644 --- a/rdagent/scenarios/qlib/proposal/model_proposal.py +++ b/rdagent/scenarios/qlib/proposal/model_proposal.py @@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b "RAG": None, }, True - def convert_response(self, response: str, trace: Trace) -> ModelExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: response_dict = json.loads(response) tasks = [] for model_name in response_dict: @@ -101,6 +101,6 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment: model_type=model_type, ) ) - exp = QlibModelExperiment(tasks) + exp = QlibModelExperiment(tasks, hypothesis=hypothesis) exp.based_experiments = [t[1] for t in trace.hist if t[2]] return exp From c5c5d7f7b1c533ad49e742dae4a688e571fb5bf9 Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 27 Nov 2024 04:59:22 +0000 Subject: [PATCH 002/304] Pushforward and refactor --- rdagent/app/data_science/conf.py | 17 ++- rdagent/app/data_science/loop.py | 112 ++++++++------- rdagent/app/kaggle/loop.py | 12 +- rdagent/components/workflow/rd_loop.py | 37 +++-- rdagent/core/evaluation.py | 12 +- rdagent/core/experiment.py | 8 +- 
rdagent/core/proposal.py | 4 + rdagent/core/scenario.py | 22 +-- rdagent/scenarios/data_science/__init__.py | 0 .../data_science/proposal/exp_gen.py | 42 ++++++ .../data_science/proposal/prompts.yaml | 3 + .../data_science/proposal/task_gen.py | 21 --- .../scenarios/data_science/scen/__init__.py | 4 + .../scenarios/data_science/scen/prompts.yaml | 9 ++ rdagent/scenarios/data_science/scen/scen.py | 131 ++++++++++++++++++ .../scenarios/kaggle/experiment/scenario.py | 2 +- rdagent/scenarios/kaggle/kaggle_crawler.py | 6 +- rdagent/utils/agent/tpl.py | 3 + rdagent/utils/workflow.py | 2 +- 19 files changed, 320 insertions(+), 127 deletions(-) create mode 100644 rdagent/scenarios/data_science/__init__.py create mode 100644 rdagent/scenarios/data_science/proposal/exp_gen.py create mode 100644 rdagent/scenarios/data_science/proposal/prompts.yaml delete mode 100644 rdagent/scenarios/data_science/proposal/task_gen.py create mode 100644 rdagent/scenarios/data_science/scen/__init__.py create mode 100644 rdagent/scenarios/data_science/scen/prompts.yaml create mode 100644 rdagent/scenarios/data_science/scen/scen.py diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index fa9ba290b..72c4eeb7a 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -2,20 +2,23 @@ from rdagent.core.conf import ExtendedSettingsConfigDict -class KaggleBasePropSetting(BasePropSetting): +class DataScienceBasePropSetting(BasePropSetting): model_config = ExtendedSettingsConfigDict(env_prefix="DS_", protected_namespaces=()) # Main components ## Scen - scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario" + scen: str = "rdagent.scenarios.data_science.scen.DataScienceScen" """Scenario class for data mining model""" ## proposal - hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen" - """Hypothesis generation class""" + exp_gen: str = "rdagent.scenarios.data_science.proposal.exp_gen.DSExpGen" - hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment" - """Hypothesis to experiment class""" + # the two below should be used in ExpGen + # hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen" + # """Hypothesis generation class""" + # + # hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment" + # """Hypothesis to experiment class""" ## dev/coder feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER" @@ -85,4 +88,4 @@ class KaggleBasePropSetting(BasePropSetting): """Enable advanced graph-based RAG""" -KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting() +DS_RD_SETTING = DataScienceBasePropSetting() diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 449a1e651..25c61b0e7 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -4,32 +4,29 @@ import fire -from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.workflow.conf import BasePropSetting -from rdagent.components.workflow.dummy import DummyHypothesisGen -from rdagent.components.workflow.rd_loop import RDLoop -from rdagent.core.developer import Developer +from rdagent.components.workflow.rd_loop import RDLoop, NextLoopException from rdagent.core.exception import FactorEmptyError, ModelEmptyError from rdagent.core.proposal import ( + ExpGen, Hypothesis2Experiment, Experiment2Feedback, HypothesisGen, 
+ Trace, ) from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger from rdagent.log.time import measure_time -from rdagent.scenarios.kaggle.experiment.scenario import ( - KG_ACTION_FEATURE_ENGINEERING, - KG_ACTION_FEATURE_PROCESSING, - KG_ACTION_MODEL_FEATURE_SELECTION, -) from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook from rdagent.scenarios.kaggle.kaggle_crawler import download_data -from rdagent.scenarios.kaggle.proposal.proposal import KGTrace -class KaggleRDLoop(RDLoop): + +class DataScienceRDLoop(RDLoop): + skip_loop_error = (NextLoopException,) + @measure_time def __init__(self, PROP_SETTING: BasePropSetting): @@ -47,42 +44,55 @@ def __init__(self, PROP_SETTING: BasePropSetting): # 1) task generation from scratch # self.scratch_gen: tuple[HypothesisGen, Hypothesis2Experiment] = DummyHypothesisGen(scen), - + # 2) task generation from a complete solution self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) - self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) - logger.log_object(self.hypothesis_gen, tag="hypothesis generator") - self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() - logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment") - self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen) - logger.log_object(self.feature_coder, tag="feature coder") - self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)( - scen - ) - logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder") - self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) - logger.log_object(self.model_coder, tag="model coder") - self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) - logger.log_object(self.feature_runner, tag="feature runner") - self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) - logger.log_object(self.model_runner, tag="model runner") - self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) - logger.log_object(self.summarizer, tag="summarizer") - - self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) + # self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) + # logger.log_object(self.hypothesis_gen, tag="hypothesis generator") + # self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() + # logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment") + + # TODO: we need more coder + # self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen) + # logger.log_object(self.feature_coder, tag="feature coder") + # self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)( + # scen + # ) + # logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder") + # self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) + # logger.log_object(self.model_coder, tag="model coder") + + + # TODO: now we only need on runner + # self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) + # logger.log_object(self.feature_runner, tag="feature runner") + # self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) + # 
logger.log_object(self.model_runner, tag="model runner") + + # self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + # logger.log_object(self.summarizer, tag="summarizer") + + # self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) + self.trace = Trace(scen=scen) super(RDLoop, self).__init__() + @measure_time + def direct_exp_gen(self, prev_out: dict[str, Any]): + exp = self.exp_gen.gen(self.trace) + hypo = exp.hypothesis + return {"propose": hypo, "exp_gen": exp} + @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: - exp = self.feature_coder.develop(prev_out["exp_gen"]) - elif prev_out["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: - exp = self.model_feature_selection_coder.develop(prev_out["exp_gen"]) + if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: + exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) else: - exp = self.model_coder.develop(prev_out["exp_gen"]) + exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) logger.log_object(exp.sub_workspace_list, tag="coder result") return exp @@ -92,22 +102,22 @@ def running(self, prev_out: dict[str, Any]): raise NextLoopExcpetion() with logger.tag("ef"): # evaluate and feedback - if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: exp = self.feature_runner.develop(prev_out["coding"]) else: exp = self.model_runner.develop(prev_out["coding"]) logger.log_object(exp, tag="runner result") - if KAGGLE_IMPLEMENT_SETTING.competition in [ + if DS_RD_SETTING.competition in [ "optiver-realized-volatility-prediction", "covid19-global-forecasting-week-1", ]: try: python_files_to_notebook( - KAGGLE_IMPLEMENT_SETTING.competition, exp.experiment_workspace.workspace_path + DS_RD_SETTING.competition, exp.experiment_workspace.workspace_path ) except Exception as e: logger.error(f"Merge python files to one file failed: {e}") - if KAGGLE_IMPLEMENT_SETTING.auto_submit: + if DS_RD_SETTING.auto_submit: csv_path = exp.experiment_workspace.workspace_path / "submission.csv" try: subprocess.run( @@ -119,7 +129,7 @@ def running(self, prev_out: dict[str, Any]): str(csv_path.absolute()), "-m", str(csv_path.parent.absolute()), - KAGGLE_IMPLEMENT_SETTING.competition, + DS_RD_SETTING.competition, ], check=True, ) @@ -130,8 +140,6 @@ def running(self, prev_out: dict[str, Any]): return exp - skip_loop_error = (ModelEmptyError, FactorEmptyError) - def main(path=None, step_n=None, competition=None): """ @@ -141,19 +149,17 @@ def main(path=None, step_n=None, competition=None): dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional parameter rdagent kaggle --competition playground-series-s4e8 # You are encouraged to use this one. 
""" - if competition: - KAGGLE_IMPLEMENT_SETTING.competition = competition - download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path) - if KAGGLE_IMPLEMENT_SETTING.if_using_graph_rag: - KAGGLE_IMPLEMENT_SETTING.knowledge_base = ( - "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph" - ) + if competition is not None: + DS_RD_SETTING.competition = competition + + if DS_RD_SETTING.competition: + download_data(competition=DS_RD_SETTING.competition, local_path=DS_RD_SETTING.local_data_path) else: logger.error("Please specify competition name.") if path is None: - kaggle_loop = KaggleRDLoop(KAGGLE_IMPLEMENT_SETTING) + kaggle_loop = DataScienceRDLoop(DS_RD_SETTING) else: - kaggle_loop = KaggleRDLoop.load(path) + kaggle_loop = DataScienceRDLoop.load(path) kaggle_loop.run(step_n=step_n) diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index d48988a7e..1962e0931 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -63,19 +63,19 @@ def __init__(self, PROP_SETTING: BasePropSetting): @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: - exp = self.feature_coder.develop(prev_out["exp_gen"]) - elif prev_out["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: - exp = self.model_feature_selection_coder.develop(prev_out["exp_gen"]) + if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: + exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) else: - exp = self.model_coder.develop(prev_out["exp_gen"]) + exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) logger.log_object(exp.sub_workspace_list, tag="coder result") return exp @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback - if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: exp = self.feature_runner.develop(prev_out["coding"]) else: exp = self.model_runner.develop(prev_out["coding"]) diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py index 9d6c4fd18..b489595d3 100644 --- a/rdagent/components/workflow/rd_loop.py +++ b/rdagent/components/workflow/rd_loop.py @@ -3,12 +3,12 @@ It is from `rdagent/app/qlib_rd_loop/model.py` and try to replace `rdagent/app/qlib_rd_loop/RDAgent.py` """ -import time from typing import Any from rdagent.components.workflow.conf import BasePropSetting from rdagent.core.developer import Developer from rdagent.core.proposal import ( + Hypothesis, Hypothesis2Experiment, Experiment2Feedback, HypothesisGen, @@ -21,7 +21,13 @@ from rdagent.utils.workflow import LoopBase, LoopMeta +class NextLoopException(Exception): + """TODO: should we place in in rdagent/core/exception.py?""" + pass + + class RDLoop(LoopBase, metaclass=LoopMeta): + @measure_time def __init__(self, PROP_SETTING: BasePropSetting): with logger.tag("init"): @@ -44,24 +50,31 @@ def __init__(self, PROP_SETTING: BasePropSetting): self.trace = Trace(scen=scen) super().__init__() + # excluded steps @measure_time - def 
propose(self, prev_out: dict[str, Any]): - with logger.tag("r"): # research - hypothesis = self.hypothesis_gen.gen(self.trace) - logger.log_object(hypothesis, tag="hypothesis generation") + def _propose(self): + hypothesis = self.hypothesis_gen.gen(self.trace) + logger.log_object(hypothesis, tag="hypothesis generation") return hypothesis @measure_time - def exp_gen(self, prev_out: dict[str, Any]): - with logger.tag("r"): # research - exp = self.hypothesis2experiment.convert(prev_out["propose"], self.trace) - logger.log_object(exp.sub_tasks, tag="experiment generation") + def _exp_gen(self, hypothesis: Hypothesis): + exp = self.hypothesis2experiment.convert(hypothesis, self.trace) + logger.log_object(exp.sub_tasks, tag="experiment generation") return exp + # included steps + @measure_time + def direct_exp_gen(self, prev_out: dict[str, Any]): + with logger.tag("r"): # research + hypo = self._propose() + exp = self._exp_gen(hypo) + return {"propose": hypo, "exp_gen": exp} + @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - exp = self.coder.develop(prev_out["exp_gen"]) + exp = self.coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) logger.log_object(exp.sub_workspace_list, tag="coder result") return exp @@ -74,7 +87,7 @@ def running(self, prev_out: dict[str, Any]): @measure_time def feedback(self, prev_out: dict[str, Any]): - feedback = self.summarizer.generate_feedback(prev_out["running"], prev_out["propose"], self.trace) + feedback = self.summarizer.generate_feedback(prev_out["running"], prev_out["direct_exp_gen"]["propose"], self.trace) with logger.tag("ef"): # evaluate and feedback logger.log_object(feedback, tag="feedback") - self.trace.hist.append((prev_out["propose"], prev_out["running"], feedback)) + self.trace.hist.append((prev_out["direct_exp_gen"]["propose"], prev_out["running"], feedback)) diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py index fae07a96a..cf3fc2e2e 100644 --- a/rdagent/core/evaluation.py +++ b/rdagent/core/evaluation.py @@ -1,8 +1,10 @@ from abc import ABC, abstractmethod - -from rdagent.core.experiment import Task, Workspace +import typing from rdagent.core.scenario import Scenario +if typing.TYPE_CHECKING: + from rdagent.core.experiment import Task, Workspace + class Feedback: pass @@ -18,9 +20,9 @@ def __init__( @abstractmethod def evaluate( self, - target_task: Task, - implementation: Workspace, - gt_implementation: Workspace, + target_task: "Task", + implementation: "Workspace", + gt_implementation: "Workspace", **kwargs: object, ) -> None: raise NotImplementedError diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index b80b59e36..c3e0bc7c3 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import typing import platform import shutil import uuid @@ -11,7 +12,8 @@ from typing import Any, Generic, Optional, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.core.proposal import Hypothesis +if typing.TYPE_CHECKING: + from rdagent.core.proposal import Hypothesis """ This file contains the all the class about organizing the task in RD-Agent. 
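The `typing.TYPE_CHECKING` guard introduced above makes the `Hypothesis` import visible to type checkers only, presumably to avoid a runtime circular import between `rdagent.core.experiment` and `rdagent.core.proposal` (the same commit applies the pattern to `rdagent/core/evaluation.py`). A minimal standalone sketch of the pattern, not taken from the repository:

    from __future__ import annotations

    import typing

    if typing.TYPE_CHECKING:
        # imported only during static type checking; skipped at runtime,
        # so importing this module never triggers the circular import
        from rdagent.core.proposal import Hypothesis

    class Experiment:
        def __init__(self, hypothesis: Hypothesis | None = None) -> None:
            # with postponed annotations the name is never evaluated at runtime
            self.hypothesis = hypothesis
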
@@ -213,9 +215,9 @@ def __init__( self, sub_tasks: Sequence[ASpecificTask], based_experiments: Sequence[ASpecificWSForExperiment] = [], - hypothesis: Optional[Hypothesis] = None, + hypothesis: Optional["Hypothesis"] = None, ) -> None: - self.hypothesis: Optional[Hypothesis] = hypothesis # Experiment is opptionally generated by hypothesis + self.hypothesis: Optional["Hypothesis"] = hypothesis # Experiment is opptionally generated by hypothesis self.sub_tasks: Sequence[ASpecificTask] = sub_tasks self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks) self.based_experiments: Sequence[ASpecificWSForExperiment] = based_experiments diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index c2bd2d970..093c34e78 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -104,6 +104,10 @@ def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experim class ExpGen(ABC): + + def __init__(self, scen: Scenario) -> None: + self.scen = scen + @abstractmethod def gen(self, trace: Trace) -> Experiment: """ diff --git a/rdagent/core/scenario.py b/rdagent/core/scenario.py index 3a384a85d..7776d02a5 100644 --- a/rdagent/core/scenario.py +++ b/rdagent/core/scenario.py @@ -4,6 +4,10 @@ class Scenario(ABC): + """ + We should include scenario information here. Following inform should not be included + - method related (e.g. rag... config for a concrete module) + """ @property @abstractmethod def background(self) -> str: @@ -24,21 +28,9 @@ def source_data(self) -> str: A convenient shortcut for describing source data """ return self.get_source_data_desc() - - @property - @abstractmethod - def interface(self) -> str: - """Interface description about how to run the code""" - - @property - @abstractmethod - def output_format(self) -> str: - """Output format description""" - - @property - @abstractmethod - def simulator(self) -> str: - """Simulator description""" + + # NOTE: we should keep the interface simpler. So some previous interfaces are deleted. + # If we need some specific function only used in the subclass(no exeternal usage). 
We should not set them in the base class @property @abstractmethod diff --git a/rdagent/scenarios/data_science/__init__.py b/rdagent/scenarios/data_science/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py new file mode 100644 index 000000000..e1b3a013f --- /dev/null +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -0,0 +1,42 @@ +from argparse import ONE_OR_MORE +from typing import Literal +from rdagent.components.proposal import LLMHypothesis2Experiment, LLMHypothesisGen +from rdagent.core.experiment import Experiment +from rdagent.core.proposal import ExpGen, Trace +from rdagent.core.scenario import Scenario +from rdagent.utils.agent.tpl import T + +COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Workflow", "Ensemble"] +ORDER = COMPONENT.__args__ + + +class DSExpGen(ExpGen): + """Data Science Task Generator.""" + def __init__(self, scen: Scenario) -> None: + self.complete_component: set[COMPONENT] = set() # Initialize as an empty set + super().__init__(scen) + + def is_complete(self): + """is all components complete""" + # TODO: place it into ExpGen + return self.complete_component == set(COMPONENT.__args__) + + def gen(self, trace: Trace) -> Experiment: + if self.is_complete(): + # proposal + design + pass + # TODO: We can create subclasses for them if we need two components + LLMHypothesisGen + LLMHypothesis2Experiment + else: + # + for o in ORDER: + if o in self.complete_component: + continue + elif o == "DataLoadSpec": + system = T(".prompts:DataLoadSpec.system").r() + user = T(".prompts:DataLoadSpec.user").r() + else: + ... # two components + return super().gen(trace) + diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml new file mode 100644 index 000000000..a5f3a6527 --- /dev/null +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -0,0 +1,3 @@ +DataLoadSpec: + system: -| + user: -| diff --git a/rdagent/scenarios/data_science/proposal/task_gen.py b/rdagent/scenarios/data_science/proposal/task_gen.py deleted file mode 100644 index 227c84c34..000000000 --- a/rdagent/scenarios/data_science/proposal/task_gen.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import Literal -from rdagent.core.experiment import Experiment -from rdagent.core.proposal import ExpGen, Trace - -COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Workflow", "Ensemble"] -MAX_NUM = COMPONENT.__args__ - -class DSExpGen(ExpGen): - """Data Science Task Generator.""" - def __init__(self) -> None: - self.complete_component: set[COMPONENT] = set() # Initialize as an empty set - - def _is_complete(self): - """is all components complete""" - # TODO: place it into ExpGen - return self.complete_component == set(COMPONENT.__args__) - - def gen(self, trace: Trace) -> Experiment: - - return super().gen(trace) - diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py new file mode 100644 index 000000000..ef874a671 --- /dev/null +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -0,0 +1,4 @@ +from .scen import DataScienceScen + + +__all__ = ["DataScienceScen"] diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml new file mode 100644 index 000000000..006c01efc --- /dev/null +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -0,0 +1,9 @@ +scen_desc: -| + ------Background of 
the scenario------ + {{scen.background}} + + ------The source dataset you can use to generate the features------ + {{scen.source_data}} + + ------The expected output & submission format specifications------ + {{scen.submission_specifications}} diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py new file mode 100644 index 000000000..81af1b9df --- /dev/null +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -0,0 +1,131 @@ +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.core.experiment import Task +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.kaggle.experiment.scenario import prompt_dict as kaggle_prompt_dict +from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions, leaderboard_scores +from rdagent.utils.agent.tpl import T +import json + + +class DataScienceScen(Scenario): + """Data Science Scenario + It is based on kaggle now. + - But it is not use the same interface with previous kaggle version. + - Ideally, we should reuse previous kaggle scenario. + But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. + So we start from a simple one.... + """ + + def __init__(self, competition: str) -> None: + self.competition = competition + self.competition_descriptions = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) + self.leaderboard = leaderboard_scores(competition) + self.evaluation_metric_direction = float(self.leaderboard[0]) > float(self.leaderboard[-1]) + self._analysis_competition_description() + + def _analysis_competition_description(self): + sys_prompt = T("scenarios.kaggle.experiment.prompts:kg_description_template.system").r() + user_prompt = T("scenarios.kaggle.experiment.prompts:kg_description_template.user").r( + competition_descriptions=self.competition_descriptions, + raw_data_information=self.source_data, + evaluation_metric_direction=self.evaluation_metric_direction, + ) + + response_analysis = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=sys_prompt, + json_mode=True, + ) + + response_json_analysis = json.loads(response_analysis) + self.competition_type = response_json_analysis.get("Competition Type", "No type provided") + self.competition_description = response_json_analysis.get("Competition Description", "No description provided") + self.target_description = response_json_analysis.get("Target Description", "No target provided") + self.competition_features = response_json_analysis.get("Competition Features", "No features provided") + self.submission_specifications = response_json_analysis.get("Submission Specifications", + "No submission requirements provided") + self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) + self.evaluation_desc = response_json_analysis.get("Evaluation Description", + "No evaluation specification provided.") + + def get_competition_full_desc(self) -> str: + evaluation_direction = "higher the better" if self.evaluation_metric_direction else "lower the better" + return f"""Competition Type: {self.competition_type} + Competition Description: {self.competition_description} + Target Description: {self.target_description} + Competition Features: {self.competition_features} + Submission Specifications: {self.submission_specifications} + Model Output Channel: {self.model_output_channel} + Evaluation Descriptions: {self.evaluation_desc} + Is the evaluation 
metric the higher the better: {evaluation_direction} + """ + + @property + def background(self) -> str: + background_template = T("rdagent.scenarios.kaggle.experiment.prompts:kg_background") + background_prompt = background_template.r( + competition_type=self.competition_type, + competition_description=self.competition_description, + target_description=self.target_description, + competition_features=self.competition_features, + submission_specifications=self.submission_specifications, + evaluation_desc=self.evaluation_desc, + evaluate_bool=self.evaluation_metric_direction, + ) + return background_prompt + + @property + def source_data(self) -> str: + # TODO: remove me if not used + # TODO: (bowen) + # phase1: + # - If we have not implement load data and dump cache + # - describe the raw data + # phase2: (cache detected) + # - Describe the cached data. + return "!!!!!!!!! I'm the fake source data !!!!!!!!" + raise NotImplementedError(f"We are not sure how it is called. We place a exception here") + + def output_format(self, tag=None) -> str: + # TODO: remove me if not used + raise NotImplementedError(f"We are not sure how it is called. We place a exception here") + + def simulator(self, tag=None) -> str: + # TODO: remove me if not used + raise NotImplementedError(f"We are not sure how it is called. We place a exception here") + + @property + def rich_style_description(self) -> str: + return f""" +### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution + +#### [Overview](#_summary) + +In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process. + +#### Kaggle Competition info + +Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition}) + +#### [Automated R&D](#_rdloops) + +- **[R (Research)](#_research)** +- Iteration of ideas and hypotheses. +- Continuous learning and knowledge construction. + +- **[D (Development)](#_development)** +- Evolving code generation, model refinement, and features generation. +- Automated implementation and testing of models/features. + +#### [Objective](#_summary) + +To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development. +""" + + def get_scenario_all_desc(self, task: Task | None = None, filtered_tag: str | None = None) -> str: + # TODO: remove me if not used + raise NotImplementedError(f"We are not sure how it is called. 
We place a exception here") + # if filtered_tag is None: + # return common_description() + interface(None) + output(None) + simulator(None) + # NOTE: we suggest such implementation: `return T(".prompts:scen_desc").r()` diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index b5a7f84e5..b6ef2f6bb 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -39,7 +39,7 @@ class KGScenario(Scenario): def __init__(self, competition: str) -> None: super().__init__() self.competition = competition - self.competition_descriptions = crawl_descriptions(competition) + self.competition_descriptions = crawl_descriptions(competition, KAGGLE_IMPLEMENT_SETTING.local_data_path) self.input_shape = None self.competition_type = None diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index b6b71917d..7f2a31722 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -31,8 +31,8 @@ service = Service("/usr/local/bin/chromedriver") -def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False) -> dict[str, str]: - if (fp := Path(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json")).exists() and not force: +def crawl_descriptions(competition: str, local_data_path: str, wait: float = 3.0, force: bool = False) -> dict[str, str]: + if (fp := Path(f"{local_data_path}/{competition}.json")).exists() and not force: logger.info(f"Found {competition}.json, loading from local file.") with fp.open("r") as f: return json.load(f) @@ -94,7 +94,7 @@ def kaggle_description_css_selectors() -> tuple[str, str]: descriptions["Data Description"] = data_element.get_attribute("innerHTML") driver.quit() - with open(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json", "w") as f: + with open(f"{local_data_path}/{competition}.json", "w") as f: json.dump(descriptions, f) return descriptions diff --git a/rdagent/utils/agent/tpl.py b/rdagent/utils/agent/tpl.py index 99df9b91d..f3a5bd940 100644 --- a/rdagent/utils/agent/tpl.py +++ b/rdagent/utils/agent/tpl.py @@ -29,6 +29,9 @@ def __init__(self, uri: str): here are some uri usages case 1) "a.b.c:x.y.z" It will load DIRNAME/a/b/c.yaml as `yaml` and load yaml[x][y][z] + + Form example, if you want to load "rdagent/scenarios/kaggle/experiment/prompts.yaml" + `a.b.c` should be "scenarios.kaggle.experiment.prompts" and "rdagent" should be exclude case 2) ".c:x.y.z" It will load c.yaml in caller's (who call `T(uri)`) directory as `yaml` and load yaml[x][y][z] diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 0c1f369b6..ece190625 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -54,7 +54,7 @@ def __new__(cls, clsname, bases, attrs): """ steps = LoopMeta._get_steps(bases) # all the base classes of parents for name, attr in attrs.items(): - if not name.startswith("__") and isinstance(attr, Callable): + if not name.startswith("_") and isinstance(attr, Callable): if name not in steps: # NOTE: if we override the step in the subclass # Then it is not the new step. So we skip it. 
From db8770459403f139a99aa5d8b55833b72ee59792 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 27 Nov 2024 07:21:53 +0000 Subject: [PATCH 003/304] prompt T fix --- rdagent/scenarios/data_science/scen/scen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 81af1b9df..c58d9f3fe 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -63,7 +63,7 @@ def get_competition_full_desc(self) -> str: @property def background(self) -> str: - background_template = T("rdagent.scenarios.kaggle.experiment.prompts:kg_background") + background_template = T("scenarios.kaggle.experiment.prompts:kg_background") background_prompt = background_template.r( competition_type=self.competition_type, competition_description=self.competition_description, From 98121f486fdbc346ca1a2c2aa8175a65e11a1844 Mon Sep 17 00:00:00 2001 From: WinstonLiye <1957922024@qq.com> Date: Wed, 27 Nov 2024 08:13:42 +0000 Subject: [PATCH 004/304] finish data loader template --- rdagent/app/data_science/loop.py | 2 +- .../raw_data_loader/data_loader_unit_test | 0 .../raw_data_loader/raw_data_loader.py | 71 ++++++++++++ .../data_science/experiment/__init__.py | 0 .../experiment/kaggle_experiment.py | 103 ++++++++++++++++++ .../data_science/experiment/workspace.py | 97 +++++++++++++++++ .../data_science/proposal/exp_gen.py | 21 +++- .../data_science/proposal/prompts.yaml | 4 +- rdagent/scenarios/data_science/scen/scen.py | 2 +- rdagent/scenarios/kaggle/prompts.yaml | 10 ++ 10 files changed, 304 insertions(+), 6 deletions(-) create mode 100644 rdagent/components/coder/data_science/raw_data_loader/data_loader_unit_test create mode 100644 rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py create mode 100644 rdagent/scenarios/data_science/experiment/__init__.py create mode 100644 rdagent/scenarios/data_science/experiment/kaggle_experiment.py create mode 100644 rdagent/scenarios/data_science/experiment/workspace.py diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 25c61b0e7..e153c26e0 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -146,7 +146,7 @@ def main(path=None, step_n=None, competition=None): Auto R&D Evolving loop for models in a kaggle{} scenario. You can continue running session by .. code-block:: bash - dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional parameter + dotenv run -- python rdagent/app/data_science/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional parameter rdagent kaggle --competition playground-series-s4e8 # You are encouraged to use this one. 
""" if competition is not None: diff --git a/rdagent/components/coder/data_science/raw_data_loader/data_loader_unit_test b/rdagent/components/coder/data_science/raw_data_loader/data_loader_unit_test new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py b/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py new file mode 100644 index 000000000..b6e0c5d4d --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py @@ -0,0 +1,71 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.experiment import Experiment, FBWorkspace +from rdagent.core.utils import cache_with_pickle +from rdagent.oai.llm_utils import md5_hash +from rdagent.utils.env import KGDockerEnv, QTDockerEnv + +# TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace + +class DataLoaderTask(CoSTEERTask): + def __init__( + self, + name: str, + description: str, + *args, + **kwargs, + ) -> None: + super().__init__(name=name,desc=description, *args, **kwargs) + + def get_task_information(self): + task_desc = f"""name: {self.name} +description: {self.description} +""" + return task_desc + + @staticmethod + def from_dict(dict): + return DataLoaderTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" + + +class DataLoaderFBWorkspace(FBWorkspace): + def hash_func( + self, + batch_size: int = 8, + num_features: int = 10, + num_timesteps: int = 4, + num_edges: int = 20, + input_value: float = 1.0, + param_init_value: float = 1.0, + ) -> str: + target_file_name = f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}" + for code_file_name in sorted(list(self.code_dict.keys())): + target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}" + return md5_hash(target_file_name) + + @cache_with_pickle(hash_func) + def execute(self): + super().execute() + try: + qtde = QTDockerEnv() if self.target_task.version == 1 else KGDockerEnv() + qtde.prepare() + + # TODO: UNIT TEST for data loader + dump_code = (Path(__file__).parent / "data_loader_unit_test.txt").read_text() + + # TODO: Cache the processed data into a pickle file + pass + + except Exception as e: + pass + + +DataLoaderExperiment = Experiment diff --git a/rdagent/scenarios/data_science/experiment/__init__.py b/rdagent/scenarios/data_science/experiment/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/experiment/kaggle_experiment.py b/rdagent/scenarios/data_science/experiment/kaggle_experiment.py new file mode 100644 index 000000000..28b795e83 --- /dev/null +++ b/rdagent/scenarios/data_science/experiment/kaggle_experiment.py @@ -0,0 +1,103 @@ +from copy import deepcopy +from pathlib import Path + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.factor_coder.factor import ( + FactorFBWorkspace, + FactorTask, + FeatureExperiment, +) +from rdagent.components.coder.model_coder.model import ( + ModelExperiment, + ModelFBWorkspace, + ModelTask, +) +from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( + DataLoaderExperiment, + DataLoaderFBWorkspace, + DataLoaderTask, +) + +from rdagent.scenarios.data_science.experiment.workspace import KGFBWorkspace + +KG_MODEL_TYPE_XGBOOST = "XGBoost" 
+KG_MODEL_TYPE_RANDOMFOREST = "RandomForest" +KG_MODEL_TYPE_LIGHTGBM = "LightGBM" +KG_MODEL_TYPE_NN = "NN" + +KG_MODEL_MAPPING = { + KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py", + KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py", + KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py", + KG_MODEL_TYPE_NN: "model/model_nn.py", +} + +KG_SELECT_MAPPING = { + KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py", + KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py", + KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py", + KG_MODEL_TYPE_NN: "model/select_nn.py", +} + +class KGDataLoaderExperiment(DataLoaderExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]): + # TODO: complete the implementation + def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: + super().__init__(*args, **kwargs) + # TODO: It seems there are some problems as the folder has not been created. + # self.experiment_workspace = KGFBWorkspace( + # template_folder_path=Path(__file__).resolve() + # / Path(DS_RD_SETTING.template_path).resolve() + # / DS_RD_SETTING.competition + # ) + + +class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]): + def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.experiment_workspace = KGFBWorkspace( + template_folder_path=Path(__file__).resolve() + / Path(DS_RD_SETTING.template_path).resolve() + / DS_RD_SETTING.competition + ) + if len(self.based_experiments) > 0: + self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + self.experiment_workspace.data_description = deepcopy( + self.based_experiments[-1].experiment_workspace.data_description + ) + else: + self.experiment_workspace.data_description = [ + ( + FactorTask( + factor_name="Original features", + factor_description="The original features", + factor_formulation="", + ).get_task_information(), + source_feature_size, + ) + ] + + +class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]): + def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.experiment_workspace = KGFBWorkspace( + template_folder_path=Path(__file__).resolve() + / Path(DS_RD_SETTING.template_path).resolve() + / DS_RD_SETTING.competition + ) + if len(self.based_experiments) > 0: + self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + self.experiment_workspace.data_description = deepcopy( + self.based_experiments[-1].experiment_workspace.data_description + ) + else: + self.experiment_workspace.data_description = [ + ( + FactorTask( + factor_name="Original features", + factor_description="The original features", + factor_formulation="", + ).get_task_information(), + source_feature_size, + ) + ] diff --git a/rdagent/scenarios/data_science/experiment/workspace.py b/rdagent/scenarios/data_science/experiment/workspace.py new file mode 100644 index 000000000..1bfaaf866 --- /dev/null +++ b/rdagent/scenarios/data_science/experiment/workspace.py @@ -0,0 +1,97 @@ +import subprocess +import zipfile +from pathlib import Path +from typing import Any, List, Tuple + +import pandas as pd + +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.core.experiment import FBWorkspace +from rdagent.log import rdagent_logger as logger +from rdagent.utils.env import KGDockerEnv + +KG_FEATURE_PREPROCESS_SCRIPT = """import pickle + +from fea_share_preprocess 
import preprocess_script + +X_train, X_valid, y_train, y_valid, X_test, *others = preprocess_script() + +pickle.dump(X_train, open("X_train.pkl", "wb")) +pickle.dump(X_valid, open("X_valid.pkl", "wb")) +pickle.dump(y_train, open("y_train.pkl", "wb")) +pickle.dump(y_valid, open("y_valid.pkl", "wb")) +pickle.dump(X_test, open("X_test.pkl", "wb")) +pickle.dump(others, open("others.pkl", "wb")) +""" + + +class KGFBWorkspace(FBWorkspace): + def __init__(self, template_folder_path: Path, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.inject_code_from_folder(template_folder_path) + self.data_description: List[Tuple[str, int]] = [] + + @property + def model_description(self) -> dict[str, str]: + model_description = {} + for k, v in self.code_dict.items(): + if k.startswith("model/"): + model_description[k] = v + return model_description + + def generate_preprocess_data( + self, + ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Any]: + kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) + kgde.prepare() + + execute_log, results = kgde.dump_python_code_run_and_get_results( + code=KG_FEATURE_PREPROCESS_SCRIPT, + local_path=str(self.workspace_path), + dump_file_names=[ + "X_train.pkl", + "X_valid.pkl", + "y_train.pkl", + "y_valid.pkl", + "X_test.pkl", + "others.pkl", + ], + running_extra_volume=( + {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} + if KAGGLE_IMPLEMENT_SETTING.competition + else None + ), + ) + if results is None: + logger.error("Feature preprocess failed.") + raise Exception("Feature preprocess failed.") + else: + X_train, X_valid, y_train, y_valid, X_test, others = results + return X_train, X_valid, y_train, y_valid, X_test, *others + + def execute(self, run_env: dict = {}, *args, **kwargs) -> str: + logger.info(f"Running the experiment in {self.workspace_path}") + + kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) + kgde.prepare() + + running_extra_volume = {} + if KAGGLE_IMPLEMENT_SETTING.competition: + running_extra_volume = { + KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input" + } + else: + running_extra_volume = {} + + execute_log = kgde.run( + local_path=str(self.workspace_path), + env=run_env, + running_extra_volume=running_extra_volume, + ) + + csv_path = self.workspace_path / "submission_score.csv" + + if not csv_path.exists(): + logger.error(f"File {csv_path} does not exist.") + return None + return pd.read_csv(csv_path, index_col=0).iloc[:, 0] diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index e1b3a013f..72bb23e87 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -5,6 +5,12 @@ from rdagent.core.proposal import ExpGen, Trace from rdagent.core.scenario import Scenario from rdagent.utils.agent.tpl import T +from rdagent.oai.llm_utils import APIBackend +from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( + DataLoaderExperiment, + DataLoaderFBWorkspace, + DataLoaderTask, +) COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Workflow", "Ensemble"] ORDER = COMPONENT.__args__ @@ -34,8 +40,19 @@ def gen(self, trace: Trace) -> Experiment: if o in self.complete_component: continue elif o == "DataLoadSpec": - system = T(".prompts:DataLoadSpec.system").r() - user = T(".prompts:DataLoadSpec.user").r() + # TODO return a description of 
the data loading task + system = T(".prompts:DataLoaderSpec.system").r() + user = T(".prompts:DataLoaderSpec.user").r() + data_load_exp = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user, + system_prompt=system, + json_mode=True, + ) + dlt = DataLoaderTask(name="DataLoaderTask", description=data_load_exp) + exp = DataLoaderExperiment( + sub_tasks=[dlt], + ) + return exp else: ... # two components return super().gen(trace) diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index a5f3a6527..8340d5883 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -1,3 +1,3 @@ -DataLoadSpec: +DataLoaderSpec: system: -| - user: -| + user: -| \ No newline at end of file diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index c58d9f3fe..ac4552287 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -83,7 +83,7 @@ def source_data(self) -> str: # - If we have not implement load data and dump cache # - describe the raw data # phase2: (cache detected) - # - Describe the cached data. + # - Describe the cached data (preprocessed data). return "!!!!!!!!! I'm the fake source data !!!!!!!!" raise NotImplementedError(f"We are not sure how it is called. We place a exception here") diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index c73ae099f..ec5072efb 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -175,6 +175,16 @@ model_experiment_output_format: |- } Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. +data_loader_experiment_output_format: |- + According to the hypothesis, please help user design one data loader task. + The output should follow JSON format. The schema is as follows: + { + "data loader name": { + "description": "description of the overall data loader for the data science workflow", + } + # Don't add ellipsis (...) or any filler text that might cause JSON parsing errors here! + } + kg_feedback_generation_user: |- We are in a process of finding and validating hypotheses to build a powerful model. Each round aims to confirm or reject hypotheses based on results. 
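The new data_loader_experiment_output_format entry constrains the proposal output to a single-key JSON object. The sketch below shows a hypothetical response that satisfies this schema and the kind of parsing a proposal step could then apply; the description text is invented for illustration, and the import path follows this patch (the module is renamed to exp.py later in the series):

import json

from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import DataLoaderTask

# Hypothetical LLM answer that follows data_loader_experiment_output_format.
response = """
{
    "csv data loader": {
        "description": "Load the raw train/test CSV files, split a validation fold and cache the preprocessed arrays as pickles."
    }
}
"""

response_dict = json.loads(response)
name, body = next(iter(response_dict.items()))
task = DataLoaderTask(name=name, description=body["description"])
print(task.get_task_information())

Note that DSExpGen in this patch still passes the raw completion string straight into DataLoaderTask as the description, so a parsing step of this kind is not wired in yet.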
From a5a0d7017d78c146bda48d06129a1c0a3848702e Mon Sep 17 00:00:00 2001 From: WinstonLiye <1957922024@qq.com> Date: Wed, 27 Nov 2024 08:16:25 +0000 Subject: [PATCH 005/304] ci --- rdagent/app/data_science/conf.py | 4 ++- rdagent/app/data_science/loop.py | 24 ++++++++-------- rdagent/app/kaggle/loop.py | 12 ++++++-- .../raw_data_loader/raw_data_loader.py | 3 +- rdagent/components/workflow/rd_loop.py | 7 +++-- rdagent/core/evaluation.py | 3 +- rdagent/core/experiment.py | 5 +++- rdagent/core/scenario.py | 3 +- .../data_mining/developer/feedback.py | 2 +- .../experiment/kaggle_experiment.py | 12 ++++---- .../data_science/proposal/exp_gen.py | 23 +++++++-------- .../scenarios/data_science/scen/__init__.py | 1 - rdagent/scenarios/data_science/scen/scen.py | 28 ++++++++++++------- .../scenarios/kaggle/developer/feedback.py | 2 +- rdagent/scenarios/kaggle/kaggle_crawler.py | 4 ++- rdagent/scenarios/qlib/developer/feedback.py | 2 +- .../scenarios/qlib/proposal/model_proposal.py | 2 +- 17 files changed, 82 insertions(+), 55 deletions(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 72c4eeb7a..7bfd23cf3 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -66,7 +66,9 @@ class DataScienceBasePropSetting(BasePropSetting): knowledge_base: str = "" """Knowledge base class, uses 'KGKnowledgeGraph' when advanced graph-based RAG is enabled, otherwise empty.""" - domain_knowledge_path: str = "/data/userdata/share/kaggle/domain_knowledge" # TODO: It should be sth like knowledge_base_kwargs + domain_knowledge_path: str = ( + "/data/userdata/share/kaggle/domain_knowledge" # TODO: It should be sth like knowledge_base_kwargs + ) """Folder storing domain knowledge files in .case format""" knowledge_base_path: str = "kg_graph.pkl" diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index e153c26e0..48e951719 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -1,4 +1,3 @@ - import subprocess from typing import Any, Literal @@ -6,12 +5,12 @@ from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.workflow.conf import BasePropSetting -from rdagent.components.workflow.rd_loop import RDLoop, NextLoopException +from rdagent.components.workflow.rd_loop import NextLoopException, RDLoop from rdagent.core.exception import FactorEmptyError, ModelEmptyError from rdagent.core.proposal import ( + Experiment2Feedback, ExpGen, Hypothesis2Experiment, - Experiment2Feedback, HypothesisGen, Trace, ) @@ -23,7 +22,6 @@ from rdagent.scenarios.kaggle.kaggle_crawler import download_data - class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) @@ -34,7 +32,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) logger.log_object(scen, tag="scenario") - ### shared components in the workflow # TODO: check if + ### shared components in the workflow # TODO: check if knowledge_base = ( import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen) if PROP_SETTING.knowledge_base != "" @@ -48,7 +46,6 @@ def __init__(self, PROP_SETTING: BasePropSetting): # 2) task generation from a complete solution self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) - # self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) # logger.log_object(self.hypothesis_gen, tag="hypothesis generator") # self.hypothesis2experiment: Hypothesis2Experiment = 
import_class(PROP_SETTING.hypothesis2experiment)() @@ -64,7 +61,6 @@ def __init__(self, PROP_SETTING: BasePropSetting): # self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) # logger.log_object(self.model_coder, tag="model coder") - # TODO: now we only need on runner # self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) # logger.log_object(self.feature_runner, tag="feature runner") @@ -87,7 +83,10 @@ def direct_exp_gen(self, prev_out: dict[str, Any]): @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + if prev_out["direct_exp_gen"]["propose"].action in [ + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + ]: exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) @@ -102,7 +101,10 @@ def running(self, prev_out: dict[str, Any]): raise NextLoopExcpetion() with logger.tag("ef"): # evaluate and feedback - if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + if prev_out["direct_exp_gen"]["propose"].action in [ + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + ]: exp = self.feature_runner.develop(prev_out["coding"]) else: exp = self.model_runner.develop(prev_out["coding"]) @@ -112,9 +114,7 @@ def running(self, prev_out: dict[str, Any]): "covid19-global-forecasting-week-1", ]: try: - python_files_to_notebook( - DS_RD_SETTING.competition, exp.experiment_workspace.workspace_path - ) + python_files_to_notebook(DS_RD_SETTING.competition, exp.experiment_workspace.workspace_path) except Exception as e: logger.error(f"Merge python files to one file failed: {e}") if DS_RD_SETTING.auto_submit: diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index 1962e0931..fa822d354 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -9,8 +9,8 @@ from rdagent.core.developer import Developer from rdagent.core.exception import FactorEmptyError, ModelEmptyError from rdagent.core.proposal import ( - Hypothesis2Experiment, Experiment2Feedback, + Hypothesis2Experiment, HypothesisGen, ) from rdagent.core.scenario import Scenario @@ -63,7 +63,10 @@ def __init__(self, PROP_SETTING: BasePropSetting): @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + if prev_out["direct_exp_gen"]["propose"].action in [ + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + ]: exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) @@ -75,7 +78,10 @@ def coding(self, prev_out: dict[str, Any]): @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback - if prev_out["direct_exp_gen"]["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + if prev_out["direct_exp_gen"]["propose"].action in [ + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + ]: exp = 
self.feature_runner.develop(prev_out["coding"]) else: exp = self.model_runner.develop(prev_out["coding"]) diff --git a/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py b/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py index b6e0c5d4d..18d548d5c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py +++ b/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py @@ -12,6 +12,7 @@ # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace + class DataLoaderTask(CoSTEERTask): def __init__( self, @@ -20,7 +21,7 @@ def __init__( *args, **kwargs, ) -> None: - super().__init__(name=name,desc=description, *args, **kwargs) + super().__init__(name=name, desc=description, *args, **kwargs) def get_task_information(self): task_desc = f"""name: {self.name} diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py index b489595d3..deb79093f 100644 --- a/rdagent/components/workflow/rd_loop.py +++ b/rdagent/components/workflow/rd_loop.py @@ -8,9 +8,9 @@ from rdagent.components.workflow.conf import BasePropSetting from rdagent.core.developer import Developer from rdagent.core.proposal import ( + Experiment2Feedback, Hypothesis, Hypothesis2Experiment, - Experiment2Feedback, HypothesisGen, Trace, ) @@ -23,6 +23,7 @@ class NextLoopException(Exception): """TODO: should we place in in rdagent/core/exception.py?""" + pass @@ -87,7 +88,9 @@ def running(self, prev_out: dict[str, Any]): @measure_time def feedback(self, prev_out: dict[str, Any]): - feedback = self.summarizer.generate_feedback(prev_out["running"], prev_out["direct_exp_gen"]["propose"], self.trace) + feedback = self.summarizer.generate_feedback( + prev_out["running"], prev_out["direct_exp_gen"]["propose"], self.trace + ) with logger.tag("ef"): # evaluate and feedback logger.log_object(feedback, tag="feedback") self.trace.hist.append((prev_out["direct_exp_gen"]["propose"], prev_out["running"], feedback)) diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py index cf3fc2e2e..4c0af18ca 100644 --- a/rdagent/core/evaluation.py +++ b/rdagent/core/evaluation.py @@ -1,5 +1,6 @@ -from abc import ABC, abstractmethod import typing +from abc import ABC, abstractmethod + from rdagent.core.scenario import Scenario if typing.TYPE_CHECKING: diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index c3e0bc7c3..abd456856 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -1,9 +1,9 @@ from __future__ import annotations import os -import typing import platform import shutil +import typing import uuid from abc import ABC, abstractmethod from collections.abc import Sequence @@ -12,6 +12,7 @@ from typing import Any, Generic, Optional, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS + if typing.TYPE_CHECKING: from rdagent.core.proposal import Hypothesis @@ -36,6 +37,7 @@ def get_task_information(self) -> str: Get the task information string to build the unique key """ + class Task(AbsTask): def __init__(self, name: str, version: int = 1, desc: str = "") -> None: super().__init__(name, version) @@ -44,6 +46,7 @@ def __init__(self, name: str, version: int = 1, desc: str = "") -> None: def get_task_information(self) -> str: return f"{self.name}_{self.version}: {self.desc}" + ASpecificTask = TypeVar("ASpecificTask", bound=Task) diff --git a/rdagent/core/scenario.py b/rdagent/core/scenario.py index 7776d02a5..0fe2dfbf5 100644 --- 
a/rdagent/core/scenario.py +++ b/rdagent/core/scenario.py @@ -8,6 +8,7 @@ class Scenario(ABC): We should include scenario information here. Following inform should not be included - method related (e.g. rag... config for a concrete module) """ + @property @abstractmethod def background(self) -> str: @@ -28,7 +29,7 @@ def source_data(self) -> str: A convenient shortcut for describing source data """ return self.get_source_data_desc() - + # NOTE: we should keep the interface simpler. So some previous interfaces are deleted. # If we need some specific function only used in the subclass(no exeternal usage). We should not set them in the base class diff --git a/rdagent/scenarios/data_mining/developer/feedback.py b/rdagent/scenarios/data_mining/developer/feedback.py index ec278c6f7..d87862271 100644 --- a/rdagent/scenarios/data_mining/developer/feedback.py +++ b/rdagent/scenarios/data_mining/developer/feedback.py @@ -9,8 +9,8 @@ from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( - Hypothesis, Experiment2Feedback, + Hypothesis, HypothesisFeedback, Trace, ) diff --git a/rdagent/scenarios/data_science/experiment/kaggle_experiment.py b/rdagent/scenarios/data_science/experiment/kaggle_experiment.py index 28b795e83..d641a77e5 100644 --- a/rdagent/scenarios/data_science/experiment/kaggle_experiment.py +++ b/rdagent/scenarios/data_science/experiment/kaggle_experiment.py @@ -2,6 +2,11 @@ from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( + DataLoaderExperiment, + DataLoaderFBWorkspace, + DataLoaderTask, +) from rdagent.components.coder.factor_coder.factor import ( FactorFBWorkspace, FactorTask, @@ -12,12 +17,6 @@ ModelFBWorkspace, ModelTask, ) -from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( - DataLoaderExperiment, - DataLoaderFBWorkspace, - DataLoaderTask, -) - from rdagent.scenarios.data_science.experiment.workspace import KGFBWorkspace KG_MODEL_TYPE_XGBOOST = "XGBoost" @@ -39,6 +38,7 @@ KG_MODEL_TYPE_NN: "model/select_nn.py", } + class KGDataLoaderExperiment(DataLoaderExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]): # TODO: complete the implementation def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 72bb23e87..8fcce50ef 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -1,16 +1,17 @@ from argparse import ONE_OR_MORE from typing import Literal + +from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( + DataLoaderExperiment, + DataLoaderFBWorkspace, + DataLoaderTask, +) from rdagent.components.proposal import LLMHypothesis2Experiment, LLMHypothesisGen from rdagent.core.experiment import Experiment from rdagent.core.proposal import ExpGen, Trace from rdagent.core.scenario import Scenario -from rdagent.utils.agent.tpl import T from rdagent.oai.llm_utils import APIBackend -from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( - DataLoaderExperiment, - DataLoaderFBWorkspace, - DataLoaderTask, -) +from rdagent.utils.agent.tpl import T COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Workflow", "Ensemble"] ORDER = COMPONENT.__args__ @@ -18,6 +19,7 @@ class DSExpGen(ExpGen): """Data Science 
Task Generator.""" + def __init__(self, scen: Scenario) -> None: self.complete_component: set[COMPONENT] = set() # Initialize as an empty set super().__init__(scen) @@ -25,7 +27,7 @@ def __init__(self, scen: Scenario) -> None: def is_complete(self): """is all components complete""" # TODO: place it into ExpGen - return self.complete_component == set(COMPONENT.__args__) + return self.complete_component == set(COMPONENT.__args__) def gen(self, trace: Trace) -> Experiment: if self.is_complete(): @@ -35,14 +37,14 @@ def gen(self, trace: Trace) -> Experiment: LLMHypothesisGen LLMHypothesis2Experiment else: - # + # for o in ORDER: if o in self.complete_component: continue elif o == "DataLoadSpec": # TODO return a description of the data loading task system = T(".prompts:DataLoaderSpec.system").r() - user = T(".prompts:DataLoaderSpec.user").r() + user = T(".prompts:DataLoaderSpec.user").r() data_load_exp = APIBackend().build_messages_and_create_chat_completion( user_prompt=user, system_prompt=system, @@ -54,6 +56,5 @@ def gen(self, trace: Trace) -> Experiment: ) return exp else: - ... # two components + ... # two components return super().gen(trace) - diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index ef874a671..29324c02a 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -1,4 +1,3 @@ from .scen import DataScienceScen - __all__ = ["DataScienceScen"] diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index ac4552287..7bc9aa847 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -1,11 +1,17 @@ +import json + from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.experiment import Task from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.kaggle.experiment.scenario import prompt_dict as kaggle_prompt_dict -from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions, leaderboard_scores +from rdagent.scenarios.kaggle.experiment.scenario import ( + prompt_dict as kaggle_prompt_dict, +) +from rdagent.scenarios.kaggle.kaggle_crawler import ( + crawl_descriptions, + leaderboard_scores, +) from rdagent.utils.agent.tpl import T -import json class DataScienceScen(Scenario): @@ -27,9 +33,9 @@ def __init__(self, competition: str) -> None: def _analysis_competition_description(self): sys_prompt = T("scenarios.kaggle.experiment.prompts:kg_description_template.system").r() user_prompt = T("scenarios.kaggle.experiment.prompts:kg_description_template.user").r( - competition_descriptions=self.competition_descriptions, - raw_data_information=self.source_data, - evaluation_metric_direction=self.evaluation_metric_direction, + competition_descriptions=self.competition_descriptions, + raw_data_information=self.source_data, + evaluation_metric_direction=self.evaluation_metric_direction, ) response_analysis = APIBackend().build_messages_and_create_chat_completion( @@ -43,11 +49,13 @@ def _analysis_competition_description(self): self.competition_description = response_json_analysis.get("Competition Description", "No description provided") self.target_description = response_json_analysis.get("Target Description", "No target provided") self.competition_features = response_json_analysis.get("Competition Features", "No features provided") - self.submission_specifications = 
response_json_analysis.get("Submission Specifications", - "No submission requirements provided") + self.submission_specifications = response_json_analysis.get( + "Submission Specifications", "No submission requirements provided" + ) self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) - self.evaluation_desc = response_json_analysis.get("Evaluation Description", - "No evaluation specification provided.") + self.evaluation_desc = response_json_analysis.get( + "Evaluation Description", "No evaluation specification provided." + ) def get_competition_full_desc(self) -> str: evaluation_direction = "higher the better" if self.evaluation_metric_direction else "lower the better" diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index 37077a6e9..d3dcee949 100644 --- a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -8,8 +8,8 @@ from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( - Hypothesis, Experiment2Feedback, + Hypothesis, HypothesisFeedback, Trace, ) diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 7f2a31722..02f680afd 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -31,7 +31,9 @@ service = Service("/usr/local/bin/chromedriver") -def crawl_descriptions(competition: str, local_data_path: str, wait: float = 3.0, force: bool = False) -> dict[str, str]: +def crawl_descriptions( + competition: str, local_data_path: str, wait: float = 3.0, force: bool = False +) -> dict[str, str]: if (fp := Path(f"{local_data_path}/{competition}.json")).exists() and not force: logger.info(f"Found {competition}.json, loading from local file.") with fp.open("r") as f: diff --git a/rdagent/scenarios/qlib/developer/feedback.py b/rdagent/scenarios/qlib/developer/feedback.py index 240c67d0f..8d014fe63 100644 --- a/rdagent/scenarios/qlib/developer/feedback.py +++ b/rdagent/scenarios/qlib/developer/feedback.py @@ -7,8 +7,8 @@ from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( - Hypothesis, Experiment2Feedback, + Hypothesis, HypothesisFeedback, Trace, ) diff --git a/rdagent/scenarios/qlib/proposal/model_proposal.py b/rdagent/scenarios/qlib/proposal/model_proposal.py index 98ec33b65..c519fddb3 100644 --- a/rdagent/scenarios/qlib/proposal/model_proposal.py +++ b/rdagent/scenarios/qlib/proposal/model_proposal.py @@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b "RAG": None, }, True - def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: response_dict = json.loads(response) tasks = [] for model_name in response_dict: From 9874fbb61acd30599202c6694b3f6ff21ccc6761 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 27 Nov 2024 08:27:32 +0000 Subject: [PATCH 006/304] fix Task __init__ name bug --- rdagent/core/experiment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index c3e0bc7c3..c07dcc7ea 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -37,9 +37,9 @@ def get_task_information(self) -> str: """ class 
Task(AbsTask): - def __init__(self, name: str, version: int = 1, desc: str = "") -> None: + def __init__(self, name: str, version: int = 1, description: str = "") -> None: super().__init__(name, version) - self.description = desc + self.description = description def get_task_information(self) -> str: return f"{self.name}_{self.version}: {self.desc}" From a9b04ff30b102f80bfb4a96bdd1c2236ca5b09b1 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 27 Nov 2024 08:53:08 +0000 Subject: [PATCH 007/304] changes about source data in scenario --- rdagent/scenarios/data_science/scen/scen.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 7bc9aa847..9e7a323f1 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -1,4 +1,5 @@ import json +from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.experiment import Task @@ -87,11 +88,16 @@ def background(self) -> str: def source_data(self) -> str: # TODO: remove me if not used # TODO: (bowen) - # phase1: - # - If we have not implement load data and dump cache - # - describe the raw data - # phase2: (cache detected) - # - Describe the cached data (preprocessed data). + if Path(f"{DS_RD_SETTING.local_data_path}/{DS_RD_SETTING.competition}/cache").exists(): + # phase2: (cache detected) + # - Describe the cached data (preprocessed data). + pass + else: + # phase1: + # - If we have not implement load data and dump cache + # - describe the raw data + return self.competition_descriptions['Data Description'] + return "!!!!!!!!! I'm the fake source data !!!!!!!!" raise NotImplementedError(f"We are not sure how it is called. We place a exception here") From 4d782c0f632f81f390c11c6168c9e22e4d61186e Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 27 Nov 2024 10:27:53 +0000 Subject: [PATCH 008/304] add more stubs --- .../coder/CoSTEER/evolving_strategy.py | 8 ++- .../data_science/raw_data_loader/__init__.py | 63 +++++++++++++------ .../data_science/raw_data_loader/conf.py | 0 .../data_science/raw_data_loader/eval.py | 2 + .../{raw_data_loader.py => exp.py} | 0 .../data_science/raw_data_loader/test.py | 14 +++++ rdagent/core/experiment.py | 2 +- .../data_science/proposal/exp_gen.py | 17 ++--- test/utils/coder/test_CoSTEER.py | 31 +++++++++ 9 files changed, 108 insertions(+), 29 deletions(-) create mode 100644 rdagent/components/coder/data_science/raw_data_loader/conf.py create mode 100644 rdagent/components/coder/data_science/raw_data_loader/eval.py rename rdagent/components/coder/data_science/raw_data_loader/{raw_data_loader.py => exp.py} (100%) create mode 100644 rdagent/components/coder/data_science/raw_data_loader/test.py create mode 100644 test/utils/coder/test_CoSTEER.py diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py index c7126e7ff..5c0551ec5 100644 --- a/rdagent/components/coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/CoSTEER/evolving_strategy.py @@ -30,7 +30,13 @@ def implement_one_task( self, target_task: Task, queried_knowledge: QueriedKnowledge = None, - ) -> Workspace: + ) -> dict[str, str]: # FIXME: fix interface of previous implement + """ + + Return + ------ + The new files {: } to update the workspace. 
+ """ raise NotImplementedError def select_one_round_tasks( diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 358046de1..c8f951085 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -8,24 +8,49 @@ Extra feature: - cache + + +File structure +- ___init__.py: the entrance/agent of coder +- evaluator.py +- conf.py +- exp.py: everything under the experiment, e.g. + - Task + - Experiment + - Workspace +- test.py + - Each coder could be tested. """ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class DataLoaderCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# DataLoaderCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = DataLoaderMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import MultiProcessEvolvingStrategy +from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledge +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.core.scenario import Scenario + +class DataLoaderMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: DataLoaderTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + ) -> str: + ... # prompting + # return a workspace with "load_data.py", "spec/load_data.md" inside + # assign the implemented code to the new workspace. + + +class DataLoaderCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + # DataLoaderCoSTEEREvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + es = DataLoaderMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/raw_data_loader/conf.py b/rdagent/components/coder/data_science/raw_data_loader/conf.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py new file mode 100644 index 000000000..bc8318385 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -0,0 +1,2 @@ +# tess successfully running. +# (GPT) if it aligns with the spec & rationality of the spec. 
diff --git a/rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py similarity index 100% rename from rdagent/components/coder/data_science/raw_data_loader/raw_data_loader.py rename to rdagent/components/coder/data_science/raw_data_loader/exp.py diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py new file mode 100644 index 000000000..57c441ebe --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -0,0 +1,14 @@ +""" +Helper functions for testing the raw_data_loader coder(CoSTEER-based) component. + +It is NOT: +- it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) +""" + + +def build_dummpy_exp(): # -> experiment + ... + + +def get_developer(): + ... diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index e06813cff..a8b3111b7 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -44,7 +44,7 @@ def __init__(self, name: str, version: int = 1, description: str = "") -> None: self.description = description def get_task_information(self) -> str: - return f"{self.name}_{self.version}: {self.desc}" + return f"{self.name}_{self.version}: {self.description}" ASpecificTask = TypeVar("ASpecificTask", bound=Task) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 8fcce50ef..b259d11e9 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -40,17 +40,18 @@ def gen(self, trace: Trace) -> Experiment: # for o in ORDER: if o in self.complete_component: + # we already have the component, the skip continue elif o == "DataLoadSpec": # TODO return a description of the data loading task - system = T(".prompts:DataLoaderSpec.system").r() - user = T(".prompts:DataLoaderSpec.user").r() - data_load_exp = APIBackend().build_messages_and_create_chat_completion( - user_prompt=user, - system_prompt=system, - json_mode=True, - ) - dlt = DataLoaderTask(name="DataLoaderTask", description=data_load_exp) + # system = T(".prompts:DataLoaderSpec.system").r() + # user = T(".prompts:DataLoaderSpec.user").r() + # data_load_exp = APIBackend().build_messages_and_create_chat_completion( + # user_prompt=user, + # system_prompt=system, + # json_mode=True, + # ) + dlt = DataLoaderTask(name="DataLoaderTask", description="") exp = DataLoaderExperiment( sub_tasks=[dlt], ) diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py new file mode 100644 index 000000000..e38965354 --- /dev/null +++ b/test/utils/coder/test_CoSTEER.py @@ -0,0 +1,31 @@ +import unittest + + +class CoSTEERTest(unittest.TestCase): + + def setUp(self): + pass + + def tearDown(self): + pass + + def to_str(self, obj): + return "".join(str(obj).split()) + + def test_data_loader(self): + # 1) Build the data loader task/experiment + # 2) build an according CoSTEER + # 3) test the results + # - check spec.md + # - check data_loader.py + pass + + def test_model(self): + # 1) Build the model experiment/task/workspace from tpl_ex + # 2) build an according CoSTEER + # 3) test the results + pass + + +if __name__ == "__main__": + unittest.main() From 69a861d5ce1f44b94e51267982bbaefad076c601 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 28 Nov 2024 04:13:34 +0000 Subject: [PATCH 009/304] fix bug --- 
rdagent/components/coder/data_science/raw_data_loader/exp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py index 18d548d5c..2c7a4b4f5 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/exp.py +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -21,7 +21,7 @@ def __init__( *args, **kwargs, ) -> None: - super().__init__(name=name, desc=description, *args, **kwargs) + super().__init__(name=name, description=description, *args, **kwargs) def get_task_information(self): task_desc = f"""name: {self.name} From 87e83080e6e8a5d1717ea500d69a1d30602d6f19 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 28 Nov 2024 09:28:13 +0000 Subject: [PATCH 010/304] small changes --- rdagent/app/data_science/conf.py | 15 ++++++----- rdagent/app/data_science/loop.py | 25 ++++++++++--------- .../data_science/raw_data_loader/eval.py | 2 -- .../raw_data_loader/evaluators.py | 20 +++++++++++++++ .../data_science/raw_data_loader/test.py | 6 ++++- .../data_science/proposal/exp_gen.py | 2 +- 6 files changed, 48 insertions(+), 22 deletions(-) delete mode 100644 rdagent/components/coder/data_science/raw_data_loader/eval.py create mode 100644 rdagent/components/coder/data_science/raw_data_loader/evaluators.py diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 7bfd23cf3..350a38759 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -21,14 +21,17 @@ class DataScienceBasePropSetting(BasePropSetting): # """Hypothesis to experiment class""" ## dev/coder - feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER" - """Feature Coder class""" + data_loader_coder: str = "rdagent.components.coder.data_science.raw_data_loader.DataLoaderCoSTEER" + """Data Loader CoSTEER""" + + # feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER" + # """Feature Coder class""" - model_feature_selection_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelFeatureSelectionCoder" - """Model Feature Selection Coder class""" + # model_feature_selection_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelFeatureSelectionCoder" + # """Model Feature Selection Coder class""" - model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER" - """Model Coder class""" + # model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER" + # """Model Coder class""" ## dev/runner feature_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGFactorRunner" diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 48e951719..819e17510 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -20,7 +20,7 @@ from rdagent.log.time import measure_time from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook from rdagent.scenarios.kaggle.kaggle_crawler import download_data - +from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) @@ -45,7 +45,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): # 2) task generation from a complete solution self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) - + self.data_loader_coder: DataLoaderCoSTEER = import_class(PROP_SETTING.data_loader_coder)(scen) # self.hypothesis_gen: HypothesisGen = 
import_class(PROP_SETTING.hypothesis_gen)(scen) # logger.log_object(self.hypothesis_gen, tag="hypothesis generator") # self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() @@ -83,16 +83,17 @@ def direct_exp_gen(self, prev_out: dict[str, Any]): @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - if prev_out["direct_exp_gen"]["propose"].action in [ - KG_ACTION_FEATURE_ENGINEERING, - KG_ACTION_FEATURE_PROCESSING, - ]: - exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: - exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - else: - exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - logger.log_object(exp.sub_workspace_list, tag="coder result") + exp = self.data_loader_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + # if prev_out["direct_exp_gen"]["propose"].action in [ + # KG_ACTION_FEATURE_ENGINEERING, + # KG_ACTION_FEATURE_PROCESSING, + # ]: + # exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + # elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: + # exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + # else: + # exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + # logger.log_object(exp.sub_workspace_list, tag="coder result") return exp @measure_time diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py deleted file mode 100644 index bc8318385..000000000 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ /dev/null @@ -1,2 +0,0 @@ -# tess successfully running. -# (GPT) if it aligns with the spec & rationality of the spec. diff --git a/rdagent/components/coder/data_science/raw_data_loader/evaluators.py b/rdagent/components/coder/data_science/raw_data_loader/evaluators.py new file mode 100644 index 000000000..8807a0b70 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/evaluators.py @@ -0,0 +1,20 @@ +# tess successfully running. +# (GPT) if it aligns with the spec & rationality of the spec. +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERSingleFeedback, + CoSTEERMultiFeedback +) +from rdagent.core.experiment import Task, Workspace +from rdagent.core.evolving_framework import QueriedKnowledge + +class DataLoaderCoSTEEREvaluator(CoSTEEREvaluator): + def evaluate( + self, + target_task: Task, + implementation: Workspace, + gt_implementation: Workspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> CoSTEERSingleFeedback: + \ No newline at end of file diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py index 57c441ebe..f6a61b1ad 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -5,9 +5,13 @@ - it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) """ +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask, DataLoaderExperiment def build_dummpy_exp(): # -> experiment - ... 
+ dlt = DataLoaderTask(name="DataLoaderTask", description="") + exp = DataLoaderExperiment( + sub_tasks=[dlt], + ) def get_developer(): diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index b259d11e9..22c7082e1 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -1,7 +1,7 @@ from argparse import ONE_OR_MORE from typing import Literal -from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( +from rdagent.components.coder.data_science.raw_data_loader.exp import ( DataLoaderExperiment, DataLoaderFBWorkspace, DataLoaderTask, From cc1dae1d874da6678afaf0093a29914f9fabfd2d Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 28 Nov 2024 10:22:31 +0000 Subject: [PATCH 011/304] update some docs --- rdagent/components/coder/CoSTEER/evolving_strategy.py | 3 +++ rdagent/components/coder/CoSTEER/task.py | 1 + rdagent/components/coder/data_science/raw_data_loader/test.py | 1 + rdagent/core/experiment.py | 4 ++++ rdagent/scenarios/data_science/proposal/exp_gen.py | 1 - 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py index 5c0551ec5..ba17851c4 100644 --- a/rdagent/components/coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/CoSTEER/evolving_strategy.py @@ -32,6 +32,9 @@ def implement_one_task( queried_knowledge: QueriedKnowledge = None, ) -> dict[str, str]: # FIXME: fix interface of previous implement """ + This method will input the task & current workspace, + and output the modification to applied to the workspace. + (i.e. replace the content with ) Return ------ diff --git a/rdagent/components/coder/CoSTEER/task.py b/rdagent/components/coder/CoSTEER/task.py index aaa38a4f1..92e90ed0e 100644 --- a/rdagent/components/coder/CoSTEER/task.py +++ b/rdagent/components/coder/CoSTEER/task.py @@ -4,4 +4,5 @@ class CoSTEERTask(Task): def __init__(self, base_code: str = None, *args, **kwargs) -> None: super().__init__(*args, **kwargs) + # TODO: we may upgrade the base_code into a workspace-like thing to know previous. self.base_code = base_code diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py index f6a61b1ad..862de32a9 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -1,5 +1,6 @@ """ Helper functions for testing the raw_data_loader coder(CoSTEER-based) component. +- Does the developer loop work correctly It is NOT: - it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index a8b3111b7..b2bea270f 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -223,7 +223,11 @@ def __init__( self.hypothesis: Optional["Hypothesis"] = hypothesis # Experiment is opptionally generated by hypothesis self.sub_tasks: Sequence[ASpecificTask] = sub_tasks self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks) + # TODO: + # It will be used in runner in history + # If we implement the whole workflow, we don't have to use it, then we remove it. 
self.based_experiments: Sequence[ASpecificWSForExperiment] = based_experiments + self.result: object = None # The result of the experiment, can be different types in different scenarios. self.sub_results: dict[str, float] = {} self.experiment_workspace: ASpecificWSForExperiment | None = None diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 22c7082e1..45b900b67 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -37,7 +37,6 @@ def gen(self, trace: Trace) -> Experiment: LLMHypothesisGen LLMHypothesis2Experiment else: - # for o in ORDER: if o in self.complete_component: # we already have the component, the skip From ea5dc120bf8b6f86872452515f23d513ba8abc57 Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 29 Nov 2024 05:37:42 +0000 Subject: [PATCH 012/304] Interface --- .../coder/data_science/model/eval.py | 61 +++++++++++++++++++ .../aerial-cactus-identification/spec.md | 12 +++- 2 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 rdagent/components/coder/data_science/model/eval.py diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py new file mode 100644 index 000000000..fffc3f511 --- /dev/null +++ b/rdagent/components/coder/data_science/model/eval.py @@ -0,0 +1,61 @@ +""" +Beyond previous tests +- +""" + + +# Below are unit tests for testing the specification of the implemented model ------------------ +# +class XXX1SpecEval: + """ + Motivation case: + - Simplest case, we already split the data into train_data, valid_data, and test_data. We require the model to learn (optionally validate on valid data), and infer on test data. + + Test workflow: + - Build train, valid, and test data to run it, and test the output (e.g., shape, value, etc.) + """ + + +class XXX2SpecEval: + """ + Based on XXX1SpecEval, but considering the following case: + + Motivation case: + - Sometimes we don't need validation (e.g., simple models not prone to overfitting, or data is too scarce to split). + + Test workflow: + - Build train and test data to run it, and test the output (e.g., shape, value, etc.) + - valid_data == None + """ + + +class XXX3SpecEval: + """ + Motivation case: + - We need to tune hyperparameters. + + Test workflow: + - Input: + - Build train and valid data + - test == None + - Hyperparameters are not blank + - Output: + - The early stop hyperparameters must be returned + """ + + +class XXX4SpecEval: + """ + Motivation case: + - After obtaining good hyperparameters, we retrain the model. + + Test workflow: + - Test1: Since we have already tested it in XXX2SpecEval, we'll focus on another aspect. + - Input: + - Build train and test data + - valid == None + - Previous good hyperparameters (a parameter representing early stop) + - Test2: Ensure the hyperparameters are 1) being used, and 2) the model remains stable. 
+ - Different hyperparameters will yield different results + - Same hyperparameters will yield the same results + """ diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md index 69378e075..623a9a6b7 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md @@ -55,9 +55,15 @@ def feature_eng(X: np.ndarray, y: np.ndarray | None = None, X_fit: np.ndarray | - Implement a function to manage the model workflow with the following signature: ```python -def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params) -> tuple[np.ndarray | None, np.ndarray | None]: +def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params; dict = {}) -> tuple[np.ndarray | None, np.ndarray | None, dict]: """ - Manages the workflow of a machine learning model, including training, validation, and testing. + Manages the workflow of a machine learning model, including training, validation. + The testing&validation's inference is included, as well + + - If test/valid exist, output inference on them + - Follow the hyperparameter if exists. + - the returned hyperparameter should align with the input(except the newly generated early stop) + - If valid exist, add to update the hyperparameter Parameters ---------- @@ -76,7 +82,7 @@ def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y Returns ------- - tuple[np.ndarray | None, np.ndarray | None] + tuple[np.ndarray | None, np.ndarray | None, dict] Predictions on the validation data, predictions on the test data """ ``` From fc4e0632534662e5251d2d6b92de1de910f268f1 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 29 Nov 2024 09:02:23 +0000 Subject: [PATCH 013/304] some changes --- .../coder/data_science/raw_data_loader/exp.py | 25 +---- .../data_science/raw_data_loader/test.py | 18 +++- rdagent/core/experiment.py | 5 +- .../{kaggle_experiment.py => experiment.py} | 63 +++++------ .../data_science/experiment/workspace.py | 102 ++++++------------ rdagent/utils/env.py | 12 +++ test/utils/coder/test_CoSTEER.py | 4 + 7 files changed, 94 insertions(+), 135 deletions(-) rename rdagent/scenarios/data_science/experiment/{kaggle_experiment.py => experiment.py} (60%) diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py index 2c7a4b4f5..495c8751d 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/exp.py +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -8,8 +8,7 @@ from rdagent.core.experiment import Experiment, FBWorkspace from rdagent.core.utils import cache_with_pickle from rdagent.oai.llm_utils import md5_hash -from rdagent.utils.env import KGDockerEnv, QTDockerEnv - +from rdagent.utils.env import DockerEnv, DSDockerConf # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace @@ -38,26 +37,13 @@ def __repr__(self) -> str: class DataLoaderFBWorkspace(FBWorkspace): - def hash_func( - self, - batch_size: int = 8, - num_features: int = 10, - num_timesteps: int = 4, - num_edges: int = 20, - input_value: float = 1.0, - param_init_value: float = 1.0, - ) -> str: - target_file_name = 
f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}" - for code_file_name in sorted(list(self.code_dict.keys())): - target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}" - return md5_hash(target_file_name) - @cache_with_pickle(hash_func) + # TODO: use the cache_with_pickle decorator. def execute(self): super().execute() try: - qtde = QTDockerEnv() if self.target_task.version == 1 else KGDockerEnv() - qtde.prepare() + de = DockerEnv(conf=DSDockerConf()) + de.prepare() # TODO: UNIT TEST for data loader dump_code = (Path(__file__).parent / "data_loader_unit_test.txt").read_text() @@ -68,5 +54,4 @@ def execute(self): except Exception as e: pass - -DataLoaderExperiment = Experiment + return "data_loader.py and spec.md executed successfully", "content of spec.md", "pkl generated by data_loader.py" \ No newline at end of file diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py index 862de32a9..d106c52f1 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -6,14 +6,24 @@ - it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) """ -from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask, DataLoaderExperiment +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER +from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment -def build_dummpy_exp(): # -> experiment +def develop_one_competition(competition: str): # -> experiment + scen = DataScienceScen(competition=competition) + data_loader_coder = DataLoaderCoSTEER(scen) + + # Create the experiment dlt = DataLoaderTask(name="DataLoaderTask", description="") exp = DataLoaderExperiment( sub_tasks=[dlt], ) + # Develop the experiment + exp = data_loader_coder.develop(exp) + -def get_developer(): - ... 
+if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index b2bea270f..e4e363bed 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -105,10 +105,7 @@ def run_pipeline(self, **files: str): def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - self.code_dict: dict[str, Any] = {} - self.code_dict = ( - {} - ) # The code injected into the folder, store them in the variable to reproduce the former result + self.code_dict: dict[str, Any] = {} # The code injected into the folder, store them in the variable to reproduce the former result self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex @property diff --git a/rdagent/scenarios/data_science/experiment/kaggle_experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py similarity index 60% rename from rdagent/scenarios/data_science/experiment/kaggle_experiment.py rename to rdagent/scenarios/data_science/experiment/experiment.py index d641a77e5..5cac06335 100644 --- a/rdagent/scenarios/data_science/experiment/kaggle_experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,60 +1,53 @@ from copy import deepcopy from pathlib import Path +from rdagent.core.experiment import Experiment from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.components.coder.data_science.raw_data_loader.raw_data_loader import ( - DataLoaderExperiment, +from rdagent.components.coder.data_science.raw_data_loader.exp import ( DataLoaderFBWorkspace, DataLoaderTask, ) from rdagent.components.coder.factor_coder.factor import ( FactorFBWorkspace, FactorTask, - FeatureExperiment, ) from rdagent.components.coder.model_coder.model import ( - ModelExperiment, ModelFBWorkspace, ModelTask, ) -from rdagent.scenarios.data_science.experiment.workspace import KGFBWorkspace +from rdagent.scenarios.data_science.experiment.workspace import DSFBWorkspace -KG_MODEL_TYPE_XGBOOST = "XGBoost" -KG_MODEL_TYPE_RANDOMFOREST = "RandomForest" -KG_MODEL_TYPE_LIGHTGBM = "LightGBM" -KG_MODEL_TYPE_NN = "NN" +# KG_MODEL_TYPE_XGBOOST = "XGBoost" +# KG_MODEL_TYPE_RANDOMFOREST = "RandomForest" +# KG_MODEL_TYPE_LIGHTGBM = "LightGBM" +# KG_MODEL_TYPE_NN = "NN" -KG_MODEL_MAPPING = { - KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py", - KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py", - KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py", - KG_MODEL_TYPE_NN: "model/model_nn.py", -} +# KG_MODEL_MAPPING = { +# KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py", +# KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py", +# KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py", +# KG_MODEL_TYPE_NN: "model/model_nn.py", +# } -KG_SELECT_MAPPING = { - KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py", - KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py", - KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py", - KG_MODEL_TYPE_NN: "model/select_nn.py", -} +# KG_SELECT_MAPPING = { +# KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py", +# KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py", +# KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py", +# KG_MODEL_TYPE_NN: "model/select_nn.py", +# } -class KGDataLoaderExperiment(DataLoaderExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]): - # TODO: complete the implementation - def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: + +class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, 
DataLoaderFBWorkspace]): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - # TODO: It seems there are some problems as the folder has not been created. - # self.experiment_workspace = KGFBWorkspace( - # template_folder_path=Path(__file__).resolve() - # / Path(DS_RD_SETTING.template_path).resolve() - # / DS_RD_SETTING.competition - # ) + self.experiment_workspace = DataLoaderFBWorkspace() -class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]): +class ModelExperiment(Experiment[ModelTask, DSFBWorkspace, ModelFBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = KGFBWorkspace( + self.experiment_workspace = DSFBWorkspace( template_folder_path=Path(__file__).resolve() / Path(DS_RD_SETTING.template_path).resolve() / DS_RD_SETTING.competition @@ -77,10 +70,10 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: ] -class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]): +class FactorExperiment(Experiment[FactorTask, DSFBWorkspace, FactorFBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = KGFBWorkspace( + self.experiment_workspace = DSFBWorkspace( template_folder_path=Path(__file__).resolve() / Path(DS_RD_SETTING.template_path).resolve() / DS_RD_SETTING.competition @@ -100,4 +93,4 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: ).get_task_information(), source_feature_size, ) - ] + ] \ No newline at end of file diff --git a/rdagent/scenarios/data_science/experiment/workspace.py b/rdagent/scenarios/data_science/experiment/workspace.py index 1bfaaf866..ed079df89 100644 --- a/rdagent/scenarios/data_science/experiment/workspace.py +++ b/rdagent/scenarios/data_science/experiment/workspace.py @@ -1,97 +1,55 @@ -import subprocess -import zipfile from pathlib import Path from typing import Any, List, Tuple import pandas as pd -from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.experiment import FBWorkspace from rdagent.log import rdagent_logger as logger -from rdagent.utils.env import KGDockerEnv - -KG_FEATURE_PREPROCESS_SCRIPT = """import pickle - -from fea_share_preprocess import preprocess_script - -X_train, X_valid, y_train, y_valid, X_test, *others = preprocess_script() - -pickle.dump(X_train, open("X_train.pkl", "wb")) -pickle.dump(X_valid, open("X_valid.pkl", "wb")) -pickle.dump(y_train, open("y_train.pkl", "wb")) -pickle.dump(y_valid, open("y_valid.pkl", "wb")) -pickle.dump(X_test, open("X_test.pkl", "wb")) -pickle.dump(others, open("others.pkl", "wb")) -""" - - -class KGFBWorkspace(FBWorkspace): - def __init__(self, template_folder_path: Path, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.inject_code_from_folder(template_folder_path) - self.data_description: List[Tuple[str, int]] = [] - - @property - def model_description(self) -> dict[str, str]: - model_description = {} - for k, v in self.code_dict.items(): - if k.startswith("model/"): - model_description[k] = v - return model_description - - def generate_preprocess_data( - self, - ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Any]: - kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) - kgde.prepare() - - execute_log, results = 
kgde.dump_python_code_run_and_get_results( - code=KG_FEATURE_PREPROCESS_SCRIPT, - local_path=str(self.workspace_path), - dump_file_names=[ - "X_train.pkl", - "X_valid.pkl", - "y_train.pkl", - "y_valid.pkl", - "X_test.pkl", - "others.pkl", - ], - running_extra_volume=( - {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} - if KAGGLE_IMPLEMENT_SETTING.competition - else None - ), - ) - if results is None: - logger.error("Feature preprocess failed.") - raise Exception("Feature preprocess failed.") - else: - X_train, X_valid, y_train, y_valid, X_test, others = results - return X_train, X_valid, y_train, y_valid, X_test, *others - - def execute(self, run_env: dict = {}, *args, **kwargs) -> str: +from rdagent.utils.env import DockerEnv, DSDockerConf + + +class DSFBWorkspace(FBWorkspace): + + # TODO: use the cache_with_pickle decorator. + def execute(self, run_env: dict = {}, *args, **kwargs) -> pd.DataFrame: + """ + Executes the experiment(a competition) within the specified workspace. + + Args: + run_env (dict): The runtime environment variables. + + Returns: + pd.DataFrame: Scores of each Model and ensemble Model. + Example: + | Model | | + |-----------------------|--------------------------------| + | model1 | 0.9 | + | model2 | 0.8 | + | | 0.95 | + """ logger.info(f"Running the experiment in {self.workspace_path}") - kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) - kgde.prepare() + de = DockerEnv(DSDockerConf()) + de.prepare() running_extra_volume = {} - if KAGGLE_IMPLEMENT_SETTING.competition: + if DS_RD_SETTING.competition: running_extra_volume = { - KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input" + DS_RD_SETTING.local_data_path + "/" + DS_RD_SETTING.competition: "/kaggle/input" } else: running_extra_volume = {} - execute_log = kgde.run( + execute_log = de.run( local_path=str(self.workspace_path), env=run_env, running_extra_volume=running_extra_volume, ) - csv_path = self.workspace_path / "submission_score.csv" + csv_path = self.workspace_path / "scores.csv" if not csv_path.exists(): logger.error(f"File {csv_path} does not exist.") return None - return pd.read_csv(csv_path, index_col=0).iloc[:, 0] + return pd.read_csv(csv_path, index_col=0) diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index d6f817849..71c1b83ae 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -187,6 +187,18 @@ class KGDockerConf(DockerConf): "48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory ) +class DSDockerConf(DockerConf): + model_config = ExtendedSettingsConfigDict(env_prefix="DS_DOCKER_") + + build_from_dockerfile: bool = False + image: str = "gcr.io/kaggle-gpu-images/python:latest" + mount_path: str = "/kaggle/workspace" + default_entry: str = "python main.py" + + running_timeout_period: int = 600 + mem_limit: str | None = ( + "48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory + ) class MLEBDockerConf(DockerConf): model_config = ExtendedSettingsConfigDict(env_prefix="MLEB_DOCKER_") diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py index e38965354..2da8e6be9 100644 --- a/test/utils/coder/test_CoSTEER.py +++ b/test/utils/coder/test_CoSTEER.py @@ -18,6 +18,10 @@ def test_data_loader(self): # 3) test the results # - check spec.md # - check data_loader.py + from rdagent.components.coder.data_science.raw_data_loader.test import develop_one_competition + + exp = 
develop_one_competition("aerial-cactus-identification") + pass def test_model(self): From 55f3737a7a69241a8f0ddfdf1b9d21ecb3e0d03a Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Fri, 29 Nov 2024 09:06:24 +0000 Subject: [PATCH 014/304] ds_model_initial_changes --- .../coder/data_science/model/eval.py | 31 ++++++++++++++++++- .../coder/data_science/model/exp.py | 0 .../coder/data_science/model/test.py | 8 +++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 rdagent/components/coder/data_science/model/exp.py create mode 100644 rdagent/components/coder/data_science/model/test.py diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index fffc3f511..0ae0760d6 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -2,11 +2,16 @@ Beyond previous tests - """ +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERMultiFeedback, + CoSTEERSingleFeedback, +) # Below are unit tests for testing the specification of the implemented model ------------------ # -class XXX1SpecEval: +class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): """ Motivation case: - Simplest case, we already split the data into train_data, valid_data, and test_data. We require the model to learn (optionally validate on valid data), and infer on test data. @@ -14,6 +19,30 @@ class XXX1SpecEval: Test workflow: - Build train, valid, and test data to run it, and test the output (e.g., shape, value, etc.) """ + def evaluate( + self, + target_task: Task, + implementation: Workspace, + gt_implementation: Workspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> ModelSingleFeedback: + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return ModelSingleFeedback( + execution_feedback="This task has failed too many times, skip implementation.", + shape_feedback="This task has failed too many times, skip implementation.", + value_feedback="This task has failed too many times, skip implementation.", + code_feedback="This task has failed too many times, skip implementation.", + final_feedback="This task has failed too many times, skip implementation.", + final_decision=False, + ) + assert isinstance(target_task, ModelTask) class XXX2SpecEval: diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py new file mode 100644 index 000000000..d8c65059a --- /dev/null +++ b/rdagent/components/coder/data_science/model/test.py @@ -0,0 +1,8 @@ +""" +Generate dataset to test the model workflow output +""" + +# Take tasks, spec.md and feat as input, generate a feedback as output +def DiffSpec(): + + pass \ No newline at end of file From abcf1cf289eb9193efb9cf1d9aff2926e41036b7 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 29 Nov 2024 09:59:45 +0000 Subject: [PATCH 015/304] model base py files --- .../coder/data_science/model/__init__.py | 59 ++++++--- .../coder/data_science/model/exp.py | 121 
++++++++++++++++++ .../model/model_execute_template.txt | 24 ++++ .../coder/data_science/model/prompts.yaml | 5 + .../coder/data_science/model/test.py | 29 ++++- .../data_science/experiment/experiment.py | 40 +++--- test/utils/coder/test_CoSTEER.py | 5 + 7 files changed, 243 insertions(+), 40 deletions(-) create mode 100644 rdagent/components/coder/data_science/model/model_execute_template.txt create mode 100644 rdagent/components/coder/data_science/model/prompts.yaml diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 7d4020cfa..e15692173 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -1,19 +1,46 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.core.scenario import Scenario +from rdagent.components.coder.CoSTEER.evolving_strategy import MultiProcessEvolvingStrategy +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledge +# from rdagent.utils.agent.tpl import T +# T(".prompts:model_generator.user").r() -# class ModelCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# ModelCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) +class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: ModelTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + ) -> str: + return """ + import pandas as pd + def Model(): + pass + """ + + def assign_code_list_to_evo(self, code_list: list, evo) -> None: + """ + Assign the code list to the evolving item. -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. 
+ """ + raise NotImplementedError + + +class ModelCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + # ModelCoSTEEREvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index e69de29bb..ac93d1dd6 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -0,0 +1,121 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.experiment import Experiment, FBWorkspace +from rdagent.core.utils import cache_with_pickle +from rdagent.oai.llm_utils import md5_hash +from rdagent.utils.env import DockerEnv, DSDockerConf +# TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace + + +class ModelTask(CoSTEERTask): + def __init__( + self, + name: str, + description: str, + architecture: str, + *args, + hyperparameters: Dict[str, str], + formulation: str = None, + variables: Dict[str, str] = None, + model_type: Optional[str] = None, + **kwargs, + ) -> None: + self.formulation: str = formulation + self.architecture: str = architecture + self.variables: str = variables + self.hyperparameters: str = hyperparameters + self.model_type: str = ( + model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model + ) + super().__init__(name=name, description=description, *args, **kwargs) + + def get_task_information(self): + task_desc = f"""name: {self.name} +description: {self.description} +""" + task_desc += f"formulation: {self.formulation}\n" if self.formulation else "" + task_desc += f"architecture: {self.architecture}\n" + task_desc += f"variables: {self.variables}\n" if self.variables else "" + task_desc += f"hyperparameters: {self.hyperparameters}\n" + task_desc += f"model_type: {self.model_type}\n" + return task_desc + + @staticmethod + def from_dict(dict): + return ModelTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" + + +class ModelFBWorkspace(FBWorkspace): + """ + It is a Pytorch model implementation task; + All the things are placed in a folder. + + Folder + - data source and documents prepared by `prepare` + - Please note that new data may be passed in dynamically in `execute` + - code (file `model.py` ) injected by `inject_code` + - the `model.py` that contains a variable named `model_cls` which indicates the implemented model structure + - `model_cls` is a instance of `torch.nn.Module`; + + We support two ways of interface: + (version 1) for qlib we'll make a script to import the model in the implementation in file `model.py` after setting the cwd into the directory + - from model import model_cls + - initialize the model by initializing it `model_cls(input_dim=INPUT_DIM)` + - And then verify the model. 
+ + (version 2) for kaggle we'll make a script to call the fit and predict function in the implementation in file `model.py` after setting the cwd into the directory + """ + + def hash_func( + self, + batch_size: int = 8, + num_features: int = 10, + num_timesteps: int = 4, + num_edges: int = 20, + input_value: float = 1.0, + param_init_value: float = 1.0, + ) -> str: + target_file_name = f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}" + for code_file_name in sorted(list(self.code_dict.keys())): + target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}" + return md5_hash(target_file_name) + + @cache_with_pickle(hash_func) + def execute( + self + ): + super().execute() + try: + de = DockerEnv(DSDockerConf()) + de.prepare() + + dump_code = (Path(__file__).parent / "model_execute_template.txt").read_text() + + log, results = de.dump_python_code_run_and_get_results( + code=dump_code, + dump_file_names=["execution_feedback_str.pkl", "execution_model_output.pkl"], + local_path=str(self.workspace_path), + env={}, + code_dump_file_py_name="model_test", + ) + if results is None: + raise RuntimeError(f"Error in running the model code: {log}") + [execution_feedback_str, execution_model_output] = results + + except Exception as e: + execution_feedback_str = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" + execution_model_output = None + + if len(execution_feedback_str) > 2000: + execution_feedback_str = ( + execution_feedback_str[:1000] + "....hidden long error message...." + execution_feedback_str[-1000:] + ) + return execution_feedback_str, execution_model_output \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/model_execute_template.txt b/rdagent/components/coder/data_science/model/model_execute_template.txt new file mode 100644 index 000000000..78962c2a0 --- /dev/null +++ b/rdagent/components/coder/data_science/model/model_execute_template.txt @@ -0,0 +1,24 @@ +import os +import pickle + +import numpy as np +import pandas as pd +import torch +from model import fit, predict + +train_X = pd.DataFrame(np.random.randn(8, 30), columns=[f"{i}" for i in range(30)]) +train_y = pd.Series(np.random.randint(0, 2, 8)) +valid_X = pd.DataFrame(np.random.randn(8, 30), columns=[f"{i}" for i in range(30)]) +valid_y = pd.Series(np.random.randint(0, 2, 8)) + +model = fit(train_X, train_y, valid_X, valid_y) +execution_model_output = predict(model, valid_X) + +if isinstance(execution_model_output, torch.Tensor): + execution_model_output = execution_model_output.cpu().detach().numpy() + + +execution_feedback_str = f"Execution successful, output numpy ndarray shape: {execution_model_output.shape}" + +pickle.dump(execution_model_output, open("execution_model_output.pkl", "wb")) +pickle.dump(execution_feedback_str, open("execution_feedback_str.pkl", "wb")) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml new file mode 100644 index 000000000..cab247b35 --- /dev/null +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -0,0 +1,5 @@ +model_generator: + system: + system + user: + user \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index d8c65059a..3d0ce46c7 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -1,8 +1,33 @@ """ Generate dataset to test the model 
workflow output """ +from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.components.coder.data_science.model import ModelCoSTEER +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment +from pathlib import Path # Take tasks, spec.md and feat as input, generate a feedback as output -def DiffSpec(): +def develop_one_competition(competition: str): + scen = DataScienceScen(competition=competition) + model_coder = ModelCoSTEER(scen) - pass \ No newline at end of file + # Create the experiment + mt = ModelTask(name="ModelTask", description="", base_code="import pandas...") + exp = ModelExperiment( + sub_tasks=[mt], + ) + + tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition + injected_file_names = ["spec.md", "load_data.py", "feat01.py"] + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) + + # Run the experiment + exp = model_coder.develop(exp) + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") + # dotenv run -- python rdagent/components/coder/data_science/model/test.py \ No newline at end of file diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 5cac06335..7e9c00da9 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -41,33 +41,29 @@ class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, DataLoaderFBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = DataLoaderFBWorkspace() + self.experiment_workspace = DSFBWorkspace() class ModelExperiment(Experiment[ModelTask, DSFBWorkspace, ModelFBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = DSFBWorkspace( - template_folder_path=Path(__file__).resolve() - / Path(DS_RD_SETTING.template_path).resolve() - / DS_RD_SETTING.competition - ) - if len(self.based_experiments) > 0: - self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) - self.experiment_workspace.data_description = deepcopy( - self.based_experiments[-1].experiment_workspace.data_description - ) - else: - self.experiment_workspace.data_description = [ - ( - FactorTask( - factor_name="Original features", - factor_description="The original features", - factor_formulation="", - ).get_task_information(), - source_feature_size, - ) - ] + self.experiment_workspace = DSFBWorkspace() + # if len(self.based_experiments) > 0: + # self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + # self.experiment_workspace.data_description = deepcopy( + # self.based_experiments[-1].experiment_workspace.data_description + # ) + # else: + # self.experiment_workspace.data_description = [ + # ( + # FactorTask( + # factor_name="Original features", + # factor_description="The original features", + # factor_formulation="", + # ).get_task_information(), + # source_feature_size, + # ) + # ] class FactorExperiment(Experiment[FactorTask, DSFBWorkspace, FactorFBWorkspace]): diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py index 2da8e6be9..6e083a3eb 
100644 --- a/test/utils/coder/test_CoSTEER.py +++ b/test/utils/coder/test_CoSTEER.py @@ -28,8 +28,13 @@ def test_model(self): # 1) Build the model experiment/task/workspace from tpl_ex # 2) build an according CoSTEER # 3) test the results + from rdagent.components.coder.data_science.model.test import develop_one_competition + + exp = develop_one_competition("aerial-cactus-identification") + pass if __name__ == "__main__": unittest.main() + # pytest test/utils/coder/test_CoSTEER.py \ No newline at end of file From e0f80421f3d2c7d961d3e4c04df38300a8110a60 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 29 Nov 2024 10:09:06 +0000 Subject: [PATCH 016/304] CI --- rdagent/app/data_science/conf.py | 2 +- rdagent/app/data_science/loop.py | 3 ++- .../coder/data_science/model/__init__.py | 13 +++++++++---- .../components/coder/data_science/model/eval.py | 6 ++++-- .../components/coder/data_science/model/exp.py | 7 +++---- .../components/coder/data_science/model/test.py | 15 +++++++++------ .../data_science/raw_data_loader/__init__.py | 11 ++++++++--- .../data_science/raw_data_loader/evaluators.py | 7 ++++--- .../coder/data_science/raw_data_loader/exp.py | 7 ++++++- .../coder/data_science/raw_data_loader/test.py | 7 ++++--- rdagent/core/experiment.py | 4 +++- .../data_science/experiment/experiment.py | 15 ++++----------- .../data_science/experiment/workspace.py | 6 ++---- rdagent/scenarios/data_science/scen/scen.py | 4 ++-- rdagent/utils/env.py | 2 ++ test/utils/coder/test_CoSTEER.py | 16 ++++++++++------ 16 files changed, 73 insertions(+), 52 deletions(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 350a38759..33cf0b575 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -23,7 +23,7 @@ class DataScienceBasePropSetting(BasePropSetting): ## dev/coder data_loader_coder: str = "rdagent.components.coder.data_science.raw_data_loader.DataLoaderCoSTEER" """Data Loader CoSTEER""" - + # feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER" # """Feature Coder class""" diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 819e17510..6bde5f352 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -4,6 +4,7 @@ import fire from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import NextLoopException, RDLoop from rdagent.core.exception import FactorEmptyError, ModelEmptyError @@ -20,7 +21,7 @@ from rdagent.log.time import measure_time from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook from rdagent.scenarios.kaggle.kaggle_crawler import download_data -from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER + class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index e15692173..d9af46371 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -1,14 +1,19 @@ from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -from rdagent.core.scenario 
import Scenario -from rdagent.components.coder.CoSTEER.evolving_strategy import MultiProcessEvolvingStrategy +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) from rdagent.components.coder.data_science.model.exp import ModelTask -from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledge +from rdagent.core.scenario import Scenario # from rdagent.utils.agent.tpl import T # T(".prompts:model_generator.user").r() + class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( self, @@ -20,7 +25,7 @@ def implement_one_task( def Model(): pass """ - + def assign_code_list_to_evo(self, code_list: list, evo) -> None: """ Assign the code list to the evolving item. diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 0ae0760d6..54049ac1e 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -2,6 +2,7 @@ Beyond previous tests - """ + from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, @@ -19,6 +20,7 @@ class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): Test workflow: - Build train, valid, and test data to run it, and test the output (e.g., shape, value, etc.) """ + def evaluate( self, target_task: Task, @@ -48,7 +50,7 @@ def evaluate( class XXX2SpecEval: """ Based on XXX1SpecEval, but considering the following case: - + Motivation case: - Sometimes we don't need validation (e.g., simple models not prone to overfitting, or data is too scarce to split). @@ -77,7 +79,7 @@ class XXX4SpecEval: """ Motivation case: - After obtaining good hyperparameters, we retrain the model. - + Test workflow: - Test1: Since we have already tested it in XXX2SpecEval, we'll focus on another aspect. - Input: diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index ac93d1dd6..fa0373035 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -9,6 +9,7 @@ from rdagent.core.utils import cache_with_pickle from rdagent.oai.llm_utils import md5_hash from rdagent.utils.env import DockerEnv, DSDockerConf + # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace @@ -89,9 +90,7 @@ def hash_func( return md5_hash(target_file_name) @cache_with_pickle(hash_func) - def execute( - self - ): + def execute(self): super().execute() try: de = DockerEnv(DSDockerConf()) @@ -118,4 +117,4 @@ def execute( execution_feedback_str = ( execution_feedback_str[:1000] + "....hidden long error message...." 
+ execution_feedback_str[-1000:] ) - return execution_feedback_str, execution_model_output \ No newline at end of file + return execution_feedback_str, execution_model_output diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 3d0ce46c7..9b833d04a 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -1,33 +1,36 @@ """ Generate dataset to test the model workflow output """ -from rdagent.scenarios.data_science.scen import DataScienceScen + +from pathlib import Path + from rdagent.components.coder.data_science.model import ModelCoSTEER from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment -from pathlib import Path +from rdagent.scenarios.data_science.scen import DataScienceScen + # Take tasks, spec.md and feat as input, generate a feedback as output def develop_one_competition(competition: str): scen = DataScienceScen(competition=competition) model_coder = ModelCoSTEER(scen) - + # Create the experiment mt = ModelTask(name="ModelTask", description="", base_code="import pandas...") exp = ModelExperiment( sub_tasks=[mt], ) - + tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition injected_file_names = ["spec.md", "load_data.py", "feat01.py"] for file_name in injected_file_names: file_path = tpl_ex_path / file_name exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) - + # Run the experiment exp = model_coder.develop(exp) if __name__ == "__main__": develop_one_competition("aerial-cactus-identification") - # dotenv run -- python rdagent/components/coder/data_science/model/test.py \ No newline at end of file + # dotenv run -- python rdagent/components/coder/data_science/model/test.py diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index c8f951085..1f7ab1449 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -25,18 +25,23 @@ from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -from rdagent.components.coder.CoSTEER.evolving_strategy import MultiProcessEvolvingStrategy -from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledge +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.core.scenario import Scenario + class DataLoaderMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( self, target_task: DataLoaderTask, queried_knowledge: CoSTEERQueriedKnowledge | None = None, ) -> str: - ... # prompting + ... # prompting # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. 
diff --git a/rdagent/components/coder/data_science/raw_data_loader/evaluators.py b/rdagent/components/coder/data_science/raw_data_loader/evaluators.py index 8807a0b70..f2ac52428 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/evaluators.py +++ b/rdagent/components/coder/data_science/raw_data_loader/evaluators.py @@ -2,11 +2,12 @@ # (GPT) if it aligns with the spec & rationality of the spec. from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, + CoSTEERMultiFeedback, CoSTEERSingleFeedback, - CoSTEERMultiFeedback ) -from rdagent.core.experiment import Task, Workspace from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import Task, Workspace + class DataLoaderCoSTEEREvaluator(CoSTEEREvaluator): def evaluate( @@ -17,4 +18,4 @@ def evaluate( queried_knowledge: QueriedKnowledge = None, **kwargs, ) -> CoSTEERSingleFeedback: - \ No newline at end of file + pass \ No newline at end of file diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py index 495c8751d..a10160b9c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/exp.py +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -9,6 +9,7 @@ from rdagent.core.utils import cache_with_pickle from rdagent.oai.llm_utils import md5_hash from rdagent.utils.env import DockerEnv, DSDockerConf + # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace @@ -54,4 +55,8 @@ def execute(self): except Exception as e: pass - return "data_loader.py and spec.md executed successfully", "content of spec.md", "pkl generated by data_loader.py" \ No newline at end of file + return ( + "data_loader.py and spec.md executed successfully", + "content of spec.md", + "pkl generated by data_loader.py", + ) diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py index d106c52f1..cf705ce78 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -6,12 +6,13 @@ - it is not interface unittest(i.e. 
workspace evaluator in the CoSTEER Loop) """ -from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment +from rdagent.scenarios.data_science.scen import DataScienceScen + -def develop_one_competition(competition: str): # -> experiment +def develop_one_competition(competition: str): # -> experiment scen = DataScienceScen(competition=competition) data_loader_coder = DataLoaderCoSTEER(scen) diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index e4e363bed..d1318b6a3 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -105,7 +105,9 @@ def run_pipeline(self, **files: str): def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - self.code_dict: dict[str, Any] = {} # The code injected into the folder, store them in the variable to reproduce the former result + self.code_dict: dict[str, Any] = ( + {} + ) # The code injected into the folder, store them in the variable to reproduce the former result self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex @property diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 7e9c00da9..af5ec4203 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,20 +1,14 @@ from copy import deepcopy from pathlib import Path -from rdagent.core.experiment import Experiment from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.data_science.raw_data_loader.exp import ( DataLoaderFBWorkspace, DataLoaderTask, ) -from rdagent.components.coder.factor_coder.factor import ( - FactorFBWorkspace, - FactorTask, -) -from rdagent.components.coder.model_coder.model import ( - ModelFBWorkspace, - ModelTask, -) +from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace, FactorTask +from rdagent.components.coder.model_coder.model import ModelFBWorkspace, ModelTask +from rdagent.core.experiment import Experiment from rdagent.scenarios.data_science.experiment.workspace import DSFBWorkspace # KG_MODEL_TYPE_XGBOOST = "XGBoost" @@ -37,7 +31,6 @@ # } - class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, DataLoaderFBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -89,4 +82,4 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: ).get_task_information(), source_feature_size, ) - ] \ No newline at end of file + ] diff --git a/rdagent/scenarios/data_science/experiment/workspace.py b/rdagent/scenarios/data_science/experiment/workspace.py index ed079df89..09d0142ea 100644 --- a/rdagent/scenarios/data_science/experiment/workspace.py +++ b/rdagent/scenarios/data_science/experiment/workspace.py @@ -10,7 +10,7 @@ class DSFBWorkspace(FBWorkspace): - + # TODO: use the cache_with_pickle decorator. 
def execute(self, run_env: dict = {}, *args, **kwargs) -> pd.DataFrame: """ @@ -35,9 +35,7 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> pd.DataFrame: running_extra_volume = {} if DS_RD_SETTING.competition: - running_extra_volume = { - DS_RD_SETTING.local_data_path + "/" + DS_RD_SETTING.competition: "/kaggle/input" - } + running_extra_volume = {DS_RD_SETTING.local_data_path + "/" + DS_RD_SETTING.competition: "/kaggle/input"} else: running_extra_volume = {} diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 9e7a323f1..2be520ab6 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -96,8 +96,8 @@ def source_data(self) -> str: # phase1: # - If we have not implement load data and dump cache # - describe the raw data - return self.competition_descriptions['Data Description'] - + return self.competition_descriptions["Data Description"] + return "!!!!!!!!! I'm the fake source data !!!!!!!!" raise NotImplementedError(f"We are not sure how it is called. We place a exception here") diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index 71c1b83ae..61ce00f6a 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -187,6 +187,7 @@ class KGDockerConf(DockerConf): "48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory ) + class DSDockerConf(DockerConf): model_config = ExtendedSettingsConfigDict(env_prefix="DS_DOCKER_") @@ -200,6 +201,7 @@ class DSDockerConf(DockerConf): "48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory ) + class MLEBDockerConf(DockerConf): model_config = ExtendedSettingsConfigDict(env_prefix="MLEB_DOCKER_") diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py index 6e083a3eb..9423917c8 100644 --- a/test/utils/coder/test_CoSTEER.py +++ b/test/utils/coder/test_CoSTEER.py @@ -18,23 +18,27 @@ def test_data_loader(self): # 3) test the results # - check spec.md # - check data_loader.py - from rdagent.components.coder.data_science.raw_data_loader.test import develop_one_competition + from rdagent.components.coder.data_science.raw_data_loader.test import ( + develop_one_competition, + ) exp = develop_one_competition("aerial-cactus-identification") - + pass def test_model(self): # 1) Build the model experiment/task/workspace from tpl_ex # 2) build an according CoSTEER # 3) test the results - from rdagent.components.coder.data_science.model.test import develop_one_competition - + from rdagent.components.coder.data_science.model.test import ( + develop_one_competition, + ) + exp = develop_one_competition("aerial-cactus-identification") - + pass if __name__ == "__main__": unittest.main() - # pytest test/utils/coder/test_CoSTEER.py \ No newline at end of file + # pytest test/utils/coder/test_CoSTEER.py From a4c6845a579ff908c5a1da46775256fd62ecef7f Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Mon, 2 Dec 2024 09:31:18 +0000 Subject: [PATCH 017/304] ds_model some changes in eval and exp --- .../coder/data_science/model/__init__.py | 2 +- .../coder/data_science/model/eval.py | 66 ++++++++++++++++++- .../coder/data_science/model/exp.py | 34 ++++------ .../model/model_execute_template.txt | 63 +++++++++++------- 4 files changed, 120 insertions(+), 45 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index d9af46371..6bd8e8d69 100644 --- 
a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -44,7 +44,7 @@ def __init__( **kwargs, ) -> None: eva = CoSTEERMultiEvaluator( - # ModelCoSTEEREvaluator(scen=scen), scen=scen + ModelCoSTEEREvaluator(scen=scen), scen=scen ) # Please specify whether you agree running your eva in parallel or not es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 54049ac1e..12fd1b2e0 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -8,6 +8,15 @@ CoSTEERMultiFeedback, CoSTEERSingleFeedback, ) +from rdagent.components.coder.model_coder.eva_utils import ( + ModelCodeEvaluator, + ModelFinalEvaluator, + shape_evaluator, + value_evaluator, +) + +ModelSingleFeedback = CoSTEERSingleFeedback +ModelMultiFeedback = CoSTEERMultiFeedback # Below are unit tests for testing the specification of the implemented model ------------------ @@ -46,6 +55,61 @@ def evaluate( ) assert isinstance(target_task, ModelTask) + assert isinstance(implementation, ModelFBWorkspace) + model_execution_feedback, val_pred_array, test_pred_array = implementation.execute( + # Parameters? + ) + # ignore gt_implementation + gt_np_array = None + + # TODO: auto specify shape for other task types + # will spec.md provide the needed shape? the below code still only support 2d-output + batch_size = 8 + num_classes = self.scen.model_output_channel if hasattr(self.scen, "model_output_channel") else 1 + # TODO: num_class may not be specified in data description. Maybe shape evaluate is not necessary. + shape_feedback = "" + expected_val_shape = (batch_size, num_classes) + expected_test_shape = (batch_size, num_classes) + val_shape_feedback, val_shape_decision = shape_evaluator( + val_pred_array, + expected_val_shape, + ) + shape_feedback += f"Validation Output: {val_shape_feedback}\n" + test_shape_feedback, test_shape_decision = shape_evaluator( + test_pred_array, + expected_test_shape, + ) + shape_feedback += f"Test Output: {test_shape_feedback}\n" + # value feedback necessary? + value_feedback = "" + code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( + target_task=target_task, + implementation=implementation, + gt_implementation=gt_implementation, + model_execution_feedback=model_execution_feedback, + model_value_feedback="\n".join([shape_feedback, value_feedback]), + ) + final_feedback, final_decision = ModelFinalEvaluator(scen=self.scen).evaluate( + target_task=target_task, + implementation=implementation, + gt_implementation=gt_implementation, + model_execution_feedback=model_execution_feedback, + model_value_feedback=value_feedback, + model_code_feedback=code_feedback, + ) + + return ModelSingleFeedback( + execution_feedback=model_execution_feedback, + shape_feedback=shape_feedback, + value_feedback=value_feedback, + code_feedback=code_feedback, + final_feedback=final_feedback, + final_decision=final_decision, + # value_generated_flag=(gen_np_array is not None), + value_generated_flag=(val_pred_array is not None and test_pred_array is not None), + final_decision_based_on_gt=(gt_implementation is not None), + ) + class XXX2SpecEval: """ @@ -89,4 +153,4 @@ class XXX4SpecEval: - Test2: Ensure the hyperparameters are 1) being used, and 2) the model remains stable. 
- Different hyperparameters will yield different results - Same hyperparameters will yield the same results - """ + """ \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index fa0373035..5169d318c 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -12,7 +12,6 @@ # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace - class ModelTask(CoSTEERTask): def __init__( self, @@ -31,7 +30,8 @@ def __init__( self.variables: str = variables self.hyperparameters: str = hyperparameters self.model_type: str = ( - model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model + model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model + # TODO: More Models Supported ) super().__init__(name=name, description=description, *args, **kwargs) @@ -53,7 +53,6 @@ def from_dict(dict): def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" - class ModelFBWorkspace(FBWorkspace): """ It is a Pytorch model implementation task; @@ -75,32 +74,23 @@ class ModelFBWorkspace(FBWorkspace): (version 2) for kaggle we'll make a script to call the fit and predict function in the implementation in file `model.py` after setting the cwd into the directory """ - def hash_func( - self, - batch_size: int = 8, - num_features: int = 10, - num_timesteps: int = 4, - num_edges: int = 20, - input_value: float = 1.0, - param_init_value: float = 1.0, - ) -> str: - target_file_name = f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}" - for code_file_name in sorted(list(self.code_dict.keys())): - target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}" - return md5_hash(target_file_name) - - @cache_with_pickle(hash_func) def execute(self): super().execute() try: de = DockerEnv(DSDockerConf()) de.prepare() + np.save(os.path.join(self.workspace_path, "train_X.npy"), train_X) + np.save(os.path.join(self.workspace_path, "train_y.npy"), train_y) + np.save(os.path.join(self.workspace_path, "val_X.npy"), val_X) + np.save(os.path.join(self.workspace_path, "val_y.npy"), val_y) + np.save(os.path.join(self.workspace_path, "test_X.npy"), test_X) + # TODO: generate dataset automatically dump_code = (Path(__file__).parent / "model_execute_template.txt").read_text() log, results = de.dump_python_code_run_and_get_results( code=dump_code, - dump_file_names=["execution_feedback_str.pkl", "execution_model_output.pkl"], + dump_file_names=["execution_feedback_str.pkl", "val_pred.pkl", "test_pred.pkl"], local_path=str(self.workspace_path), env={}, code_dump_file_py_name="model_test", @@ -111,10 +101,12 @@ def execute(self): except Exception as e: execution_feedback_str = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" - execution_model_output = None + val_pred_array = None + test_pred_array = None if len(execution_feedback_str) > 2000: execution_feedback_str = ( execution_feedback_str[:1000] + "....hidden long error message...." 
+ execution_feedback_str[-1000:] ) - return execution_feedback_str, execution_model_output + return execution_feedback_str, val_pred_array, test_pred_array + \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/model_execute_template.txt b/rdagent/components/coder/data_science/model/model_execute_template.txt index 78962c2a0..191b8fceb 100644 --- a/rdagent/components/coder/data_science/model/model_execute_template.txt +++ b/rdagent/components/coder/data_science/model/model_execute_template.txt @@ -1,24 +1,43 @@ -import os -import pickle +""" +adapt for cv models +""" -import numpy as np -import pandas as pd -import torch -from model import fit, predict +import os +import pickle +import traceback + +import numpy as np +from model import model_workflow + +train_X = np.load("train_X.npy") +train_y = np.load("train_y.npy") +val_X = np.load("val_X.npy") +val_y = np.load("val_y.npy") +test_X = np.load("test_X.npy") + -train_X = pd.DataFrame(np.random.randn(8, 30), columns=[f"{i}" for i in range(30)]) -train_y = pd.Series(np.random.randint(0, 2, 8)) -valid_X = pd.DataFrame(np.random.randn(8, 30), columns=[f"{i}" for i in range(30)]) -valid_y = pd.Series(np.random.randint(0, 2, 8)) - -model = fit(train_X, train_y, valid_X, valid_y) -execution_model_output = predict(model, valid_X) - -if isinstance(execution_model_output, torch.Tensor): - execution_model_output = execution_model_output.cpu().detach().numpy() - - -execution_feedback_str = f"Execution successful, output numpy ndarray shape: {execution_model_output.shape}" - -pickle.dump(execution_model_output, open("execution_model_output.pkl", "wb")) -pickle.dump(execution_feedback_str, open("execution_feedback_str.pkl", "wb")) +# Call model_workflow +val_pred, test_pred, updated_hyper_params = model_workflow( + X=train_X, + y=train_y, + val_X=val_X, + val_y=val_y, + test_X=test_X, + hyper_params={} +) + +execution_feedback_str = "Execution successful.\n" +if val_pred is not None: + execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" +else: + execution_feedback_str += "Validation predictions are None.\n" +if test_pred is not None: + execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" +else: + execution_feedback_str += "Test predictions are None.\n" + +# Save the outputs +pickle.dump(val_pred, open("val_pred.pkl", "wb")) +pickle.dump(test_pred, open("test_pred.pkl", "wb")) +pickle.dump(execution_feedback_str, open("execution_feedback_str.pkl", "wb")) + From f1003b84816eb7c9fb3940f305676b992d4be1e9 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Tue, 3 Dec 2024 05:57:01 +0000 Subject: [PATCH 018/304] annotation --- .../coder/data_science/model/exp.py | 20 ------------------- .../components/coder/model_coder/eva_utils.py | 2 +- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index 5169d318c..c58e2db6e 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -54,26 +54,6 @@ def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" class ModelFBWorkspace(FBWorkspace): - """ - It is a Pytorch model implementation task; - All the things are placed in a folder. 
- - Folder - - data source and documents prepared by `prepare` - - Please note that new data may be passed in dynamically in `execute` - - code (file `model.py` ) injected by `inject_code` - - the `model.py` that contains a variable named `model_cls` which indicates the implemented model structure - - `model_cls` is a instance of `torch.nn.Module`; - - We support two ways of interface: - (version 1) for qlib we'll make a script to import the model in the implementation in file `model.py` after setting the cwd into the directory - - from model import model_cls - - initialize the model by initializing it `model_cls(input_dim=INPUT_DIM)` - - And then verify the model. - - (version 2) for kaggle we'll make a script to call the fit and predict function in the implementation in file `model.py` after setting the cwd into the directory - """ - def execute(self): super().execute() try: diff --git a/rdagent/components/coder/model_coder/eva_utils.py b/rdagent/components/coder/model_coder/eva_utils.py index 8b2869d8c..f333de193 100644 --- a/rdagent/components/coder/model_coder/eva_utils.py +++ b/rdagent/components/coder/model_coder/eva_utils.py @@ -14,7 +14,7 @@ evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - +# This shape evaluator is also used in data_science def shape_evaluator(prediction: np.ndarray, target_shape: Tuple = None) -> Tuple[str, bool]: if target_shape is None or prediction is None: return ( From 6d30f572b4701fea0608dd0c4a67bb53a64d4ab7 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Tue, 3 Dec 2024 06:40:35 +0000 Subject: [PATCH 019/304] changes in model coder final evaluate --- rdagent/components/coder/model_coder/eva_utils.py | 2 ++ rdagent/components/coder/model_coder/evaluators.py | 1 + rdagent/components/coder/model_coder/prompts.yaml | 2 ++ 3 files changed, 5 insertions(+) diff --git a/rdagent/components/coder/model_coder/eva_utils.py b/rdagent/components/coder/model_coder/eva_utils.py index f333de193..4ee17ffac 100644 --- a/rdagent/components/coder/model_coder/eva_utils.py +++ b/rdagent/components/coder/model_coder/eva_utils.py @@ -123,6 +123,7 @@ def evaluate( implementation: Workspace, gt_implementation: Workspace, model_execution_feedback: str, + model_shape_feedback: str, model_value_feedback: str, model_code_feedback: str, ): @@ -154,6 +155,7 @@ def evaluate( .render( model_information=target_task.get_task_information(), model_execution_feedback=execution_feedback_to_render, + model_shape_feedback=model_shape_feedback, model_code_feedback=model_code_feedback, model_value_feedback=model_value_feedback, ) diff --git a/rdagent/components/coder/model_coder/evaluators.py b/rdagent/components/coder/model_coder/evaluators.py index 926c2e6a9..a311ded81 100644 --- a/rdagent/components/coder/model_coder/evaluators.py +++ b/rdagent/components/coder/model_coder/evaluators.py @@ -87,6 +87,7 @@ def evaluate( implementation=implementation, gt_implementation=gt_implementation, model_execution_feedback=model_execution_feedback, + model_shape_feedback=shape_feedback, model_value_feedback=value_feedback, model_code_feedback=code_feedback, ) diff --git a/rdagent/components/coder/model_coder/prompts.yaml b/rdagent/components/coder/model_coder/prompts.yaml index fa6d0212a..8742bd26b 100644 --- a/rdagent/components/coder/model_coder/prompts.yaml +++ b/rdagent/components/coder/model_coder/prompts.yaml @@ -160,6 +160,8 @@ evaluator_final_feedback: {{ model_information }} --------------Model Execution feedback:--------------- {{ model_execution_feedback }} + 
--------------Model shape feedback:--------------- + {{ model_shape_feedback }} --------------Model Code feedback:--------------- {{ model_code_feedback }} --------------Model value feedback:--------------- From 33e9a00e56ebd44f1b1f72a05595003444b3130f Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Thu, 5 Dec 2024 09:38:50 +0000 Subject: [PATCH 020/304] ds model test init --- .../coder/data_science/model/__init__.py | 6 +- .../coder/data_science/model/eva_utils.py | 152 +++++++++++++++ .../coder/data_science/model/eval.py | 31 ++-- .../coder/data_science/model/exp.py | 21 +-- .../model/model_execute_template.txt | 25 ++- .../coder/data_science/model/prompts.yaml | 173 +++++++++++++++++- .../coder/data_science/model/test.py | 27 ++- .../aerial-cactus-identification/model01.py | 52 ++++-- 8 files changed, 420 insertions(+), 67 deletions(-) create mode 100644 rdagent/components/coder/data_science/model/eva_utils.py diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 6bd8e8d69..1b11cde75 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -4,6 +4,7 @@ from rdagent.components.coder.CoSTEER.evolving_strategy import ( MultiProcessEvolvingStrategy, ) +from rdagent.components.coder.data_science.model.eval import ModelGeneralCaseSpecEvaluator from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) @@ -44,8 +45,9 @@ def __init__( **kwargs, ) -> None: eva = CoSTEERMultiEvaluator( - ModelCoSTEEREvaluator(scen=scen), scen=scen + ModelGeneralCaseSpecEvaluator(scen=scen), scen=scen ) # Please specify whether you agree running your eva in parallel or not + # eva = ModelGeneralCaseSpecEvaluator(scen=scen) es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/model/eva_utils.py b/rdagent/components/coder/data_science/model/eva_utils.py new file mode 100644 index 000000000..5554e1cd4 --- /dev/null +++ b/rdagent/components/coder/data_science/model/eva_utils.py @@ -0,0 +1,152 @@ +import json +from pathlib import Path +from typing import Tuple + +import numpy as np +from jinja2 import Environment, StrictUndefined + +from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask +from rdagent.core.evaluation import Evaluator +from rdagent.core.experiment import Task, Workspace +from rdagent.core.prompts import Prompts +from rdagent.oai.llm_conf import LLM_SETTINGS +from rdagent.oai.llm_utils import APIBackend + +evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + +class ModelCodeEvaluator(Evaluator): + def evaluate( + self, + target_task: Task, + implementation: Workspace, + gt_implementation: Workspace, + model_execution_feedback: str = "", + model_value_feedback: str = "", + ): + assert isinstance(target_task, ModelTask) + assert isinstance(implementation, ModelFBWorkspace) + if gt_implementation is not None: + assert isinstance(gt_implementation, ModelFBWorkspace) + + model_task_information = target_task.get_task_information() + code = implementation.code + + system_prompt = ( + Environment(undefined=StrictUndefined) + 
.from_string(evaluate_prompts["evaluator_code_feedback"]["system"]) + .render( + scenario=( + # self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type) + # if self.scen is not None + # else "No scenario description." + "No scenario description." + ) + ) + ) + + execution_feedback_to_render = model_execution_feedback + for _ in range(10): # 10 times to split the content is enough + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + evaluate_prompts["evaluator_code_feedback"]["user"], + ) + .render( + model_information=model_task_information, + code=code, + model_execution_feedback=execution_feedback_to_render, + model_value_feedback=model_value_feedback, + gt_code=gt_implementation.code if gt_implementation else None, + ) + ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > LLM_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break + + critic_response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False, + ) + + return critic_response, None + + +class ModelFinalEvaluator(Evaluator): + def evaluate( + self, + target_task: Task, + implementation: Workspace, + gt_implementation: Workspace, + model_execution_feedback: str, + model_shape_feedback: str, + model_value_feedback: str, + model_code_feedback: str, + ): + assert isinstance(target_task, ModelTask) + assert isinstance(implementation, ModelFBWorkspace) + if gt_implementation is not None: + assert isinstance(gt_implementation, ModelFBWorkspace) + + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(evaluate_prompts["evaluator_final_feedback"]["system"]) + .render( + scenario=( + # self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type) + # if self.scen is not None + # else "No scenario description." + "No scenario description." 
+ ) + ) + ) + + execution_feedback_to_render = model_execution_feedback + + for _ in range(10): # 10 times to split the content is enough + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + evaluate_prompts["evaluator_final_feedback"]["user"], + ) + .render( + model_information=target_task.get_task_information(), + model_execution_feedback=execution_feedback_to_render, + model_shape_feedback=model_shape_feedback, + model_code_feedback=model_code_feedback, + model_value_feedback=model_value_feedback, + ) + ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > LLM_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break + + final_evaluation_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True, + ), + ) + if isinstance(final_evaluation_dict["final_decision"], str) and final_evaluation_dict[ + "final_decision" + ].lower() in ("true", "false"): + final_evaluation_dict["final_decision"] = bool(final_evaluation_dict["final_decision"]) + return ( + final_evaluation_dict["final_feedback"], + final_evaluation_dict["final_decision"], + ) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 12fd1b2e0..56f83505f 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -8,12 +8,14 @@ CoSTEERMultiFeedback, CoSTEERSingleFeedback, ) -from rdagent.components.coder.model_coder.eva_utils import ( +from rdagent.components.coder.data_science.model.eva_utils import ( ModelCodeEvaluator, ModelFinalEvaluator, - shape_evaluator, - value_evaluator, ) +from rdagent.components.coder.model_coder.eva_utils import shape_evaluator +from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import Task, Workspace ModelSingleFeedback = CoSTEERSingleFeedback ModelMultiFeedback = CoSTEERMultiFeedback @@ -38,7 +40,7 @@ def evaluate( queried_knowledge: QueriedKnowledge = None, **kwargs, ) -> ModelSingleFeedback: - target_task_information = target_task.get_task_information() + # target_task_information = target_task.get_task_information() if ( queried_knowledge is not None and target_task_information in queried_knowledge.success_task_to_knowledge_dict @@ -53,20 +55,20 @@ def evaluate( final_feedback="This task has failed too many times, skip implementation.", final_decision=False, ) - assert isinstance(target_task, ModelTask) - + # assert isinstance(target_task, ModelTask) + + batch_size = 8 assert isinstance(implementation, ModelFBWorkspace) - model_execution_feedback, val_pred_array, test_pred_array = implementation.execute( - # Parameters? + model_execution_feedback, pred_list= implementation.execute( + batch_size=batch_size, ) + val_pred_array, test_pred_array = pred_list # ignore gt_implementation gt_np_array = None - # TODO: auto specify shape for other task types - # will spec.md provide the needed shape? the below code still only support 2d-output - batch_size = 8 + # TODO: auto specify shape from spec.md using GPT + num_classes = self.scen.model_output_channel if hasattr(self.scen, "model_output_channel") else 1 - # TODO: num_class may not be specified in data description. 
Maybe shape evaluate is not necessary. shape_feedback = "" expected_val_shape = (batch_size, num_classes) expected_test_shape = (batch_size, num_classes) @@ -80,8 +82,7 @@ def evaluate( expected_test_shape, ) shape_feedback += f"Test Output: {test_shape_feedback}\n" - # value feedback necessary? - value_feedback = "" + value_feedback = "The value feedback is passed, and the value decision is true." code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( target_task=target_task, implementation=implementation, @@ -94,6 +95,7 @@ def evaluate( implementation=implementation, gt_implementation=gt_implementation, model_execution_feedback=model_execution_feedback, + model_shape_feedback=shape_feedback, model_value_feedback=value_feedback, model_code_feedback=code_feedback, ) @@ -105,7 +107,6 @@ def evaluate( code_feedback=code_feedback, final_feedback=final_feedback, final_decision=final_decision, - # value_generated_flag=(gen_np_array is not None), value_generated_flag=(val_pred_array is not None and test_pred_array is not None), final_decision_based_on_gt=(gt_implementation is not None), ) diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index c58e2db6e..5442fcdd3 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -54,39 +54,38 @@ def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" class ModelFBWorkspace(FBWorkspace): - def execute(self): + def execute( + self, + batch_size: int = 8, + ): super().execute() try: de = DockerEnv(DSDockerConf()) de.prepare() - np.save(os.path.join(self.workspace_path, "train_X.npy"), train_X) - np.save(os.path.join(self.workspace_path, "train_y.npy"), train_y) - np.save(os.path.join(self.workspace_path, "val_X.npy"), val_X) - np.save(os.path.join(self.workspace_path, "val_y.npy"), val_y) - np.save(os.path.join(self.workspace_path, "test_X.npy"), test_X) + + # self.code_dict["spec.md"] # TODO: generate dataset automatically dump_code = (Path(__file__).parent / "model_execute_template.txt").read_text() log, results = de.dump_python_code_run_and_get_results( code=dump_code, - dump_file_names=["execution_feedback_str.pkl", "val_pred.pkl", "test_pred.pkl"], + dump_file_names=["execution_feedback_str.pkl", "pred_list.pkl"], local_path=str(self.workspace_path), env={}, code_dump_file_py_name="model_test", ) if results is None: raise RuntimeError(f"Error in running the model code: {log}") - [execution_feedback_str, execution_model_output] = results + [execution_feedback_str, pred_list] = results except Exception as e: execution_feedback_str = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" - val_pred_array = None - test_pred_array = None + pred_list = None if len(execution_feedback_str) > 2000: execution_feedback_str = ( execution_feedback_str[:1000] + "....hidden long error message...." 
+ execution_feedback_str[-1000:] ) - return execution_feedback_str, val_pred_array, test_pred_array + return execution_feedback_str, pred_list \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/model_execute_template.txt b/rdagent/components/coder/data_science/model/model_execute_template.txt index 191b8fceb..a86ad17a8 100644 --- a/rdagent/components/coder/data_science/model/model_execute_template.txt +++ b/rdagent/components/coder/data_science/model/model_execute_template.txt @@ -7,22 +7,27 @@ import pickle import traceback import numpy as np -from model import model_workflow +from model01 import model_workflow -train_X = np.load("train_X.npy") -train_y = np.load("train_y.npy") -val_X = np.load("val_X.npy") -val_y = np.load("val_y.npy") -test_X = np.load("test_X.npy") +# train_X = np.load("train_X.npy") +# train_y = np.load("train_y.npy") +# val_X = np.load("val_X.npy") +# val_y = np.load("val_y.npy") +# test_X = np.load("test_X.npy") +train_X = np.random.rand(8, 64, 64, 3) +train_y = np.random.rand(8, 1) +val_X = np.random.rand(8, 64, 64, 3) +val_y = np.random.rand(8, 1) +test_X = np.random.rand(8, 64, 64, 3) # Call model_workflow -val_pred, test_pred, updated_hyper_params = model_workflow( +val_pred, test_pred = model_workflow( X=train_X, y=train_y, val_X=val_X, val_y=val_y, - test_X=test_X, + test_X=None, hyper_params={} ) @@ -37,7 +42,7 @@ else: execution_feedback_str += "Test predictions are None.\n" # Save the outputs -pickle.dump(val_pred, open("val_pred.pkl", "wb")) -pickle.dump(test_pred, open("test_pred.pkl", "wb")) +pred_list = [val_pred, test_pred] +pickle.dump(pred_list, open("pred_list.pkl", "wb")) pickle.dump(execution_feedback_str, open("execution_feedback_str.pkl", "wb")) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index cab247b35..8742bd26b 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -1,5 +1,168 @@ -model_generator: - system: - system - user: - user \ No newline at end of file +extract_model_formulation_system: |- + offer description of the proposed model in this paper, write a latex formula with variable as well as the architecture of the model. the format should be like + { + "model_name (The name of the model)": { + "description": "A detailed description of the model", + "formulation": "A LaTeX formula representing the model's formulation", + "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", + "variables": { + "\\hat{y}_u": "The predicted output for node u", + "variable_name_2": "Description of variable 2", + "variable_name_3": "Description of variable 3" + }, + "hyperparameters": { + "hyperparameter_name_1": "value of hyperparameter 1", + "hyperparameter_name_2": "value of hyperparameter 2", + "hyperparameter_name_3": "value of hyperparameter 3" + }, + "model_type": "Tabular or TimeSeries or Graph or XGBoost" # Should be one of "Tabular", "TimeSeries", "Graph", or "XGBoost" + } + } + Eg. 
+ { + "ABC Model": { + "description": "A detailed description of the model", + "formulation": "A LaTeX formula representing the model's formulation", + "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", + "variables": { + "\\hat{y}_u": "The predicted output for node u", + "variable_name_2": "Description of variable 2", + "variable_name_3": "Description of variable 3" + }, + "hyperparameters": { + "hyperparameter_name_1": "value of hyperparameter 1", + "hyperparameter_name_2": "value of hyperparameter 2", + "hyperparameter_name_3": "value of hyperparameter 3" + }, + "model_type": "Tabular or TimeSeries or Graph or RandomForest or XGBoost" # If torch & Neural network models are required, the choice should be one of "Tabular", "TimeSeries", or "Graph" + } + } + such format content should be begin with ```json and end with ``` and the content should be in json format. + +evolving_strategy_model_coder: + system: |- + User is trying to implement some pytorch models in the following scenario: + {{ scenario }} + Your code is expected to align the scenario in any form which means The user needs to get the prediction of the model based on the input data. + + To help you write the correct code, the user might provide multiple information that helps you write the correct code: + 1. The user might provide you the correct code to similar models. Your should learn from these code to write the correct code. + 2. The user might provide you the failed former code and the corresponding feedback to the code. The feedback contains to the execution, the code and the model output value. You should analyze the feedback and try to correct the latest code. + 3. The user might provide you the suggestion to the latest fail code and some similar fail to correct pairs. Each pair contains the fail code with similar error and the corresponding corrected version code. You should learn from these suggestion to write the correct code. + + Your must write your code based on your former latest attempt below which consists of your former code and code feedback, you should read the former attempt carefully and must not modify the right part of your former code. + + {% if current_code is not none %} + User has write some code before. You should write the new code based on this code. Here is the latest code: + ```python + {{ current_code }} + ``` + Your code should be very similar to the former code which means your code should be ninety more percent same as the former code! You should not modify the right part of the code. + {% else %} + User has not write any code before. You should write the new code from scratch. + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Your former latest attempt:--------------- + =====Code to the former implementation===== + {{ queried_former_failed_knowledge[-1].implementation.code }} + =====Feedback to the former implementation===== + {{ queried_former_failed_knowledge[-1].feedback }} + {% endif %} + + Please response the code in the following json format. Here is an example structure for the JSON output: + { + "code": "The Python code as a string." 
+ } + + user: |- + --------------Target model information:--------------- + {{ model_information_str }} + + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Correct code to similar models:--------------- + {% for similar_successful_knowledge in queried_similar_successful_knowledge %} + =====Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Former failed code:--------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} + =====Code to implementation {{ loop.index }}===== + {{ former_failed_knowledge.implementation.code }} + =====Feedback to implementation {{ loop.index }}===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + +evaluator_code_feedback: + system: |- + User is trying to implement some models in the following scenario: + {{ scenario }} + User will provide you the information of the model. + + Your job is to check whether user's code is align with the model information and the scenario. + The user will provide the source python code and the execution error message if execution failed. + The user might provide you the ground truth code for you to provide the critic. You should not leak the ground truth code to the user in any form but you can use it to provide the critic. + + User has also compared the output generated by the user's code and the ground truth code. The user will provide you some analysis results comparing two output. You may find some error in the code which caused the difference between the two output. + + If the ground truth code is provided, your critic should only consider checking whether the user's code is align with the ground truth code since the ground truth is definitely correct. + If the ground truth code is not provided, your critic should consider checking whether the user's code is reasonable and correct to the description and to the scenario. + + Notice that your critics are not for user to debug the code. They are sent to the coding agent to correct the code. So don't give any following items for the user to check like "Please check the code line XXX". + + You suggestion should not include any code, just some clear and short suggestions. Please point out very critical issues in your response, ignore non-important issues to avoid confusion. If no big issue found in the code, you can response "No critics found". + + You should provide the suggestion to each of your critic to help the user improve the code. Please response the critic in the following format. 
Here is an example structure for the output: + critic 1: The critic message to critic 1 + critic 2: The critic message to critic 2 + + user: |- + --------------Model information:--------------- + {{ model_information }} + --------------Python code:--------------- + {{ code }} + --------------Execution feedback:--------------- + {{ model_execution_feedback }} + {% if model_value_feedback is not none %} + --------------Model value feedback:--------------- + {{ model_value_feedback }} + {% endif %} + {% if gt_code is not none %} + --------------Ground truth Python code:--------------- + {{ gt_code }} + {% endif %} + + +evaluator_final_feedback: + system: |- + User is trying to implement a model in the following scenario: + {{ scenario }} + User has finished evaluation and got some feedback from the evaluator. + The evaluator run the code and get the output and provide several feedback regarding user's code and code output. You should analyze the feedback and considering the scenario and model description to give a final decision about the evaluation result. The final decision concludes whether the model is implemented correctly and if not, detail feedback containing reason and suggestion if the final decision is False. + + The implementation final decision is considered in the following logic: + 1. If the value and the ground truth value are exactly the same under a small tolerance, the implementation is considered correct. + 2. If no ground truth value is not provided, the implementation is considered correct if the code execution is successful and the code feedback is align with the scenario and model description. + + Please response the critic in the json format. Here is an example structure for the JSON output, please strictly follow the format: + { + "final_decision": True, + "final_feedback": "The final feedback message", + } + user: |- + --------------Model information:--------------- + {{ model_information }} + --------------Model Execution feedback:--------------- + {{ model_execution_feedback }} + --------------Model shape feedback:--------------- + {{ model_shape_feedback }} + --------------Model Code feedback:--------------- + {{ model_code_feedback }} + --------------Model value feedback:--------------- + {{ model_value_feedback }} \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 9b833d04a..b2218d4b2 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -5,7 +5,9 @@ from pathlib import Path from rdagent.components.coder.data_science.model import ModelCoSTEER -from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.components.coder.data_science.model.eval import ModelGeneralCaseSpecEvaluator + +from rdagent.components.coder.data_science.model.exp import ModelTask, ModelFBWorkspace from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment from rdagent.scenarios.data_science.scen import DataScienceScen @@ -16,19 +18,36 @@ def develop_one_competition(competition: str): model_coder = ModelCoSTEER(scen) # Create the experiment - mt = ModelTask(name="ModelTask", description="", base_code="import pandas...") + mt = ModelTask( + name="ModelTask", + description="A CNN Model", + architecture="\hat{y}_u = CNN(X_u)", + variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", + hyperparameters="...", + 
base_code="import pandas...", + ) exp = ModelExperiment( sub_tasks=[mt], ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition - injected_file_names = ["spec.md", "load_data.py", "feat01.py"] + injected_file_names = ["spec.md", "load_data.py", "feat01.py", "model01.py"] for file_name in injected_file_names: file_path = tpl_ex_path / file_name exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) # Run the experiment - exp = model_coder.develop(exp) + # exp = model_coder.develop(exp) + # test the evaluator + eva = ModelGeneralCaseSpecEvaluator(scen=scen) + modelexp = ModelFBWorkspace() + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + modelexp.inject_code(**{file_name: file_path.read_text()}) + + exp.feedback = eva.evaluate(target_task=mt, queried_knowledge=None, implementation=modelexp, gt_implementation=None) + print("hello world") + print(exp.feedback) if __name__ == "__main__": diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py index 3cb54dfb6..34ebee258 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py @@ -62,9 +62,6 @@ def model_workflow( train_datagen = ImageDataGenerator(rescale=1.0 / 255, horizontal_flip=True, vertical_flip=True) train_generator = train_datagen.flow(train_images, train_labels, batch_size=batch_size, shuffle=True) - validation_datagen = ImageDataGenerator(rescale=1.0 / 255) - validation_generator = validation_datagen.flow(validation_images, validation_labels, batch_size=batch_size) - # Get input shape from the training data input_shape = X.shape[1:] num_classes = hyper_params.get("num_classes", 2) @@ -122,20 +119,35 @@ def model_workflow( # Training epochs = hyper_params.get("epochs", 100) - history = model.fit( - train_generator, - validation_data=validation_generator, - epochs=epochs, - verbose=1, - shuffle=True, - callbacks=callbacks, - ) - # Predict on validation data - val_pred = model.predict(validation_datagen.flow(validation_images, batch_size=1, shuffle=False), verbose=1) - - # Load the test data and evaluate the model - test_datagen = ImageDataGenerator(rescale=1.0 / 255) - test_generator = test_datagen.flow(test_images, batch_size=1, shuffle=False) - - test_pred = model.predict(test_generator, verbose=1) - return val_pred, test_pred + if val_X is not None and val_y is not None: + validation_datagen = ImageDataGenerator(rescale=1.0 / 255) + validation_generator = validation_datagen.flow(validation_images, validation_labels, batch_size=batch_size) + history = model.fit( + train_generator, + validation_data=validation_generator, + epochs=epochs, + verbose=1, + shuffle=True, + callbacks=callbacks, + ) + # Predict on validation data + val_pred = model.predict(validation_datagen.flow(validation_images, batch_size=1, shuffle=False), verbose=1) + else: + history = model.fit( + train_generator, + epochs=epochs, + verbose=1, + shuffle=True, + callbacks=callbacks, + ) + val_pred = None + + # Predict on test data + if test_X is not None: + test_datagen = ImageDataGenerator(rescale=1.0 / 255) + test_generator = test_datagen.flow(test_images, batch_size=1, shuffle=False) + test_pred = model.predict(test_generator, verbose=1) + else: + test_pred = None + + return val_pred, test_pred \ No newline at end of file From 
d04b2809e6d2790abd86537f60425a252aeb0c73 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Thu, 5 Dec 2024 09:58:14 +0000 Subject: [PATCH 021/304] remove value detection from data_science model evaluator --- .../coder/data_science/model/eva_utils.py | 12 +----------- .../components/coder/data_science/model/eval.py | 16 +++++----------- .../model/model_execute_template.txt | 2 +- .../coder/data_science/model/prompts.yaml | 12 +----------- 4 files changed, 8 insertions(+), 34 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eva_utils.py b/rdagent/components/coder/data_science/model/eva_utils.py index 5554e1cd4..b2f0e347f 100644 --- a/rdagent/components/coder/data_science/model/eva_utils.py +++ b/rdagent/components/coder/data_science/model/eva_utils.py @@ -19,14 +19,10 @@ def evaluate( self, target_task: Task, implementation: Workspace, - gt_implementation: Workspace, model_execution_feedback: str = "", - model_value_feedback: str = "", ): assert isinstance(target_task, ModelTask) assert isinstance(implementation, ModelFBWorkspace) - if gt_implementation is not None: - assert isinstance(gt_implementation, ModelFBWorkspace) model_task_information = target_task.get_task_information() code = implementation.code @@ -55,8 +51,6 @@ def evaluate( model_information=model_task_information, code=code, model_execution_feedback=execution_feedback_to_render, - model_value_feedback=model_value_feedback, - gt_code=gt_implementation.code if gt_implementation else None, ) ) if ( @@ -84,16 +78,13 @@ def evaluate( self, target_task: Task, implementation: Workspace, - gt_implementation: Workspace, model_execution_feedback: str, model_shape_feedback: str, - model_value_feedback: str, model_code_feedback: str, ): assert isinstance(target_task, ModelTask) assert isinstance(implementation, ModelFBWorkspace) - if gt_implementation is not None: - assert isinstance(gt_implementation, ModelFBWorkspace) + system_prompt = ( Environment(undefined=StrictUndefined) @@ -121,7 +112,6 @@ def evaluate( model_execution_feedback=execution_feedback_to_render, model_shape_feedback=model_shape_feedback, model_code_feedback=model_code_feedback, - model_value_feedback=model_value_feedback, ) ) if ( diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 56f83505f..b9c135215 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -29,7 +29,7 @@ class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): - Simplest case, we already split the data into train_data, valid_data, and test_data. We require the model to learn (optionally validate on valid data), and infer on test data. Test workflow: - - Build train, valid, and test data to run it, and test the output (e.g., shape, value, etc.) + - Build train, valid, and test data to run it, and test the output (e.g., shape, etc.) """ def evaluate( @@ -63,8 +63,6 @@ def evaluate( batch_size=batch_size, ) val_pred_array, test_pred_array = pred_list - # ignore gt_implementation - gt_np_array = None # TODO: auto specify shape from spec.md using GPT @@ -82,21 +80,17 @@ def evaluate( expected_test_shape, ) shape_feedback += f"Test Output: {test_shape_feedback}\n" - value_feedback = "The value feedback is passed, and the value decision is true." + value_feedback = "The value feedback is ignored, and the value decision is automatically set as true." 
code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( target_task=target_task, implementation=implementation, - gt_implementation=gt_implementation, model_execution_feedback=model_execution_feedback, - model_value_feedback="\n".join([shape_feedback, value_feedback]), ) final_feedback, final_decision = ModelFinalEvaluator(scen=self.scen).evaluate( target_task=target_task, implementation=implementation, - gt_implementation=gt_implementation, model_execution_feedback=model_execution_feedback, model_shape_feedback=shape_feedback, - model_value_feedback=value_feedback, model_code_feedback=code_feedback, ) @@ -107,8 +101,8 @@ def evaluate( code_feedback=code_feedback, final_feedback=final_feedback, final_decision=final_decision, - value_generated_flag=(val_pred_array is not None and test_pred_array is not None), - final_decision_based_on_gt=(gt_implementation is not None), + value_generated_flag=(val_pred_array is not None and test_pred_array is not None), + final_decision_based_on_gt=False, ) @@ -120,7 +114,7 @@ class XXX2SpecEval: - Sometimes we don't need validation (e.g., simple models not prone to overfitting, or data is too scarce to split). Test workflow: - - Build train and test data to run it, and test the output (e.g., shape, value, etc.) + - Build train and test data to run it, and test the output (e.g., shape, etc.) - valid_data == None """ diff --git a/rdagent/components/coder/data_science/model/model_execute_template.txt b/rdagent/components/coder/data_science/model/model_execute_template.txt index a86ad17a8..fd4668ee1 100644 --- a/rdagent/components/coder/data_science/model/model_execute_template.txt +++ b/rdagent/components/coder/data_science/model/model_execute_template.txt @@ -27,7 +27,7 @@ val_pred, test_pred = model_workflow( y=train_y, val_X=val_X, val_y=val_y, - test_X=None, + test_X=test_X, hyper_params={} ) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 8742bd26b..d2c8b887d 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -129,14 +129,6 @@ evaluator_code_feedback: {{ code }} --------------Execution feedback:--------------- {{ model_execution_feedback }} - {% if model_value_feedback is not none %} - --------------Model value feedback:--------------- - {{ model_value_feedback }} - {% endif %} - {% if gt_code is not none %} - --------------Ground truth Python code:--------------- - {{ gt_code }} - {% endif %} evaluator_final_feedback: @@ -163,6 +155,4 @@ evaluator_final_feedback: --------------Model shape feedback:--------------- {{ model_shape_feedback }} --------------Model Code feedback:--------------- - {{ model_code_feedback }} - --------------Model value feedback:--------------- - {{ model_value_feedback }} \ No newline at end of file + {{ model_code_feedback }} \ No newline at end of file From da8d0b73fe5ff49e58ea4d956294380507f838fb Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 6 Dec 2024 07:35:25 +0000 Subject: [PATCH 022/304] data loader CoSTEER --- rdagent/components/coder/CoSTEER/__init__.py | 1 + .../coder/data_science/model/eval.py | 22 ++-- .../coder/data_science/model/exp.py | 23 ++-- .../data_science/raw_data_loader/__init__.py | 68 +++++++++++- .../raw_data_loader/data_loader_unit_test | 0 .../data_science/raw_data_loader/eval.py | 39 +++++++ .../raw_data_loader/evaluators.py | 21 ---- .../coder/data_science/raw_data_loader/exp.py | 23 ++-- 
.../data_science/raw_data_loader/prompts.yaml | 102 ++++++++++++++++++ .../components/coder/model_coder/eva_utils.py | 1 + rdagent/core/experiment.py | 6 +- .../data_science/experiment/experiment.py | 4 +- rdagent/scenarios/data_science/scen/scen.py | 1 + 13 files changed, 251 insertions(+), 60 deletions(-) delete mode 100644 rdagent/components/coder/data_science/raw_data_loader/data_loader_unit_test create mode 100644 rdagent/components/coder/data_science/raw_data_loader/eval.py delete mode 100644 rdagent/components/coder/data_science/raw_data_loader/evaluators.py create mode 100644 rdagent/components/coder/data_science/raw_data_loader/prompts.yaml diff --git a/rdagent/components/coder/CoSTEER/__init__.py b/rdagent/components/coder/CoSTEER/__init__.py index 7dddbfe2d..a6c53a65d 100644 --- a/rdagent/components/coder/CoSTEER/__init__.py +++ b/rdagent/components/coder/CoSTEER/__init__.py @@ -105,4 +105,5 @@ def develop(self, exp: Experiment) -> Experiment: pickle.dump(self.knowledge_base, open(self.new_knowledge_base_path, "wb")) logger.info(f"New knowledge base saved to {self.new_knowledge_base_path}") exp.sub_workspace_list = experiment.sub_workspace_list + exp.experiment_workspace = experiment.experiment_workspace return exp diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 12fd1b2e0..39ece8885 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -57,7 +57,7 @@ def evaluate( assert isinstance(implementation, ModelFBWorkspace) model_execution_feedback, val_pred_array, test_pred_array = implementation.execute( - # Parameters? + # Parameters? ) # ignore gt_implementation gt_np_array = None @@ -67,21 +67,21 @@ def evaluate( batch_size = 8 num_classes = self.scen.model_output_channel if hasattr(self.scen, "model_output_channel") else 1 # TODO: num_class may not be specified in data description. Maybe shape evaluate is not necessary. - shape_feedback = "" - expected_val_shape = (batch_size, num_classes) - expected_test_shape = (batch_size, num_classes) + shape_feedback = "" + expected_val_shape = (batch_size, num_classes) + expected_test_shape = (batch_size, num_classes) val_shape_feedback, val_shape_decision = shape_evaluator( - val_pred_array, + val_pred_array, expected_val_shape, ) - shape_feedback += f"Validation Output: {val_shape_feedback}\n" + shape_feedback += f"Validation Output: {val_shape_feedback}\n" test_shape_feedback, test_shape_decision = shape_evaluator( - test_pred_array, + test_pred_array, expected_test_shape, - ) + ) shape_feedback += f"Test Output: {test_shape_feedback}\n" # value feedback necessary? - value_feedback = "" + value_feedback = "" code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( target_task=target_task, implementation=implementation, @@ -106,7 +106,7 @@ def evaluate( final_feedback=final_feedback, final_decision=final_decision, # value_generated_flag=(gen_np_array is not None), - value_generated_flag=(val_pred_array is not None and test_pred_array is not None), + value_generated_flag=(val_pred_array is not None and test_pred_array is not None), final_decision_based_on_gt=(gt_implementation is not None), ) @@ -153,4 +153,4 @@ class XXX4SpecEval: - Test2: Ensure the hyperparameters are 1) being used, and 2) the model remains stable. 
- Different hyperparameters will yield different results - Same hyperparameters will yield the same results - """ \ No newline at end of file + """ diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index c58e2db6e..1832cf870 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -12,6 +12,7 @@ # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace + class ModelTask(CoSTEERTask): def __init__( self, @@ -30,7 +31,7 @@ def __init__( self.variables: str = variables self.hyperparameters: str = hyperparameters self.model_type: str = ( - model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model + model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model # TODO: More Models Supported ) super().__init__(name=name, description=description, *args, **kwargs) @@ -53,24 +54,25 @@ def from_dict(dict): def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" + class ModelFBWorkspace(FBWorkspace): def execute(self): super().execute() try: de = DockerEnv(DSDockerConf()) de.prepare() - np.save(os.path.join(self.workspace_path, "train_X.npy"), train_X) - np.save(os.path.join(self.workspace_path, "train_y.npy"), train_y) - np.save(os.path.join(self.workspace_path, "val_X.npy"), val_X) - np.save(os.path.join(self.workspace_path, "val_y.npy"), val_y) - np.save(os.path.join(self.workspace_path, "test_X.npy"), test_X) + np.save(os.path.join(self.workspace_path, "train_X.npy"), train_X) + np.save(os.path.join(self.workspace_path, "train_y.npy"), train_y) + np.save(os.path.join(self.workspace_path, "val_X.npy"), val_X) + np.save(os.path.join(self.workspace_path, "val_y.npy"), val_y) + np.save(os.path.join(self.workspace_path, "test_X.npy"), test_X) # TODO: generate dataset automatically dump_code = (Path(__file__).parent / "model_execute_template.txt").read_text() log, results = de.dump_python_code_run_and_get_results( code=dump_code, - dump_file_names=["execution_feedback_str.pkl", "val_pred.pkl", "test_pred.pkl"], + dump_file_names=["execution_feedback_str.pkl", "val_pred.pkl", "test_pred.pkl"], local_path=str(self.workspace_path), env={}, code_dump_file_py_name="model_test", @@ -81,12 +83,11 @@ def execute(self): except Exception as e: execution_feedback_str = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" - val_pred_array = None - test_pred_array = None + val_pred_array = None + test_pred_array = None if len(execution_feedback_str) > 2000: execution_feedback_str = ( execution_feedback_str[:1000] + "....hidden long error message...." + execution_feedback_str[-1000:] ) - return execution_feedback_str, val_pred_array, test_pred_array - \ No newline at end of file + return execution_feedback_str, val_pred_array, test_pred_array diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 1f7ab1449..1642bb590 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -22,6 +22,8 @@ - Each coder could be tested. 
""" +import json + from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator @@ -31,8 +33,14 @@ from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) +from rdagent.components.coder.data_science.raw_data_loader.eval import ( + DataLoaderCoSTEEREvaluator, +) from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.data_science.experiment.experiment import DataLoaderFBWorkspace +from rdagent.utils.agent.tpl import T class DataLoaderMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): @@ -40,10 +48,62 @@ def implement_one_task( self, target_task: DataLoaderTask, queried_knowledge: CoSTEERQueriedKnowledge | None = None, - ) -> str: - ... # prompting + ) -> dict[str, str]: # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. + competition_info = self.scen.competition_descriptions + + # 1. specifications + system_prompt = T(".prompts:spec.system").r(competition_info=competition_info) + data_loader_prompt = T(".prompts:spec.user.data_loader").r() + feature_prompt = T(".prompts:spec.user.feature").r() + model_prompt = T(".prompts:spec.user.model").r() + ensemble_prompt = T(".prompts:spec.user.ensemble").r() + workflow_prompt = T(".prompts:spec.user.workflow").r() + + spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) + + data_loader_spec = json.loads(spec_session.build_chat_completion(user_prompt=data_loader_prompt))["spec"] + feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt))["spec"] + model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt))["spec"] + ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt))["spec"] + workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt))["spec"] + + # 2. code + system_prompt = T(".prompts:data_loader_coder.system").r() + user_prompt = T(".prompts:data_loader_coder.user").r( + competition_info=competition_info, data_loader_spec=data_loader_spec + ) + + data_loader_code = json.loads( + APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt) + )["code"] + + return { + "spec/data_loader.md": data_loader_spec, + "spec/feature.md": feature_spec, + "spec/model.md": model_spec, + "spec/ensemble.md": ensemble_spec, + "spec/workflow.md": workflow_spec, + "load_data.py": data_loader_code, + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. 
+ """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + evo.experiment_workspace + # evo.sub_workspace_list[index] = DataLoaderFBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_code(**code_list[index]) + return evo class DataLoaderCoSTEER(CoSTEER): @@ -54,8 +114,8 @@ def __init__( **kwargs, ) -> None: eva = CoSTEERMultiEvaluator( - # DataLoaderCoSTEEREvaluator(scen=scen), scen=scen + DataLoaderCoSTEEREvaluator(scen=scen), scen=scen ) # Please specify whether you agree running your eva in parallel or not es = DataLoaderMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/raw_data_loader/data_loader_unit_test b/rdagent/components/coder/data_science/raw_data_loader/data_loader_unit_test deleted file mode 100644 index e69de29bb..000000000 diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py new file mode 100644 index 000000000..ca3cc0513 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -0,0 +1,39 @@ +# tess successfully running. +# (GPT) if it aligns with the spec & rationality of the spec. +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERMultiFeedback, + CoSTEERSingleFeedback, +) +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import Task, Workspace + + +class DataLoaderCoSTEEREvaluator(CoSTEEREvaluator): + def evaluate( + self, + target_task: Task, + implementation: Workspace, + gt_implementation: Workspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> CoSTEERSingleFeedback: + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return CoSTEERSingleFeedback( + execution_feedback="This task has failed too many times, skip implementation.", + shape_feedback="This task has failed too many times, skip implementation.", + value_feedback="This task has failed too many times, skip implementation.", + code_feedback="This task has failed too many times, skip implementation.", + final_feedback="This task has failed too many times, skip implementation.", + final_decision=False, + ) + + implementation.execute() + + return CoSTEERSingleFeedback() diff --git a/rdagent/components/coder/data_science/raw_data_loader/evaluators.py b/rdagent/components/coder/data_science/raw_data_loader/evaluators.py deleted file mode 100644 index f2ac52428..000000000 --- a/rdagent/components/coder/data_science/raw_data_loader/evaluators.py +++ /dev/null @@ -1,21 +0,0 @@ -# tess successfully running. -# (GPT) if it aligns with the spec & rationality of the spec. 
-from rdagent.components.coder.CoSTEER.evaluators import ( - CoSTEEREvaluator, - CoSTEERMultiFeedback, - CoSTEERSingleFeedback, -) -from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.experiment import Task, Workspace - - -class DataLoaderCoSTEEREvaluator(CoSTEEREvaluator): - def evaluate( - self, - target_task: Task, - implementation: Workspace, - gt_implementation: Workspace, - queried_knowledge: QueriedKnowledge = None, - **kwargs, - ) -> CoSTEERSingleFeedback: - pass \ No newline at end of file diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py index a10160b9c..184be0aba 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/exp.py +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -8,6 +8,7 @@ from rdagent.core.experiment import Experiment, FBWorkspace from rdagent.core.utils import cache_with_pickle from rdagent.oai.llm_utils import md5_hash +from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf # TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace @@ -47,16 +48,22 @@ def execute(self): de.prepare() # TODO: UNIT TEST for data loader - dump_code = (Path(__file__).parent / "data_loader_unit_test.txt").read_text() + dump_code = T(".prompts:data_loader_execute_code").r() + log, results = de.dump_python_code_run_and_get_results( + code=dump_code, + dump_file_names=["data.pkl"], + local_path=str(self.workspace_path), + code_dump_file_py_name="execute_data_loader", + ) + if results is None: + raise RuntimeError(f"Failed to execute load_data.py, Log: {log}") # TODO: Cache the processed data into a pickle file - pass + execution_feedback = "Execution successful" + preprocessed_data = results[0] except Exception as e: - pass + execution_feedback = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" + preprocessed_data = None - return ( - "data_loader.py and spec.md executed successfully", - "content of spec.md", - "pkl generated by data_loader.py", - ) + return execution_feedback, preprocessed_data diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml new file mode 100644 index 000000000..1b7e6cead --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -0,0 +1,102 @@ +data_loader_execute_code: |- + # execute and cache the preprocessed data + import pickle + from load_data import load_data + data = load_data() + with open('data.pkl', 'wb') as f: + pickle.dump(data, f) + + +spec: + system: |- + You are a Python data scientist working on a new kaggle competition project. This project will be used to analyze data and build models to predict future outcomes, and this project codes will be written by GPT. + Your task is to write five specification texts (markdown format) for follow-up tasks. The five tasks are: data loader(and preprocess), feature engineering, model building, ensemble, and workflow. + The competition information is provided as a html format string. + + -----------Competition Information----------- + {{ competition_info }} + + user: + data_loader: |- + Data loader specification text should include two parts: + 1. function interface: + - all raw data files are in /kaggle/input/ directory, so the function should take no input. + - function name must be "load_data". + - have annotations for the output. 
+ - have a docstring that describes the function. + 2. Precautions: + some precautions for data loading and preprocessing. + + Please response the specification in the following json format. Here is an example structure for the JSON output: + { + "spec": "The specification as a string." + } + + feature: |- + Feature engineering specification text should include two parts: + 1. function interface: + - function name must be "feat_eng". + - have annotations for the input and output. + - have a docstring that describes the function. + 2. Precautions: + some precautions for feature engineering. + + Please response the specification in the following json format. Here is an example structure for the JSON output: + { + "spec": "The specification as a string." + } + + model: |- + Model building specification text should include two parts: + 1. function interface: + - function name must be "model_workflow". + - have annotations for the input and output. + - have a docstring that describes the function. + 2. Precautions: + some precautions for model building. + + Please response the specification in the following json format. Here is an example structure for the JSON output: + { + "spec": "The specification as a string." + } + + ensemble: |- + Ensemble specification text should include two parts: + 1. function interface: + - function name must be "ensemble". + - have annotations for the input and output. + - have a docstring that describes the function. + 2. Precautions: + some precautions for ensemble. + + Please response the specification in the following json format. Here is an example structure for the JSON output: + { + "spec": "The specification as a string." + } + + workflow: |- + Workflow specification text should include one parts: + 1. Precautions: + some precautions for workflow. + + Please response the specification in the following json format. Here is an example structure for the JSON output: + { + "spec": "The specification as a string." + } + +data_loader_coder: + system: |- + You are a Python data scientist working on a new project. This project will be used to analyze data and build models to predict future outcomes, and this project codes will be written by GPT. + Your task is to write a Python function that loads and preprocesses data. The function should take a file path as input and return a pandas DataFrame with the data loaded and preprocessed. + You should follow the provided specifications to complete this task. + + Please response the code in the following json format. Here is an example structure for the JSON output: + { + "code": "The Python code as a string." 
+ } + user: |- + ---------Competition Information--------- + {{ competition_info }} + + ---------Data Loader Specification--------- + {{ data_loader_spec }} diff --git a/rdagent/components/coder/model_coder/eva_utils.py b/rdagent/components/coder/model_coder/eva_utils.py index 4ee17ffac..366000f2e 100644 --- a/rdagent/components/coder/model_coder/eva_utils.py +++ b/rdagent/components/coder/model_coder/eva_utils.py @@ -14,6 +14,7 @@ evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + # This shape evaluator is also used in data_science def shape_evaluator(prediction: np.ndarray, target_shape: Tuple = None) -> Tuple[str, bool]: if target_shape is None or prediction is None: diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index d1318b6a3..b9779b1a9 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -151,10 +151,8 @@ def inject_code(self, **files: str) -> None: for k, v in files.items(): self.code_dict[k] = v target_file_path = self.workspace_path / k - if not target_file_path.parent.exists(): - target_file_path.parent.mkdir(parents=True, exist_ok=True) - with Path.open(self.workspace_path / k, "w") as f: - f.write(v) + target_file_path.parent.mkdir(parents=True, exist_ok=True) + target_file_path.write_text(v) def get_files(self) -> list[Path]: """ diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index af5ec4203..1f3573f9b 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -31,7 +31,7 @@ # } -class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, DataLoaderFBWorkspace]): +class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, DSFBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = DSFBWorkspace() @@ -40,6 +40,8 @@ def __init__(self, *args, **kwargs) -> None: class ModelExperiment(Experiment[ModelTask, DSFBWorkspace, ModelFBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) + + # TODO: use previeous workspace self.experiment_workspace = DSFBWorkspace() # if len(self.based_experiments) > 0: # self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 2be520ab6..256893509 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -1,5 +1,6 @@ import json from pathlib import Path +from typing import Literal from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.experiment import Task From b2a445c404b2731ee2c3b31537f87c3ee0defa25 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Fri, 6 Dec 2024 09:58:11 +0000 Subject: [PATCH 023/304] ds model eval: init use gpt for shape evaluator --- .../coder/data_science/model/eva_utils.py | 61 ++++++++++++++++++- .../coder/data_science/model/eval.py | 38 ++++++------ .../coder/data_science/model/prompts.yaml | 20 ++++++ .../aerial-cactus-identification/spec.md | 1 + 4 files changed, 101 insertions(+), 19 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eva_utils.py b/rdagent/components/coder/data_science/model/eva_utils.py index b2f0e347f..ea51ed89e 100644 --- a/rdagent/components/coder/data_science/model/eva_utils.py +++ 
b/rdagent/components/coder/data_science/model/eva_utils.py @@ -14,6 +14,65 @@ evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") +def expected_shape_detect( + prediction: np.ndarray, + spec_message: str, + model_execution_feedback: str, +) -> str: + if prediction is None: + return "No output generated from the model. Skip value evaluation" + elif spec_message is None: + return ( + "No spec provided. Shape evaluation not impractical", + ) + else: + pre_shape = prediction.shape + + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(evaluate_prompts["evaluator_shape_feedback"]["system"]) + .render( + spec=( + spec_message + if spec_message is not None + else "No spec description provided." + ) + ) + ) + + execution_feedback_to_render = model_execution_feedback + + for _ in range(10): # 10 times to split the content is enough + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + evaluate_prompts["evaluator_shape_feedback"]["user"], + ) + .render( + pre_shape=pre_shape, + model_execution_feedback=execution_feedback_to_render, + ) + ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + > LLM_SETTINGS.chat_token_limit + ): + execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] + else: + break + + critic_response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False, + ) + + return critic_response + + class ModelCodeEvaluator(Evaluator): def evaluate( self, @@ -25,7 +84,7 @@ def evaluate( assert isinstance(implementation, ModelFBWorkspace) model_task_information = target_task.get_task_information() - code = implementation.code + code = implementation.code_dict["model01.py"] system_prompt = ( Environment(undefined=StrictUndefined) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index b9c135215..e862f8f12 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -11,6 +11,7 @@ from rdagent.components.coder.data_science.model.eva_utils import ( ModelCodeEvaluator, ModelFinalEvaluator, + expected_shape_detect, ) from rdagent.components.coder.model_coder.eva_utils import shape_evaluator from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask @@ -62,24 +63,25 @@ def evaluate( model_execution_feedback, pred_list= implementation.execute( batch_size=batch_size, ) - val_pred_array, test_pred_array = pred_list - - # TODO: auto specify shape from spec.md using GPT - - num_classes = self.scen.model_output_channel if hasattr(self.scen, "model_output_channel") else 1 shape_feedback = "" - expected_val_shape = (batch_size, num_classes) - expected_test_shape = (batch_size, num_classes) - val_shape_feedback, val_shape_decision = shape_evaluator( - val_pred_array, - expected_val_shape, - ) - shape_feedback += f"Validation Output: {val_shape_feedback}\n" - test_shape_feedback, test_shape_decision = shape_evaluator( - test_pred_array, - expected_test_shape, - ) - shape_feedback += f"Test Output: {test_shape_feedback}\n" + if pred_list is None: + shape_feedback += "No output generated from the model. No shape evaluation conducted." 
+ else: + val_pred_array, test_pred_array = pred_list + spec_message = implementation.code_dict["spec.md"] + val_shape_feedback = expected_shape_detect( + val_pred_array, + spec_message, + model_execution_feedback=model_execution_feedback, + ) + test_shape_feedback = expected_shape_detect( + test_pred_array, + spec_message, + model_execution_feedback=model_execution_feedback, + ) + + shape_feedback += f"Validation Output: {val_shape_feedback}\n" + shape_feedback += f"Test Output: {test_shape_feedback}\n" value_feedback = "The value feedback is ignored, and the value decision is automatically set as true." code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( target_task=target_task, @@ -101,7 +103,7 @@ def evaluate( code_feedback=code_feedback, final_feedback=final_feedback, final_decision=final_decision, - value_generated_flag=(val_pred_array is not None and test_pred_array is not None), + value_generated_flag=(pred_list is not None), final_decision_based_on_gt=False, ) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index d2c8b887d..1affe2381 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -99,6 +99,26 @@ evolving_strategy_model_coder: {% endfor %} {% endif %} +evaluator_shape_feedback: + system: |- + User is trying to evaluate whether a model output shape is correct not. The correct message about the ground truth shape is given in spec.md as below: + {{ spec }} + + The user will provide you the actual output of the model. The model is a part for solving a task in an given scenario. This model takes train dataset as input. Valid and test dataset are optional. The model workflow will generate prediction output of valid and test dataset. + The user will provide the execution result message. + + Your job is to compare the output user provide and the message from spec.md to evaluate whether the user's model output is correct. + + In your response you should give a clear judgement and also point out the expected shape and actual shape of the model output. + Here is an example structure for the output: + Expected prediction shape: (8, 1). The actual output shape: (8, 1). The shape of the output is correct. + + user: |- + --------------Actual Output Shape:--------------- + {{ pre_shape }} + --------------Execution feedback:--------------- + {{ model_execution_feedback }} + evaluator_code_feedback: system: |- User is trying to implement some models in the following scenario: diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md index 623a9a6b7..696e2ffd7 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md @@ -86,6 +86,7 @@ def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y Predictions on the validation data, predictions on the test data """ ``` +- In this task, the shape of output should be (batch_size, num_class), as num_class = 1 here. - The function should handle data augmentation, model creation, training, and prediction. 
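For intuition, the judgement this patch delegates to the LLM (via the new `evaluator_shape_feedback` prompt) reduces to comparing the prediction's shape against the `(batch_size, num_class)` expectation stated in `spec.md`. Below is a minimal deterministic sketch of that comparison; the helper name and the hard-coded `(8, 1)` expectation are illustrative assumptions, not part of the patch:

```python
import numpy as np


def naive_shape_check(prediction, expected_shape):
    # Plain stand-in for the LLM-based shape judgement (assumed helper, not in the patch).
    if prediction is None:
        return "No output generated from the model. Skip shape evaluation."
    actual_shape = tuple(prediction.shape)
    verdict = "correct" if actual_shape == expected_shape else "incorrect"
    return (
        f"Expected prediction shape: {expected_shape}. "
        f"The actual output shape: {actual_shape}. "
        f"The shape of the output is {verdict}."
    )


# spec.md for this competition states the output shape should be (batch_size, num_class) with num_class = 1.
print(naive_shape_check(np.zeros((8, 1)), (8, 1)))
```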
From dea5605c73b3b311c717ff2c15b77bc75952b4dc Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 6 Dec 2024 10:33:53 +0000 Subject: [PATCH 024/304] refactor: Update data loader evaluation and execution logic --- .gitignore | 1 + .../components/coder/CoSTEER/evaluators.py | 3 + .../coder/data_science/model/exp.py | 2 - .../data_science/raw_data_loader/__init__.py | 5 +- .../data_science/raw_data_loader/eval.py | 59 ++++++++++++++++--- .../eval_tests/data_loader_test.py | 29 +++++++++ .../coder/data_science/raw_data_loader/exp.py | 33 ----------- .../data_science/raw_data_loader/prompts.yaml | 23 ++++++++ rdagent/core/evaluation.py | 15 ++++- rdagent/core/experiment.py | 7 ++- .../data_science/experiment/experiment.py | 15 +++-- .../data_science/proposal/exp_gen.py | 1 - 12 files changed, 135 insertions(+), 58 deletions(-) create mode 100644 rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py diff --git a/.gitignore b/.gitignore index 400cf7d8e..773e96569 100644 --- a/.gitignore +++ b/.gitignore @@ -170,3 +170,4 @@ mlruns/ # shell script *.out *.sh +.aider* diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index 37f4d1ca0..aee25eb23 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -12,6 +12,9 @@ class CoSTEERSingleFeedback(Feedback): + # TODO: (xiao) + # it should be a subclass of FBWorkspaceExeFeedback + # A better name of it may be NormalFeedback """This class is a base class for all code generator feedback to single implementation""" def __init__( diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index dd6cccd3a..18eeb0a29 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -10,8 +10,6 @@ from rdagent.oai.llm_utils import md5_hash from rdagent.utils.env import DockerEnv, DSDockerConf -# TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace - class ModelTask(CoSTEERTask): def __init__( diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 1642bb590..24d290501 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -39,7 +39,6 @@ from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.data_science.experiment.experiment import DataLoaderFBWorkspace from rdagent.utils.agent.tpl import T @@ -76,7 +75,7 @@ def implement_one_task( ) data_loader_code = json.loads( - APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt) + APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True) )["code"] return { @@ -100,7 +99,7 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): continue if evo.sub_workspace_list[index] is None: evo.experiment_workspace - # evo.sub_workspace_list[index] = DataLoaderFBWorkspace(target_task=evo.sub_tasks[index]) + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace 
evo.sub_workspace_list[index].inject_code(**code_list[index]) return evo diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index ca3cc0513..7ee8e198c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -1,28 +1,58 @@ # tess successfully running. # (GPT) if it aligns with the spec & rationality of the spec. +import json +from abc import abstractclassmethod +from dataclasses import dataclass +from os import system from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, CoSTEERSingleFeedback, ) +from rdagent.core.evaluation import Feedback from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.experiment import Task, Workspace +from rdagent.core.experiment import FBWorkspace, Task, Workspace +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DSDockerConf, DockerEnv +from pathlib import Path + +DIRNAME = Path(__file__).absolute().resolve().parent + +# TODO: +# 1. It seems logically sound, but we currently lack a scenario to apply it. +# 2. If it proves to be useful, relocate it to a more general location. +# +# class FBWorkspaceExeFeedback(Feedback): +# """ +# It pairs with FBWorkspace in the abstract level. +# """ +# # ws: FBWorkspace # potential +# stdout: str + + +@dataclass +class DataLoaderEvalFeedback(Feedback): + execution: str + checking: str # inlucding every check in the testing (constraints about the generated value) + code: str + final_decision: bool class DataLoaderCoSTEEREvaluator(CoSTEEREvaluator): + def evaluate( self, target_task: Task, - implementation: Workspace, - gt_implementation: Workspace, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, queried_knowledge: QueriedKnowledge = None, **kwargs, ) -> CoSTEERSingleFeedback: + target_task_information = target_task.get_task_information() - if ( - queried_knowledge is not None - and target_task_information in queried_knowledge.success_task_to_knowledge_dict - ): + if (queried_knowledge is not None and + target_task_information in queried_knowledge.success_task_to_knowledge_dict): return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: return CoSTEERSingleFeedback( @@ -34,6 +64,17 @@ def evaluate( final_decision=False, ) - implementation.execute() + de = DockerEnv(conf=DSDockerConf()) + + # TODO: do we need to clean the generated tempory content? 
+ fname = "data_loader_test.py" + with (DIRNAME / "eval_tests" / "data_loader_test.py").open("r") as f: + test_code = f.read() + implementation.inject_code(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") + + system_prompt = T(".prompts:data_loader_eval.system").r(test_code=test_code) + user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout) - return CoSTEERSingleFeedback() + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return DataLoaderEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py new file mode 100644 index 000000000..aec0faa10 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py @@ -0,0 +1,29 @@ +""" +A qualified data loader should support following features +- successfully run +- len(test) == len(test_ids) == submission length +- len(train) == len(y) + +Please make sure the stdout is rich enough to support informative feedback +""" +import pickle +import logging +from load_data import load_data + + + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +X, y, X_test, test_ids = load_data() + +# Validate the conditions mentioned in the docstring +assert len(X_test) == len(test_ids), "Mismatch in length of test images and test IDs" +assert len(X) == len(y), "Mismatch in length of training images and labels" + +logging.info("Data loader test passed successfully. Length of test images matches length of test IDs.") + +with open('data.pkl', 'wb') as f: + pickle.dump((X, y, X_test, test_ids), f) + + diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py index 184be0aba..8bb98baaa 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/exp.py +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -11,8 +11,6 @@ from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf -# TODO: Complete the implementation of the class DataLoaderTask and class DataLoaderFBWorkspace - class DataLoaderTask(CoSTEERTask): def __init__( @@ -36,34 +34,3 @@ def from_dict(dict): def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" - - -class DataLoaderFBWorkspace(FBWorkspace): - - # TODO: use the cache_with_pickle decorator. 
- def execute(self): - super().execute() - try: - de = DockerEnv(conf=DSDockerConf()) - de.prepare() - - # TODO: UNIT TEST for data loader - dump_code = T(".prompts:data_loader_execute_code").r() - log, results = de.dump_python_code_run_and_get_results( - code=dump_code, - dump_file_names=["data.pkl"], - local_path=str(self.workspace_path), - code_dump_file_py_name="execute_data_loader", - ) - - if results is None: - raise RuntimeError(f"Failed to execute load_data.py, Log: {log}") - # TODO: Cache the processed data into a pickle file - execution_feedback = "Execution successful" - preprocessed_data = results[0] - - except Exception as e: - execution_feedback = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" - preprocessed_data = None - - return execution_feedback, preprocessed_data diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 1b7e6cead..7f0a8bb5b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -100,3 +100,26 @@ data_loader_coder: ---------Data Loader Specification--------- {{ data_loader_spec }} + + +data_loader_eval: + system: |- + You are data scientist. + You are testing the data_loader with the following code + ```python + {{test_code}} + ``` + You'll be given the stdout of your testing scripts. + Please respond with your feedback in the following JSON format and order + ```json + { + "execution": "Describe how well the data loader executed, including any errors or issues encountered.", + "checking": "Detail the checks performed on the data loaded, including data integrity and correctness.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "final_decision": + } + ``` + user: |- + ``` + {{stdout}} + ``` diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py index 4c0af18ca..71d32d920 100644 --- a/rdagent/core/evaluation.py +++ b/rdagent/core/evaluation.py @@ -8,10 +8,23 @@ class Feedback: + """ + Design Principle: + It will be more like a **dataclass**. + The building process of feedback will should be in evaluator + """ pass class Evaluator(ABC): + """ + Design Principle: + + It should cover the building process of feedback from raw information. + Typically the buiilding of feedback will be two phases. + 1. raw information including stdout & workspace (feeedback itself will handle this) + 2. advanced/summaried feedback information. 
(evaluate will handle this) + """ def __init__( self, scen: Scenario, @@ -25,5 +38,5 @@ def evaluate( implementation: "Workspace", gt_implementation: "Workspace", **kwargs: object, - ) -> None: + ) -> Feedback: raise NotImplementedError diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index b9779b1a9..64ef4ded9 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -12,6 +12,7 @@ from typing import Any, Generic, Optional, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.utils.env import Env if typing.TYPE_CHECKING: from rdagent.core.proposal import Hypothesis @@ -105,6 +106,7 @@ def run_pipeline(self, **files: str): def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) + # TODO: rename it to file_dict; inject_code -> inject_files self.code_dict: dict[str, Any] = ( {} ) # The code injected into the folder, store them in the variable to reproduce the former result @@ -185,12 +187,15 @@ def clear(self) -> None: shutil.rmtree(self.workspace_path, ignore_errors=True) self.code_dict = {} - def execute(self) -> object | None: + def execute(self, env: Env | None = None, entry: str | None = None) -> object | None: """ Before each execution, make sure to prepare and inject code """ self.prepare() self.inject_code(**self.code_dict) + # TODO: env should be not None in new design (no code can run without environment) + if env is not None and entry is not None: + return env.run(entry, self.workspace_path) return None def __str__(self) -> str: diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 1f3573f9b..9499ce090 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -3,13 +3,12 @@ from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.data_science.raw_data_loader.exp import ( - DataLoaderFBWorkspace, DataLoaderTask, ) from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace, FactorTask from rdagent.components.coder.model_coder.model import ModelFBWorkspace, ModelTask from rdagent.core.experiment import Experiment -from rdagent.scenarios.data_science.experiment.workspace import DSFBWorkspace +from rdagent.scenarios.data_science.experiment.workspace import FBWorkspace # KG_MODEL_TYPE_XGBOOST = "XGBoost" # KG_MODEL_TYPE_RANDOMFOREST = "RandomForest" @@ -31,18 +30,18 @@ # } -class DataLoaderExperiment(Experiment[DataLoaderTask, DSFBWorkspace, DSFBWorkspace]): +class DataLoaderExperiment(Experiment[DataLoaderTask, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = DSFBWorkspace() + self.experiment_workspace = FBWorkspace() -class ModelExperiment(Experiment[ModelTask, DSFBWorkspace, ModelFBWorkspace]): +class ModelExperiment(Experiment[ModelTask, FBWorkspace, ModelFBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) # TODO: use previeous workspace - self.experiment_workspace = DSFBWorkspace() + self.experiment_workspace = FBWorkspace() # if len(self.based_experiments) > 0: # self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) # self.experiment_workspace.data_description = deepcopy( @@ -61,10 +60,10 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: # ] -class 
FactorExperiment(Experiment[FactorTask, DSFBWorkspace, FactorFBWorkspace]): +class FactorExperiment(Experiment[FactorTask, FBWorkspace, FactorFBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = DSFBWorkspace( + self.experiment_workspace = FBWorkspace( template_folder_path=Path(__file__).resolve() / Path(DS_RD_SETTING.template_path).resolve() / DS_RD_SETTING.competition diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 45b900b67..ff7d04d98 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -3,7 +3,6 @@ from rdagent.components.coder.data_science.raw_data_loader.exp import ( DataLoaderExperiment, - DataLoaderFBWorkspace, DataLoaderTask, ) from rdagent.components.proposal import LLMHypothesis2Experiment, LLMHypothesisGen From e023791550453b67bd2d8325b8ca54a9334105b1 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 9 Dec 2024 03:10:41 +0000 Subject: [PATCH 025/304] redundance --- .../components/coder/data_science/raw_data_loader/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 24d290501..f51ce5057 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -98,7 +98,6 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): if code_list[index] is None: continue if evo.sub_workspace_list[index] is None: - evo.experiment_workspace # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace evo.sub_workspace_list[index].inject_code(**code_list[index]) From bf36de8db656d9f023a8410b5eb4734c3885a39b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 9 Dec 2024 07:28:28 +0000 Subject: [PATCH 026/304] split spec.md --- .../spec/data_loader.md | 4 ++ .../spec/ensemble.md | 28 +++++++++++++ .../spec/feature.md | 33 ++++++++++++++++ .../spec/model.md | 39 +++++++++++++++++++ .../spec/workflow.md | 24 ++++++++++++ 5 files changed, 128 insertions(+) create mode 100644 rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/data_loader.md create mode 100644 rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md create mode 100644 rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md create mode 100644 rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md create mode 100644 rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/data_loader.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/data_loader.md new file mode 100644 index 000000000..0d5168f81 --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/data_loader.md @@ -0,0 +1,4 @@ +## Data Loading + +- Implement a function to load data from raw files. +- The function should return training images, training labels, test images, and test IDs. 
\ No newline at end of file diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md new file mode 100644 index 000000000..e3d04236e --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md @@ -0,0 +1,28 @@ +## Ensemble and Decision Making + +- Implement a function for ensemble and decision making with the following signature: + +```python +def ens_and_decision(test_pred_l: list[np.ndarray], val_pred_l: list[np.ndarray], val_label: np.ndarray) -> np.ndarray: + """ + Handle the following: + 1) Ensemble predictions using a simple average. + 2) Make final decision after ensemble (convert the predictions to final binary form). + + Parameters + ---------- + test_pred_l : list[np.ndarray] + List of predictions on the test data. + val_pred_l : list[np.ndarray] + List of predictions on the validation data. + val_label : np.ndarray + True labels of the validation data. + + Returns + ------- + np.ndarray + Binary predictions on the test data. + """ +``` + +- The function should combine predictions and convert them to a binary format. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md new file mode 100644 index 000000000..433dd4c04 --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md @@ -0,0 +1,33 @@ + +## Feature Engineering + +- Implement a function for feature engineering with the following signature: + +```python +def feature_eng(X: np.ndarray, y: np.ndarray | None = None, X_fit: np.ndarray | None = None, y_fit: np.ndarray | None = None, param: object | None = None) -> tuple[np.ndarray, np.ndarray | None, object]: + """ + Perform feature engineering on the input data. + + Parameters: + - X: np.ndarray + The input data to be transformed. + - y: np.ndarray | None + The target data. + - X_fit: np.ndarray | None + Data for fitting the transformation parameters. + - y_fit: np.ndarray | None + Target data for fitting. + - param: object | None + Pre-fitted parameters for transformation. + + Returns: + - transformed_data: np.ndarray + Transformed data. + - transformed_target: np.ndarray | None + Transformed target data. + - fitted_param: object + Fitted parameters. + """ +``` + +- Ensure that the feature engineering process is consistent and can be applied to both training and test data. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md new file mode 100644 index 000000000..c333497ab --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md @@ -0,0 +1,39 @@ +## Model Workflow + +- Implement a function to manage the model workflow with the following signature: + +```python +def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params; dict = {}) -> tuple[np.ndarray | None, np.ndarray | None, dict]: + """ + Manages the workflow of a machine learning model, including training, validation. + The testing&validation's inference is included, as well + + - If test/valid exist, output inference on them + - Follow the hyperparameter if exists. 
+ - the returned hyperparameter should align with the input(except the newly generated early stop) + - If valid exist, add to update the hyperparameter + + Parameters + ---------- + X : np.ndarray + Training data features. + y : np.ndarray + Training data labels. + val_X : np.ndarray, optional + Validation data features. + val_y : np.ndarray, optional + Validation data labels. + test_X : np.ndarray, optional + Test data features. + **hyper_params + Additional hyperparameters for the model. + + Returns + ------- + tuple[np.ndarray | None, np.ndarray | None, dict] + Predictions on the validation data, predictions on the test data + """ +``` +- In this task, the shape of output should be (batch_size, num_class), as num_class = 1 here. + +- The function should handle data augmentation, model creation, training, and prediction. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md new file mode 100644 index 000000000..10ee6de06 --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md @@ -0,0 +1,24 @@ +# Specification for Implementing a Kaggle Competition Project + +This document outlines the structure and interface protocols for implementing a machine learning project, similar to a Kaggle competition. Follow these guidelines to ensure consistency and maintainability across projects. + +## Project Structure + +The project should be organized into the following components: + +1. **Data Loading** (`load_data.py`): A module responsible for loading and preprocessing raw data. +2. **Feature Engineering**(`feat*.py`): A module for transforming raw data into features suitable for model training. +3. **Model Workflow**(`model*.py`): A module that manages the training, validation, and testing of machine learning models. +4. **Ensemble and Decision Making**(`ens.py`): A module for combining predictions from multiple models and making final decisions. +5. **Workflow**(`main.py`): A script to put the above component together to get the final submission(`submission.csv`) + +## Submission + +- Implement a script to generate the submission file. +- The script should write predictions to a CSV file in the format required by the competition. + +## General Guidelines + +- Ensure that all modules and functions are well-documented. +- Follow consistent naming conventions and code style. +- Use type annotations for function signatures to improve code readability and maintainability. 
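Taken together, the five spec files added in this patch describe the pipeline that `main.py` is expected to assemble. A minimal sketch of that wiring is shown below, assuming the signatures given in the specs; the `feat01`/`model01` module names follow the `feat*.py`/`model*.py` convention, and the 80/20 split plus the `id`/`has_cactus` submission columns are illustrative assumptions rather than requirements of the specs:

```python
import pandas as pd

from load_data import load_data
from feat01 import feature_eng      # any feat*.py module per the workflow spec
from model01 import model_workflow  # any model*.py module per the workflow spec
from ens import ens_and_decision

# 1. Load raw data.
X, y, X_test, test_ids = load_data()

# 2. Feature engineering: fit on the training data, then reuse the fitted params on the test data.
X, y, param = feature_eng(X, y)
X_test, _, _ = feature_eng(X_test, param=param)

# 3. Hold out a validation split (assumed 80/20; the specs leave the split strategy open).
split = int(0.8 * len(X))
train_X, val_X, train_y, val_y = X[:split], X[split:], y[:split], y[split:]

# 4. Train, validate, and predict on the test set.
val_pred, test_pred, hyper_params = model_workflow(train_X, train_y, val_X=val_X, val_y=val_y, test_X=X_test)

# 5. Ensemble (a single model here) and convert to the final binary decision.
final_pred = ens_and_decision([test_pred], [val_pred], val_y)

# 6. Write the submission file (column names assumed for this competition).
pd.DataFrame({"id": test_ids, "has_cactus": final_pred.ravel()}).to_csv("submission.csv", index=False)
```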
From 9674c167c744859162ebccb5eb313ffcd9a087e2 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Mon, 9 Dec 2024 08:33:29 +0000 Subject: [PATCH 027/304] ds model test: init evolving strategy and unit test --- .../coder/data_science/model/__init__.py | 27 +--- .../components/coder/data_science/model/es.py | 119 ++++++++++++++++++ .../coder/data_science/model/eva_utils.py | 5 +- .../coder/data_science/model/eval.py | 16 +-- .../coder/data_science/model/exp.py | 4 +- .../model/model_execute_template.txt | 4 +- .../coder/data_science/model/prompts.yaml | 5 +- .../coder/data_science/model/test.py | 42 ++++--- 8 files changed, 169 insertions(+), 53 deletions(-) create mode 100644 rdagent/components/coder/data_science/model/es.py diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 1b11cde75..89e8fdaed 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -1,42 +1,19 @@ from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -from rdagent.components.coder.CoSTEER.evolving_strategy import ( - MultiProcessEvolvingStrategy, -) + from rdagent.components.coder.data_science.model.eval import ModelGeneralCaseSpecEvaluator from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.core.scenario import Scenario +from rdagent.components.coder.data_science.model.es import ModelMultiProcessEvolvingStrategy # from rdagent.utils.agent.tpl import T # T(".prompts:model_generator.user").r() -class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): - def implement_one_task( - self, - target_task: ModelTask, - queried_knowledge: CoSTEERQueriedKnowledge | None = None, - ) -> str: - return """ - import pandas as pd - def Model(): - pass - """ - - def assign_code_list_to_evo(self, code_list: list, evo) -> None: - """ - Assign the code list to the evolving item. - - The code list is aligned with the evolving item's sub-tasks. - If a task is not implemented, put a None in the list. 
- """ - raise NotImplementedError - - class ModelCoSTEER(CoSTEER): def __init__( self, diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py new file mode 100644 index 000000000..9b27b801a --- /dev/null +++ b/rdagent/components/coder/data_science/model/es.py @@ -0,0 +1,119 @@ +import json +from pathlib import Path + +from jinja2 import Environment, StrictUndefined +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) + +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, + CoSTEERQueriedKnowledgeV2, +) +from rdagent.components.coder.data_science.model.exp import ( + ModelTask, + ModelFBWorkspace, +) +from rdagent.core.prompts import Prompts +from rdagent.oai.llm_conf import LLM_SETTINGS +from rdagent.oai.llm_utils import APIBackend + +coder_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + +class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: ModelTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + ) -> str: + model_information_str = target_task.get_task_information() + + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[model_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[model_information_str] + if queried_knowledge is not None + else [] + ) + + queried_former_failed_knowledge_to_render = ( + queried_former_failed_knowledge[0] + if isinstance(queried_knowledge, CoSTEERQueriedKnowledgeV2) + else queried_former_failed_knowledge + ) + + system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + coder_prompts["evolving_strategy_model_coder"]["system"], + ) + .render( + # scenario=self.scen.get_scenario_all_desc(filtered_tag=target_task.model_type), + # TODO: fit new scenario information + scenario=("No scenario description."), + spec=target_task.spec, + queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, + current_code=target_task.base_code, + ) + ) + + queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge + for _ in range(10): # max attempt to reduce the length of user_prompt + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string( + coder_prompts["evolving_strategy_model_coder"]["user"], + ) + .render( + model_information_str=model_information_str, + queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render, + queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, + ) + .strip("\n") + ) + if ( + APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + < LLM_SETTINGS.chat_token_limit + ): + break + elif len(queried_former_failed_knowledge_to_render) > 1: + queried_former_failed_knowledge_to_render = queried_former_failed_knowledge_to_render[1:] + elif len(queried_similar_successful_knowledge_to_render) > 1: + queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge_to_render[1:] + + code = json.loads( + # APIBackend(use_chat_cache=CoSTEER_SETTINGS.coder_use_cache).build_messages_and_create_chat_completion( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True, + ), + )["code"] + return code 
+ """ + import pandas as pd + def Model(): + pass + """ + + def assign_code_list_to_evo(self, code_list, evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. + """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + evo.sub_workspace_list[index] = ModelFBWorkspace(target_task=evo.sub_tasks[index]) + # TODO: avoid hardcode of file name + evo.sub_workspace_list[index].inject_code(**{"model01.py": code_list[index]}) + return evo \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/eva_utils.py b/rdagent/components/coder/data_science/model/eva_utils.py index ea51ed89e..1224e9a75 100644 --- a/rdagent/components/coder/data_science/model/eva_utils.py +++ b/rdagent/components/coder/data_science/model/eva_utils.py @@ -14,7 +14,7 @@ evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") -def expected_shape_detect( +def expected_shape_evaluate( prediction: np.ndarray, spec_message: str, model_execution_feedback: str, @@ -84,6 +84,7 @@ def evaluate( assert isinstance(implementation, ModelFBWorkspace) model_task_information = target_task.get_task_information() + # TODO: avoid hardcode of file name code = implementation.code_dict["model01.py"] system_prompt = ( @@ -91,6 +92,7 @@ def evaluate( .from_string(evaluate_prompts["evaluator_code_feedback"]["system"]) .render( scenario=( + # TODO: Here replaced with ds scen information # self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type) # if self.scen is not None # else "No scenario description." @@ -150,6 +152,7 @@ def evaluate( .from_string(evaluate_prompts["evaluator_final_feedback"]["system"]) .render( scenario=( + # TODO: Here replaced with ds scen information # self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type) # if self.scen is not None # else "No scenario description." diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 9d6d860d7..5bf6d0907 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -11,10 +11,9 @@ from rdagent.components.coder.data_science.model.eva_utils import ( ModelCodeEvaluator, ModelFinalEvaluator, - expected_shape_detect, + expected_shape_evaluate, ) -from rdagent.components.coder.model_coder.eva_utils import shape_evaluator -from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask +from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import Task, Workspace @@ -41,7 +40,7 @@ def evaluate( queried_knowledge: QueriedKnowledge = None, **kwargs, ) -> ModelSingleFeedback: - # target_task_information = target_task.get_task_information() + target_task_information = target_task.get_task_information() if ( queried_knowledge is not None and target_task_information in queried_knowledge.success_task_to_knowledge_dict @@ -67,14 +66,15 @@ def evaluate( if pred_list is None: shape_feedback += "No output generated from the model. No shape evaluation conducted." 
else: - val_pred_array, test_pred_array = pred_list - spec_message = implementation.code_dict["spec.md"] - val_shape_feedback = expected_shape_detect( + val_pred_array, test_pred_array, hypers = pred_list + # spec_message = implementation.code_dict["spec/model.md"] + spec_message = target_task.spec + val_shape_feedback = expected_shape_evaluate( val_pred_array, spec_message, model_execution_feedback=model_execution_feedback, ) - test_shape_feedback = expected_shape_detect( + test_shape_feedback = expected_shape_evaluate( test_pred_array, spec_message, model_execution_feedback=model_execution_feedback, diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index 18eeb0a29..1e7820b45 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -21,7 +21,8 @@ def __init__( hyperparameters: Dict[str, str], formulation: str = None, variables: Dict[str, str] = None, - model_type: Optional[str] = None, + model_type: Optional[str] = None, + spec: str, **kwargs, ) -> None: self.formulation: str = formulation @@ -32,6 +33,7 @@ def __init__( model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model # TODO: More Models Supported ) + self.spec: str = spec super().__init__(name=name, description=description, *args, **kwargs) def get_task_information(self): diff --git a/rdagent/components/coder/data_science/model/model_execute_template.txt b/rdagent/components/coder/data_science/model/model_execute_template.txt index fd4668ee1..6d890deb9 100644 --- a/rdagent/components/coder/data_science/model/model_execute_template.txt +++ b/rdagent/components/coder/data_science/model/model_execute_template.txt @@ -22,7 +22,7 @@ test_X = np.random.rand(8, 64, 64, 3) # Call model_workflow -val_pred, test_pred = model_workflow( +val_pred, test_pred, hypers = model_workflow( X=train_X, y=train_y, val_X=val_X, @@ -42,7 +42,7 @@ else: execution_feedback_str += "Test predictions are None.\n" # Save the outputs -pred_list = [val_pred, test_pred] +pred_list = [val_pred, test_pred, hypers] pickle.dump(pred_list, open("pred_list.pkl", "wb")) pickle.dump(execution_feedback_str, open("execution_feedback_str.pkl", "wb")) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 1affe2381..d831ffa40 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -50,6 +50,9 @@ evolving_strategy_model_coder: 2. The user might provide you the failed former code and the corresponding feedback to the code. The feedback contains to the execution, the code and the model output value. You should analyze the feedback and try to correct the latest code. 3. The user might provide you the suggestion to the latest fail code and some similar fail to correct pairs. Each pair contains the fail code with similar error and the corresponding corrected version code. You should learn from these suggestion to write the correct code. + The user will also provide some information about how to organize the whole code and give instructions. 
These information are as below, and the code you implement should align the framework given below: + {{ spec }} + Your must write your code based on your former latest attempt below which consists of your former code and code feedback, you should read the former attempt carefully and must not modify the right part of your former code. {% if current_code is not none %} @@ -101,7 +104,7 @@ evolving_strategy_model_coder: evaluator_shape_feedback: system: |- - User is trying to evaluate whether a model output shape is correct not. The correct message about the ground truth shape is given in spec.md as below: + User is trying to evaluate whether a model output shape is correct or not. The correct message about the ground truth shape is given in spec.md as below: {{ spec }} The user will provide you the actual output of the model. The model is a part for solving a task in an given scenario. This model takes train dataset as input. Valid and test dataset are optional. The model workflow will generate prediction output of valid and test dataset. diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index b2218d4b2..04a4e025f 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -10,6 +10,8 @@ from rdagent.components.coder.data_science.model.exp import ModelTask, ModelFBWorkspace from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.data_science.model.es import ModelMultiProcessEvolvingStrategy # Take tasks, spec.md and feat as input, generate a feedback as output @@ -17,37 +19,47 @@ def develop_one_competition(competition: str): scen = DataScienceScen(competition=competition) model_coder = ModelCoSTEER(scen) - # Create the experiment + # Create the task mt = ModelTask( name="ModelTask", description="A CNN Model", architecture="\hat{y}_u = CNN(X_u)", variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", hyperparameters="...", - base_code="import pandas...", - ) - exp = ModelExperiment( - sub_tasks=[mt], + base_code="", + spec="", ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition - injected_file_names = ["spec.md", "load_data.py", "feat01.py", "model01.py"] - for file_name in injected_file_names: - file_path = tpl_ex_path / file_name - exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) + injected_file_names = ["spec/model.md", "load_data.py", "feat01.py", "model01.py"] - # Run the experiment - # exp = model_coder.develop(exp) - # test the evaluator - eva = ModelGeneralCaseSpecEvaluator(scen=scen) modelexp = ModelFBWorkspace() for file_name in injected_file_names: file_path = tpl_ex_path / file_name modelexp.inject_code(**{file_name: file_path.read_text()}) + mt.spec += modelexp.code_dict["spec/model.md"] + mt.base_code += modelexp.code_dict["model01.py"] + exp = ModelExperiment( + sub_tasks=[mt], + ) + + # Test the evaluator: + """eva = ModelGeneralCaseSpecEvaluator(scen=scen) exp.feedback = eva.evaluate(target_task=mt, queried_knowledge=None, implementation=modelexp, gt_implementation=None) - print("hello world") - print(exp.feedback) + print(exp.feedback)""" + + # Test the evolving strategy: + """es = 
ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + new_code = es.implement_one_task(target_task=mt, queried_knowledge=None) + print(new_code)""" + + # Run the experiment + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) + + exp = model_coder.develop(exp) if __name__ == "__main__": From 06e20431649e88f992d840b6442571b7485465f4 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 10 Dec 2024 04:36:22 +0000 Subject: [PATCH 028/304] data science scenario changes --- .../scenarios/data_science/scen/prompts.yaml | 3 - rdagent/scenarios/data_science/scen/scen.py | 45 +----- .../aerial-cactus-identification/spec.md | 131 ------------------ 3 files changed, 6 insertions(+), 173 deletions(-) delete mode 100644 rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index 006c01efc..d9f14d250 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -2,8 +2,5 @@ scen_desc: -| ------Background of the scenario------ {{scen.background}} - ------The source dataset you can use to generate the features------ - {{scen.source_data}} - ------The expected output & submission format specifications------ {{scen.submission_specifications}} diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 256893509..7471efea8 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -1,14 +1,8 @@ import json -from pathlib import Path -from typing import Literal from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.core.experiment import Task from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.kaggle.experiment.scenario import ( - prompt_dict as kaggle_prompt_dict, -) from rdagent.scenarios.kaggle.kaggle_crawler import ( crawl_descriptions, leaderboard_scores, @@ -28,8 +22,10 @@ class DataScienceScen(Scenario): def __init__(self, competition: str) -> None: self.competition = competition self.competition_descriptions = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) - self.leaderboard = leaderboard_scores(competition) - self.evaluation_metric_direction = float(self.leaderboard[0]) > float(self.leaderboard[-1]) + + leaderboard = leaderboard_scores(competition) + self.evaluation_metric_direction = float(leaderboard[0]) > float(leaderboard[-1]) + self._analysis_competition_description() def _analysis_competition_description(self): @@ -85,31 +81,6 @@ def background(self) -> str: ) return background_prompt - @property - def source_data(self) -> str: - # TODO: remove me if not used - # TODO: (bowen) - if Path(f"{DS_RD_SETTING.local_data_path}/{DS_RD_SETTING.competition}/cache").exists(): - # phase2: (cache detected) - # - Describe the cached data (preprocessed data). - pass - else: - # phase1: - # - If we have not implement load data and dump cache - # - describe the raw data - return self.competition_descriptions["Data Description"] - - return "!!!!!!!!! I'm the fake source data !!!!!!!!" - raise NotImplementedError(f"We are not sure how it is called. We place a exception here") - - def output_format(self, tag=None) -> str: - # TODO: remove me if not used - raise NotImplementedError(f"We are not sure how it is called. 
We place a exception here") - - def simulator(self, tag=None) -> str: - # TODO: remove me if not used - raise NotImplementedError(f"We are not sure how it is called. We place a exception here") - @property def rich_style_description(self) -> str: return f""" @@ -138,9 +109,5 @@ def rich_style_description(self) -> str: To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development. """ - def get_scenario_all_desc(self, task: Task | None = None, filtered_tag: str | None = None) -> str: - # TODO: remove me if not used - raise NotImplementedError(f"We are not sure how it is called. We place a exception here") - # if filtered_tag is None: - # return common_description() + interface(None) + output(None) + simulator(None) - # NOTE: we suggest such implementation: `return T(".prompts:scen_desc").r()` + def get_scenario_all_desc(self) -> str: + return T(".prompts:scen_desc").r(scen=self) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md deleted file mode 100644 index 696e2ffd7..000000000 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md +++ /dev/null @@ -1,131 +0,0 @@ -# Specification for Implementing a Kaggle Competition Project - -This document outlines the structure and interface protocols for implementing a machine learning project, similar to a Kaggle competition. Follow these guidelines to ensure consistency and maintainability across projects. - -## Project Structure - -The project should be organized into the following components: - -1. **Data Loading** (`load_data.py`): A module responsible for loading and preprocessing raw data. -2. **Feature Engineering**(`feat*.py`): A module for transforming raw data into features suitable for model training. -3. **Model Workflow**(`model*.py`): A module that manages the training, validation, and testing of machine learning models. -4. **Ensemble and Decision Making**(`ens.py`): A module for combining predictions from multiple models and making final decisions. -5. **Workflow**(`main.py`): A script to put the above component together to get the final submission(`submission.csv`) - -## Data Loading - -- Implement a function to load data from raw files. -- The function should return training images, training labels, test images, and test IDs. - -## Feature Engineering - -- Implement a function for feature engineering with the following signature: - -```python -def feature_eng(X: np.ndarray, y: np.ndarray | None = None, X_fit: np.ndarray | None = None, y_fit: np.ndarray | None = None, param: object | None = None) -> tuple[np.ndarray, np.ndarray | None, object]: - """ - Perform feature engineering on the input data. - - Parameters: - - X: np.ndarray - The input data to be transformed. - - y: np.ndarray | None - The target data. - - X_fit: np.ndarray | None - Data for fitting the transformation parameters. - - y_fit: np.ndarray | None - Target data for fitting. - - param: object | None - Pre-fitted parameters for transformation. - - Returns: - - transformed_data: np.ndarray - Transformed data. - - transformed_target: np.ndarray | None - Transformed target data. - - fitted_param: object - Fitted parameters. - """ -``` - -- Ensure that the feature engineering process is consistent and can be applied to both training and test data. 
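A short usage sketch of the `feature_eng` interface documented above: fit the transformation on the training split once, then re-apply the fitted parameters to the test split. The signature is the spec's; the input data here is random placeholder material.

```python
import numpy as np

X_train, y_train = np.random.rand(100, 8), np.random.randint(0, 2, 100)
X_test = np.random.rand(20, 8)

# fit on the training data and keep the fitted parameters
X_train_t, y_train_t, fitted_param = feature_eng(X_train, y_train)
# reuse the same parameters on the test data so both splits are transformed consistently
X_test_t, _, _ = feature_eng(X_test, X_fit=X_train, y_fit=y_train, param=fitted_param)
```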
- -## Model Workflow - -- Implement a function to manage the model workflow with the following signature: - -```python -def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params; dict = {}) -> tuple[np.ndarray | None, np.ndarray | None, dict]: - """ - Manages the workflow of a machine learning model, including training, validation. - The testing&validation's inference is included, as well - - - If test/valid exist, output inference on them - - Follow the hyperparameter if exists. - - the returned hyperparameter should align with the input(except the newly generated early stop) - - If valid exist, add to update the hyperparameter - - Parameters - ---------- - X : np.ndarray - Training data features. - y : np.ndarray - Training data labels. - val_X : np.ndarray, optional - Validation data features. - val_y : np.ndarray, optional - Validation data labels. - test_X : np.ndarray, optional - Test data features. - **hyper_params - Additional hyperparameters for the model. - - Returns - ------- - tuple[np.ndarray | None, np.ndarray | None, dict] - Predictions on the validation data, predictions on the test data - """ -``` -- In this task, the shape of output should be (batch_size, num_class), as num_class = 1 here. - -- The function should handle data augmentation, model creation, training, and prediction. - -## Ensemble and Decision Making - -- Implement a function for ensemble and decision making with the following signature: - -```python -def ens_and_decision(test_pred_l: list[np.ndarray], val_pred_l: list[np.ndarray], val_label: np.ndarray) -> np.ndarray: - """ - Handle the following: - 1) Ensemble predictions using a simple average. - 2) Make final decision after ensemble (convert the predictions to final binary form). - - Parameters - ---------- - test_pred_l : list[np.ndarray] - List of predictions on the test data. - val_pred_l : list[np.ndarray] - List of predictions on the validation data. - val_label : np.ndarray - True labels of the validation data. - - Returns - ------- - np.ndarray - Binary predictions on the test data. - """ -``` - -- The function should combine predictions and convert them to a binary format. - -## Submission - -- Implement a script to generate the submission file. -- The script should write predictions to a CSV file in the format required by the competition. - -## General Guidelines - -- Ensure that all modules and functions are well-documented. -- Follow consistent naming conventions and code style. -- Use type annotations for function signatures to improve code readability and maintainability. 
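Taken together, the spec above amounts to a short `main.py` pipeline. A rough sketch, assuming the component functions it defines (`load_data`, `feature_eng`, `model_workflow`, `ens_and_decision`); the hold-out split and the submission column names are illustrative only.

```python
import pandas as pd

train_X, train_y, test_X, test_ids = load_data()

X_fe, y_fe, param = feature_eng(train_X, train_y)
test_fe, _, _ = feature_eng(test_X, X_fit=train_X, y_fit=train_y, param=param)

n_val = len(X_fe) // 5                      # simple hold-out split for validation
val_X, val_y = X_fe[:n_val], y_fe[:n_val]
tr_X, tr_y = X_fe[n_val:], y_fe[n_val:]

val_pred, test_pred, _ = model_workflow(tr_X, tr_y, val_X=val_X, val_y=val_y, test_X=test_fe)
final_pred = ens_and_decision([test_pred], [val_pred], val_y)

pd.DataFrame({"id": test_ids, "has_cactus": final_pred.ravel()}).to_csv("submission.csv", index=False)
```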
From 4f76a0b7c01926d3ec2e7c38546ab144d3b6e963 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 10 Dec 2024 05:57:20 +0000 Subject: [PATCH 029/304] data science base file --- .../coder/data_science/ensemble/exp.py | 26 +++++++ .../coder/data_science/feature_process/exp.py | 26 +++++++ .../coder/data_science/workflow/exp.py | 26 +++++++ .../data_science/experiment/experiment.py | 69 ++++++++++--------- .../data_science/experiment/workspace.py | 1 + .../data_science/proposal/exp_gen.py | 12 ++-- 6 files changed, 123 insertions(+), 37 deletions(-) create mode 100644 rdagent/components/coder/data_science/ensemble/exp.py create mode 100644 rdagent/components/coder/data_science/feature_process/exp.py create mode 100644 rdagent/components/coder/data_science/workflow/exp.py diff --git a/rdagent/components/coder/data_science/ensemble/exp.py b/rdagent/components/coder/data_science/ensemble/exp.py new file mode 100644 index 000000000..f232cb801 --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/exp.py @@ -0,0 +1,26 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.utils import cache_with_pickle + + +class EnsembleTask(CoSTEERTask): + def __init__( + self, + name: str, + description: str, + spec: str, + **kwargs, + ) -> None: + pass + + @staticmethod + def from_dict(dict): + return EnsembleTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" \ No newline at end of file diff --git a/rdagent/components/coder/data_science/feature_process/exp.py b/rdagent/components/coder/data_science/feature_process/exp.py new file mode 100644 index 000000000..7ea67a140 --- /dev/null +++ b/rdagent/components/coder/data_science/feature_process/exp.py @@ -0,0 +1,26 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.utils import cache_with_pickle + + +class FeatureTask(CoSTEERTask): + def __init__( + self, + name: str, + description: str, + spec: str, + **kwargs, + ) -> None: + pass + + @staticmethod + def from_dict(dict): + return FeatureTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" \ No newline at end of file diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py new file mode 100644 index 000000000..5ae63f752 --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/exp.py @@ -0,0 +1,26 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.utils import cache_with_pickle + + +class WorkflowTask(CoSTEERTask): + def __init__( + self, + name: str, + description: str, + spec: str, + **kwargs, + ) -> None: + pass + + @staticmethod + def from_dict(dict): + return WorkflowTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" \ No newline at end of file diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 9499ce090..bd1ed14e5 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,14 +1,11 @@ -from copy import deepcopy -from 
pathlib import Path +from rdagent.core.experiment import Experiment, FBWorkspace + +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.components.coder.data_science.feature_process.exp import FeatureTask +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask -from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.components.coder.data_science.raw_data_loader.exp import ( - DataLoaderTask, -) -from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace, FactorTask -from rdagent.components.coder.model_coder.model import ModelFBWorkspace, ModelTask -from rdagent.core.experiment import Experiment -from rdagent.scenarios.data_science.experiment.workspace import FBWorkspace # KG_MODEL_TYPE_XGBOOST = "XGBoost" # KG_MODEL_TYPE_RANDOMFOREST = "RandomForest" @@ -35,8 +32,18 @@ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() +class EnsembleExperiment(Experiment[EnsembleTask, FBWorkspace, FBWorkspace]): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.experiment_workspace = FBWorkspace() -class ModelExperiment(Experiment[ModelTask, FBWorkspace, ModelFBWorkspace]): +class WorkflowExperiment(Experiment[WorkflowTask, FBWorkspace, FBWorkspace]): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.experiment_workspace = FBWorkspace() + + +class ModelExperiment(Experiment[ModelTask, FBWorkspace, FBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -60,27 +67,23 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: # ] -class FactorExperiment(Experiment[FactorTask, FBWorkspace, FactorFBWorkspace]): +class FeatureExperiment(Experiment[FeatureTask, FBWorkspace, FBWorkspace]): def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = FBWorkspace( - template_folder_path=Path(__file__).resolve() - / Path(DS_RD_SETTING.template_path).resolve() - / DS_RD_SETTING.competition - ) - if len(self.based_experiments) > 0: - self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) - self.experiment_workspace.data_description = deepcopy( - self.based_experiments[-1].experiment_workspace.data_description - ) - else: - self.experiment_workspace.data_description = [ - ( - FactorTask( - factor_name="Original features", - factor_description="The original features", - factor_formulation="", - ).get_task_information(), - source_feature_size, - ) - ] + self.experiment_workspace = FBWorkspace() + # if len(self.based_experiments) > 0: + # self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + # self.experiment_workspace.data_description = deepcopy( + # self.based_experiments[-1].experiment_workspace.data_description + # ) + # else: + # self.experiment_workspace.data_description = [ + # ( + # FactorTask( + # factor_name="Original features", + # factor_description="The original features", + # factor_formulation="", + # ).get_task_information(), + # source_feature_size, + # ) + # ] diff --git a/rdagent/scenarios/data_science/experiment/workspace.py 
b/rdagent/scenarios/data_science/experiment/workspace.py index 09d0142ea..ef52eceee 100644 --- a/rdagent/scenarios/data_science/experiment/workspace.py +++ b/rdagent/scenarios/data_science/experiment/workspace.py @@ -12,6 +12,7 @@ class DSFBWorkspace(FBWorkspace): # TODO: use the cache_with_pickle decorator. + # TODO: delete this, it is not used. def execute(self, run_env: dict = {}, *args, **kwargs) -> pd.DataFrame: """ Executes the experiment(a competition) within the specified workspace. diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index ff7d04d98..920475617 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -1,10 +1,14 @@ from argparse import ONE_OR_MORE from typing import Literal -from rdagent.components.coder.data_science.raw_data_loader.exp import ( - DataLoaderExperiment, - DataLoaderTask, -) +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.components.coder.data_science.feature_process.exp import FeatureTask +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask + +from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment, FeatureExperiment, ModelExperiment, EnsembleExperiment, WorkflowExperiment + from rdagent.components.proposal import LLMHypothesis2Experiment, LLMHypothesisGen from rdagent.core.experiment import Experiment from rdagent.core.proposal import ExpGen, Trace From 14325de6bb49cbbcdc6da113a7fe1dfcdd3ded48 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 10 Dec 2024 06:02:25 +0000 Subject: [PATCH 030/304] proposal related --- rdagent/components/coder/data_science/raw_data_loader/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 7ee8e198c..bd81e509e 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -1,7 +1,6 @@ # tess successfully running. # (GPT) if it aligns with the spec & rationality of the spec. 
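The experiment classes above all reduce to the same shape: a list of tasks plus a fresh `FBWorkspace` that collects whatever files the coder produces. An illustrative construction follows; the task fields and the injected file content are made up, and `FeatureTask.__init__` is still a placeholder at this point in the series.

```python
ft = FeatureTask(name="FeatureTask", description="Add simple aggregate features", spec="")
exp = FeatureExperiment(sub_tasks=[ft])

# the attached workspace is where generated code such as feat01.py ends up
exp.experiment_workspace.inject_code(**{"feat01.py": "# generated feature engineering code\n"})
```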
import json -from abc import abstractclassmethod from dataclasses import dataclass from os import system from rdagent.components.coder.CoSTEER.evaluators import ( From 70316108f8d1659a8376e3ebfbbf31de4bbf1afc Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 10 Dec 2024 07:36:19 +0000 Subject: [PATCH 031/304] proposal related --- .../data_science/experiment/experiment.py | 51 ++------- .../data_science/proposal/exp_gen.py | 100 +++++++++++++++--- rdagent/scenarios/kaggle/proposal/proposal.py | 2 +- 3 files changed, 93 insertions(+), 60 deletions(-) diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index bd1ed14e5..86017c357 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -32,58 +32,23 @@ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() -class EnsembleExperiment(Experiment[EnsembleTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, **kwargs) -> None: +class ModelExperiment(Experiment[ModelTask, FBWorkspace, FBWorkspace]): + def __init__(self, *args, **kwargs) -> None: # TODO: use previeous step workspace super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() -class WorkflowExperiment(Experiment[WorkflowTask, FBWorkspace, FBWorkspace]): +class FeatureExperiment(Experiment[FeatureTask, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() - -class ModelExperiment(Experiment[ModelTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: +class EnsembleExperiment(Experiment[EnsembleTask, FBWorkspace, FBWorkspace]): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - - # TODO: use previeous workspace self.experiment_workspace = FBWorkspace() - # if len(self.based_experiments) > 0: - # self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) - # self.experiment_workspace.data_description = deepcopy( - # self.based_experiments[-1].experiment_workspace.data_description - # ) - # else: - # self.experiment_workspace.data_description = [ - # ( - # FactorTask( - # factor_name="Original features", - # factor_description="The original features", - # factor_formulation="", - # ).get_task_information(), - # source_feature_size, - # ) - # ] - -class FeatureExperiment(Experiment[FeatureTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: +class WorkflowExperiment(Experiment[WorkflowTask, FBWorkspace, FBWorkspace]): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() - # if len(self.based_experiments) > 0: - # self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) - # self.experiment_workspace.data_description = deepcopy( - # self.based_experiments[-1].experiment_workspace.data_description - # ) - # else: - # self.experiment_workspace.data_description = [ - # ( - # FactorTask( - # factor_name="Original features", - # factor_description="The original features", - # factor_formulation="", - # ).get_task_information(), - # source_feature_size, - # ) - # ] + diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py 
b/rdagent/scenarios/data_science/proposal/exp_gen.py index 920475617..c21275f7d 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -11,15 +11,53 @@ from rdagent.components.proposal import LLMHypothesis2Experiment, LLMHypothesisGen from rdagent.core.experiment import Experiment -from rdagent.core.proposal import ExpGen, Trace +from rdagent.core.proposal import ExpGen, Trace, Hypothesis from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Workflow", "Ensemble"] +COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] ORDER = COMPONENT.__args__ +class DSHypothesis(Hypothesis): + def __init__( + self, + hypothesis: str, + reason: str, + concise_reason: str, + concise_observation: str, + concise_justification: str, + concise_knowledge: str, + action: COMPONENT, + ) -> None: + super().__init__( + hypothesis, reason, concise_reason, concise_observation, concise_justification, concise_knowledge + ) + self.action = action + + def __str__(self) -> str: + return f"""Chosen Action: {self.action} +Hypothesis: {self.hypothesis} +Reason: {self.reason} +Concise Reason & Knowledge: {self.concise_reason} +Concise Observation: {self.concise_observation} +Concise Justification: {self.concise_justification} +Concise Knowledge: {self.concise_knowledge} +""" + + +class DSHypothesisGen(LLMHypothesisGen): + def get_next_action(self, trace): + pass + + def prepare_context(self, trace): + pass + + def convert_response(self, response): + pass + + class DSExpGen(ExpGen): """Data Science Task Generator.""" @@ -35,29 +73,59 @@ def is_complete(self): def gen(self, trace: Trace) -> Experiment: if self.is_complete(): # proposal + design - pass + hypothesis: DSHypothesis = DSHypothesisGen(scen=self.scen).gen(trace) + # TODO: We can create subclasses for them if we need two components - LLMHypothesisGen - LLMHypothesis2Experiment + # LLMHypothesisGen + # LLMHypothesis2Experiment + if hypothesis.action == "DataLoadSpec": + pass + elif hypothesis.action == "FeatureEng": + pass + elif hypothesis.action == "Model": + pass + elif hypothesis.action == "Ensemble": + pass + elif hypothesis.action == "Workflow": + pass else: for o in ORDER: if o in self.complete_component: - # we already have the component, the skip + # we already have the component, then skip continue elif o == "DataLoadSpec": - # TODO return a description of the data loading task - # system = T(".prompts:DataLoaderSpec.system").r() - # user = T(".prompts:DataLoaderSpec.user").r() - # data_load_exp = APIBackend().build_messages_and_create_chat_completion( - # user_prompt=user, - # system_prompt=system, - # json_mode=True, - # ) dlt = DataLoaderTask(name="DataLoaderTask", description="") exp = DataLoaderExperiment( sub_tasks=[dlt], ) + self.complete_component.add(o) + return exp + elif o == "FeatureEng": + ft = FeatureTask(name="FeatureTask", description="") + exp = FeatureExperiment( + sub_tasks=[ft], + ) + self.complete_component.add(o) + return exp + elif o == "Model": + mt = ModelTask(name="ModelTask", description="") + exp = ModelExperiment( + sub_tasks=[mt], + ) + self.complete_component.add(o) + return exp + elif o == "Ensemble": + et = EnsembleTask(name="EnsembleTask", description="") + exp = EnsembleExperiment( + sub_tasks=[et], + ) + self.complete_component.add(o) + return exp + elif o == "Workflow": + wt = 
WorkflowTask(name="WorkflowTask", description="") + exp = WorkflowExperiment( + sub_tasks=[wt], + ) + self.complete_component.add(o) return exp - else: - ... # two components return super().gen(trace) diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 1e1cb4845..da6752777 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -233,7 +233,7 @@ def update_reward_estimates(self, trace: Trace) -> None: reward = (performance_t - performance_t_minus_1) / performance_t_minus_1 n_o = self.scen.action_counts[last_action] mu_o = self.scen.reward_estimates[last_action] - self.scen.scen.reward_estimates[last_action] += (reward - mu_o) / n_o + self.scen.reward_estimates[last_action] += (reward - mu_o) / n_o else: # First iteration, nothing to update pass From 075aa9ec28828364478e00fc0b66cbb10bf40674 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 11 Dec 2024 02:47:04 +0000 Subject: [PATCH 032/304] complete judge --- .../data_science/proposal/exp_gen.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index c21275f7d..13c2a92e4 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -29,15 +29,15 @@ def __init__( concise_observation: str, concise_justification: str, concise_knowledge: str, - action: COMPONENT, + component: COMPONENT, ) -> None: super().__init__( hypothesis, reason, concise_reason, concise_observation, concise_justification, concise_knowledge ) - self.action = action + self.component = component def __str__(self) -> str: - return f"""Chosen Action: {self.action} + return f"""Chosen Component: {self.component} Hypothesis: {self.hypothesis} Reason: {self.reason} Concise Reason & Knowledge: {self.concise_reason} @@ -61,17 +61,16 @@ def convert_response(self, response): class DSExpGen(ExpGen): """Data Science Task Generator.""" - def __init__(self, scen: Scenario) -> None: - self.complete_component: set[COMPONENT] = set() # Initialize as an empty set - super().__init__(scen) - - def is_complete(self): - """is all components complete""" - # TODO: place it into ExpGen - return self.complete_component == set(COMPONENT.__args__) - def gen(self, trace: Trace) -> Experiment: - if self.is_complete(): + def is_complete(): + """is all components complete""" + successful_components = set() + for h, _, hf in trace.hist: + if hf.decision: + successful_components.add(h.component) + return set(ORDER) == successful_components + + if is_complete(): # proposal + design hypothesis: DSHypothesis = DSHypothesisGen(scen=self.scen).gen(trace) From e2133a0bb9ba2e0db78b4357c006d944a66cb05f Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 11 Dec 2024 02:57:27 +0000 Subject: [PATCH 033/304] some changes --- rdagent/scenarios/data_science/proposal/exp_gen.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 13c2a92e4..78c6b4892 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -62,12 +62,13 @@ class DSExpGen(ExpGen): """Data Science Task Generator.""" def gen(self, trace: Trace) -> Experiment: + successful_components = set() + for h, _, hf in trace.hist: + if hf.decision: + 
successful_components.add(h.component) + def is_complete(): """is all components complete""" - successful_components = set() - for h, _, hf in trace.hist: - if hf.decision: - successful_components.add(h.component) return set(ORDER) == successful_components if is_complete(): @@ -89,7 +90,7 @@ def is_complete(): pass else: for o in ORDER: - if o in self.complete_component: + if o in successful_components: # we already have the component, then skip continue elif o == "DataLoadSpec": From ca7d7858dae96c69228f7ce2ee7b028210ff7e24 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 11 Dec 2024 09:43:58 +0000 Subject: [PATCH 034/304] simple readme for data loader costeer --- .../coder/data_science/raw_data_loader/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 rdagent/components/coder/data_science/raw_data_loader/README.md diff --git a/rdagent/components/coder/data_science/raw_data_loader/README.md b/rdagent/components/coder/data_science/raw_data_loader/README.md new file mode 100644 index 000000000..368a08980 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/README.md @@ -0,0 +1,15 @@ +# CoSTEER + +- subworkspace使用主experiment_workspace `RD-Agent/rdagent/scenarios/data_science/experiment/experiment.py` + +## evolving_strategy ( implement_one_task() ) + +1. xxxTask (in exp.py) + - spec + - description +2. + +## evaluator + +1. queried_knowledge部分 共用 +2. eval_test脚本 \ No newline at end of file From 9c89f4b7aa001102de88994339ce4d1d32f73bc1 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 03:11:38 +0000 Subject: [PATCH 035/304] proposal related --- .../data_science/proposal/exp_gen.py | 43 +++++++---- .../data_science/proposal/prompts.yaml | 72 ++++++++++++++++++- 2 files changed, 100 insertions(+), 15 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 78c6b4892..f14c1f00e 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -1,5 +1,6 @@ from argparse import ONE_OR_MORE from typing import Literal +import json from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.feature_process.exp import FeatureTask @@ -52,10 +53,30 @@ def get_next_action(self, trace): pass def prepare_context(self, trace): - pass - + hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) + + # TODO: how to generate sota solution + sota_solution = "" + hypothesis_specification = T(".prompts:hypothesis_specification").r(sota_solution=sota_solution) + + return { + "hypothesis_and_feedback": hypothesis_and_feedback, + # TODO: "RAG": "", + "hypothesis_output_format": T(".prompts:output_format.hypothesis").r(), + "hypothesis_specification": hypothesis_specification, + }, True + def convert_response(self, response): - pass + response_dict = json.loads(response) + return DSHypothesis( + hypothesis=response_dict.get("hypothesis", "Hypothesis not provided"), + reason=response_dict.get("reason", "Reason not provided"), + concise_reason=response_dict.get("concise_reason", "Concise reason not provided"), + concise_observation=response_dict.get("concise_observation", "Concise observation not provided"), + concise_justification=response_dict.get("concise_justification", "Concise justification not provided"), + concise_knowledge=response_dict.get("concise_knowledge", "Concise knowledge not provided"), + 
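`convert_response` above expects the LLM to answer with the JSON object described by the hypothesis output format. A made-up reply and the `DSHypothesis` it maps to, purely for illustration:

```python
import json

# every field value below is invented; only the keys follow the hypothesis output format
raw = """{
  "component": "Model",
  "hypothesis": "A deeper CNN will raise the validation AUC.",
  "reason": "Previous iterations underfit the training data.",
  "concise_reason": "Underfitting observed; add capacity.",
  "concise_observation": "Train and validation scores are both low.",
  "concise_justification": "More capacity reduces bias.",
  "concise_knowledge": "If train and validation scores are both low, increase model capacity."
}"""

d = json.loads(raw)
hypothesis = DSHypothesis(
    hypothesis=d["hypothesis"],
    reason=d["reason"],
    concise_reason=d["concise_reason"],
    concise_observation=d["concise_observation"],
    concise_justification=d["concise_justification"],
    concise_knowledge=d["concise_knowledge"],
    component=d["component"],
)
```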
component=response_dict.get("component", "Component not provided"), + ) class DSExpGen(ExpGen): @@ -74,19 +95,17 @@ def is_complete(): if is_complete(): # proposal + design hypothesis: DSHypothesis = DSHypothesisGen(scen=self.scen).gen(trace) - - # TODO: We can create subclasses for them if we need two components - # LLMHypothesisGen - # LLMHypothesis2Experiment - if hypothesis.action == "DataLoadSpec": + scenario = trace.scen.get_scenario_all_desc() + + if hypothesis.component == "DataLoadSpec": pass - elif hypothesis.action == "FeatureEng": + elif hypothesis.component == "FeatureEng": pass - elif hypothesis.action == "Model": + elif hypothesis.component == "Model": pass - elif hypothesis.action == "Ensemble": + elif hypothesis.component == "Ensemble": pass - elif hypothesis.action == "Workflow": + elif hypothesis.component == "Workflow": pass else: for o in ORDER: diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 8340d5883..0fbdc7292 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -1,3 +1,69 @@ -DataLoaderSpec: - system: -| - user: -| \ No newline at end of file +hypothesis_and_feedback: |- + {% for hypothesis, experiment, feedback in trace.hist[-10:] %} + Hypothesis {{ loop.index }}: {{ hypothesis }} + Observation on the result with the hypothesis: {{ feedback.observations }} + Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} + Did changing to this hypothesis work? (focus on the change): {{ feedback.decision }} + {% endfor %} + +hypothesis_specification: |- + Hypothesis should avoid being too general and vague, and should be specific and actionable. For example, hypothesis like 'tune a model' is too general, while hypothesis like 'increase the learning rate to 0.1 of the lightgbm model will improve the performance' is specific and actionable. + Your hypothesis should based on current SOTA solution. The user will conduct experiments based on the SOTA solution to test whether your hypothesis is right on this specific competition. + {{ sota_solution}} + +output_format: + hypothesis: |- + The output should follow JSON format. The schema is as follows: + { + "component": "If "hypothesis_specification" provides the component you need to take, please follow "hypothesis_specification" to choose the component. Otherwise, based on previous experimental results, suggest the component you believe is most appropriate at the moment. It should be one of ["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"]", + "hypothesis": "The new hypothesis generated based on the information provided.", + "reason": "The reason why you generate this hypothesis. It should be comprehensive and logical. It should cover the other keys below and extend them.", + "concise_reason": "Two-line summary. First line focuses on a concise justification for the change. Second line generalizes a knowledge statement.", + "concise_observation": "One line summary. It focuses on the observation of the given scenario, data characteristics, or previous experiences (failures & succeses).", + "concise_justification": "One line summary. Justify the hypothesis based on theoretical principles or initial assumptions.", + "concise_knowledge": "One line summary. Transferable knowledge based on theoretical principles. Use conditional grammar. eg. "If...., ..; When..., .; and etc" Make sure that you state things clearly without ambiguity. Eg. 
avoid saying "previous hypothesis", because one wouldn't know what that is." + } + data_loader: |- + + feature: |- + According to the hypothesis, please help user design one or more feature engineering tasks. + The output should follow JSON format. The schema is as follows: + { + "factor or group name 1": { + "description": "description of factor or group name 1", + "formulation": "latex formulation of factor or group name 1", + "variables": { + "variable or function name 1": "description of variable or function 1", + "variable or function name 2": "description of variable or function 2" + } + }, + "factor or group name 2": { + "description": "description of factor or group name 2", + "formulation": "latex formulation of factor or group name 2", + "variables": { + "variable or function name 1": "description of variable or function 1", + "variable or function name 2": "description of variable or function 2" + } + } + # Don't add ellipsis (...) or any filler text that might cause JSON parsing errors here! + } + model: |- + According to the hypothesis, please help user design one model task. + We only build one model from four main model types: ["XGBoost", "RandomForest", "LightGBM", "NN"]. + The output should follow JSON format. The schema is as follows: + { + "model_name": "model_name", + "description": "A detailed description of the model", + "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", + "hyperparameters": { + "hyperparameter_name_1": "value of hyperparameter 1", + "hyperparameter_name_2": "value of hyperparameter 2", + "hyperparameter_name_3": "value of hyperparameter 3" + }, + "model_type": "Please select only **one** model type from the following four options: XGBoost, RandomForest, LightGBM, or NN. The selected model must be unique and used as the **primary model**. You may choose an auxiliary model for support or optimization on specific tasks if necessary, but the primary model must come from the provided options." + + } + Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. + ensemble: |- + + workflow: |- From 3352c99402e09f7bc633175c80666dcd91a3c0b1 Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 12 Dec 2024 04:39:16 +0000 Subject: [PATCH 036/304] Draft for Bowen --- .../components/coder/CoSTEER/evaluators.py | 49 ++++++++++++++++++- .../coder/CoSTEER/evolving_agent.py | 4 +- .../coder/CoSTEER/knowledge_management.py | 4 +- .../coder/data_science/model/eval.py | 4 +- .../data_science/raw_data_loader/eval.py | 24 ++------- .../coder/factor_coder/evaluators.py | 4 +- .../coder/model_coder/evaluators.py | 4 +- 7 files changed, 61 insertions(+), 32 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index aee25eb23..048a95971 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -1,4 +1,5 @@ from abc import abstractmethod +from dataclasses import dataclass from typing import List from rdagent.components.coder.CoSTEER.evolvable_subjects import EvolvingItem @@ -11,10 +12,48 @@ from rdagent.log import rdagent_logger as logger +# TODO: +# 1. It seems logically sound, but we currently lack a scenario to apply it. +# 2. If it proves to be useful, relocate it to a more general location. +# +# class FBWorkspaceExeFeedback(Feedback): +# """ +# It pairs with FBWorkspace in the abstract level. 
+# """ +# # ws: FBWorkspace # potential +# stdout: str + + +@dataclass class CoSTEERSingleFeedback(Feedback): # TODO: (xiao) - # it should be a subclass of FBWorkspaceExeFeedback + # it should be more general class for FBWorkspaceExeFeedback # A better name of it may be NormalFeedback + # TODO: It should be a general feeddback for CoSTEERR + """ + The feedback for the data loader evaluation. + It is design align the phases of the implemented code + - Execution -> Return Value -> Code -> Final Decision + """ + execution: str + # execution_feedback + return_checking: str | None # inlucding every check in the testing (constraints about the generated value) + # value_feedback, shape_feedback, value_generated_flag + code: str + final_decision: bool + + def __str__(self) -> str: + return f"""------------------Execution------------------ +{self.execution} +------------------Return Checking------------------ +{self.return_checking if self.return_checking is not None else 'No return checking'} +------------------Code------------------ +{self.code} +------------------Final Decision------------------ +This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}. +""" + +class CoSTEERSingleFeedbackDeprecated(CoSTEERSingleFeedback): """This class is a base class for all code generator feedback to single implementation""" def __init__( @@ -29,7 +68,6 @@ def __init__( final_decision_based_on_gt: bool = None, ) -> None: self.execution_feedback = execution_feedback - self.shape_feedback = shape_feedback self.code_feedback = code_feedback self.value_feedback = value_feedback self.final_decision = final_decision @@ -37,6 +75,13 @@ def __init__( self.value_generated_flag = value_generated_flag self.final_decision_based_on_gt = final_decision_based_on_gt + # TODO: + # Not general enough. So we should not put them in the general costeer feedback + # Instead, we should create subclass for it. + self.shape_feedback = shape_feedback # Not general enough. 
So + + # TODO: @property + def __str__(self) -> str: return f"""------------------Execution Feedback------------------ {self.execution_feedback if self.execution_feedback is not None else 'No execution feedback'} diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index 70c097290..ece9f85c3 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -1,4 +1,4 @@ -from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedback +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedbackDeprecated from rdagent.components.coder.CoSTEER.evolvable_subjects import EvolvingItem from rdagent.core.evolving_agent import RAGEvoAgent from rdagent.core.evolving_framework import EvolvableSubjects @@ -6,7 +6,7 @@ class FilterFailedRAGEvoAgent(RAGEvoAgent): def filter_evolvable_subjects_by_feedback( - self, evo: EvolvableSubjects, feedback: CoSTEERSingleFeedback + self, evo: EvolvableSubjects, feedback: CoSTEERSingleFeedbackDeprecated ) -> EvolvableSubjects: assert isinstance(evo, EvolvingItem) assert isinstance(feedback, list) diff --git a/rdagent/components/coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/CoSTEER/knowledge_management.py index d67215e56..5bd4d3a94 100644 --- a/rdagent/components/coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/CoSTEER/knowledge_management.py @@ -11,7 +11,7 @@ from jinja2 import Environment, StrictUndefined from rdagent.components.coder.CoSTEER.config import CoSTEERSettings -from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedback +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedbackDeprecated from rdagent.components.knowledge_management.graph import ( UndirectedGraph, UndirectedNode, @@ -242,7 +242,7 @@ def generate_knowledge( target_task = implementations.sub_tasks[task_index] target_task_information = target_task.get_task_information() implementation = implementations.sub_workspace_list[task_index] - single_feedback: CoSTEERSingleFeedback = feedback[task_index] + single_feedback: CoSTEERSingleFeedbackDeprecated = feedback[task_index] if implementation is None or single_feedback is None: continue single_knowledge = CoSTEERKnowledge( diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 5bf6d0907..05b511bb6 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -6,7 +6,7 @@ from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, - CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, ) from rdagent.components.coder.data_science.model.eva_utils import ( ModelCodeEvaluator, @@ -17,7 +17,7 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import Task, Workspace -ModelSingleFeedback = CoSTEERSingleFeedback +ModelSingleFeedback = CoSTEERSingleFeedbackDeprecated ModelMultiFeedback = CoSTEERMultiFeedback diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index bd81e509e..31cb6db68 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -7,6 +7,7 @@ CoSTEEREvaluator, CoSTEERMultiFeedback, CoSTEERSingleFeedback, + 
CoSTEERSingleFeedbackDeprecated, ) from rdagent.core.evaluation import Feedback from rdagent.core.evolving_framework import QueriedKnowledge @@ -18,24 +19,7 @@ DIRNAME = Path(__file__).absolute().resolve().parent -# TODO: -# 1. It seems logically sound, but we currently lack a scenario to apply it. -# 2. If it proves to be useful, relocate it to a more general location. -# -# class FBWorkspaceExeFeedback(Feedback): -# """ -# It pairs with FBWorkspace in the abstract level. -# """ -# # ws: FBWorkspace # potential -# stdout: str - - -@dataclass -class DataLoaderEvalFeedback(Feedback): - execution: str - checking: str # inlucding every check in the testing (constraints about the generated value) - code: str - final_decision: bool +DataLoaderEvalFeedback = CoSTEERSingleFeedback class DataLoaderCoSTEEREvaluator(CoSTEEREvaluator): @@ -47,14 +31,14 @@ def evaluate( gt_implementation: FBWorkspace, queried_knowledge: QueriedKnowledge = None, **kwargs, - ) -> CoSTEERSingleFeedback: + ) -> CoSTEERSingleFeedbackDeprecated: target_task_information = target_task.get_task_information() if (queried_knowledge is not None and target_task_information in queried_knowledge.success_task_to_knowledge_dict): return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: - return CoSTEERSingleFeedback( + return CoSTEERSingleFeedbackDeprecated( execution_feedback="This task has failed too many times, skip implementation.", shape_feedback="This task has failed too many times, skip implementation.", value_feedback="This task has failed too many times, skip implementation.", diff --git a/rdagent/components/coder/factor_coder/evaluators.py b/rdagent/components/coder/factor_coder/evaluators.py index c45a6c733..6b5b402d5 100644 --- a/rdagent/components/coder/factor_coder/evaluators.py +++ b/rdagent/components/coder/factor_coder/evaluators.py @@ -3,7 +3,7 @@ from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, - CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, ) from rdagent.components.coder.factor_coder.eva_utils import ( FactorCodeEvaluator, @@ -14,7 +14,7 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import Workspace -FactorSingleFeedback = CoSTEERSingleFeedback +FactorSingleFeedback = CoSTEERSingleFeedbackDeprecated FactorMultiFeedback = CoSTEERMultiFeedback diff --git a/rdagent/components/coder/model_coder/evaluators.py b/rdagent/components/coder/model_coder/evaluators.py index a311ded81..170039ab4 100644 --- a/rdagent/components/coder/model_coder/evaluators.py +++ b/rdagent/components/coder/model_coder/evaluators.py @@ -1,7 +1,7 @@ from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, - CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, ) from rdagent.components.coder.model_coder.eva_utils import ( ModelCodeEvaluator, @@ -13,7 +13,7 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import Task, Workspace -ModelSingleFeedback = CoSTEERSingleFeedback +ModelSingleFeedback = CoSTEERSingleFeedbackDeprecated ModelMultiFeedback = CoSTEERMultiFeedback From 318e457834dcf1629a05aed3257492e1a110308a Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 04:49:09 +0000 Subject: [PATCH 037/304] add property --- rdagent/components/coder/CoSTEER/evaluators.py | 13 +++++++++++++ 1 file changed, 13 
insertions(+) diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index 048a95971..dda96d41b 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -81,6 +81,19 @@ def __init__( self.shape_feedback = shape_feedback # Not general enough. So # TODO: @property + @property + def execution(self): + return self.execution_feedback + + @property + def return_checking(self): + if self.value_generated_flag: + return f"value feedback: {self.value_feedback}\n\nshape feedback: {self.shape_feedback}" + return None + + @property + def code(self): + return self.code_feedback def __str__(self) -> str: return f"""------------------Execution Feedback------------------ From a26bdb4c23cb65da5350c878459df7f2bd9e6186 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 04:55:50 +0000 Subject: [PATCH 038/304] fix knowledgemn --- .../coder/CoSTEER/knowledge_management.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/CoSTEER/knowledge_management.py index 5bd4d3a94..93120aec7 100644 --- a/rdagent/components/coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/CoSTEER/knowledge_management.py @@ -11,7 +11,7 @@ from jinja2 import Environment, StrictUndefined from rdagent.components.coder.CoSTEER.config import CoSTEERSettings -from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedbackDeprecated +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedback from rdagent.components.knowledge_management.graph import ( UndirectedGraph, UndirectedNode, @@ -242,7 +242,7 @@ def generate_knowledge( target_task = implementations.sub_tasks[task_index] target_task_information = target_task.get_task_information() implementation = implementations.sub_workspace_list[task_index] - single_feedback: CoSTEERSingleFeedbackDeprecated = feedback[task_index] + single_feedback: CoSTEERSingleFeedback = feedback[task_index] if implementation is None or single_feedback is None: continue single_knowledge = CoSTEERKnowledge( @@ -269,15 +269,15 @@ def generate_knowledge( else: # generate error node and store into knowledge base error_analysis_result = [] - if not single_feedback.value_generated_flag: + if single_feedback.return_checking: error_analysis_result = self.analyze_error( - single_feedback.execution_feedback, - feedback_type="execution", + single_feedback.return_checking, + feedback_type="value", ) else: error_analysis_result = self.analyze_error( - single_feedback.value_feedback, - feedback_type="value", + single_feedback.execution, + feedback_type="execution", ) self.knowledgebase.working_trace_error_analysis.setdefault( target_task_information, From 3c0883dd0e4b1be5da0a3cdea29f0f98500781ac Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 05:19:02 +0000 Subject: [PATCH 039/304] fix feedback prompt --- .../components/coder/data_science/raw_data_loader/prompts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 7f0a8bb5b..fd08bdb7c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -114,7 +114,7 @@ data_loader_eval: ```json { "execution": "Describe how well the data 
loader executed, including any errors or issues encountered.", - "checking": "Detail the checks performed on the data loaded, including data integrity and correctness.", + "return_checking": "Detail the checks performed on the data loaded, including data integrity and correctness.", "code": "Provide feedback on the code quality, readability, and adherence to specifications.", "final_decision": } From bb4a0a473810efef8ef9f978f2a022e7e0409adb Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 05:58:10 +0000 Subject: [PATCH 040/304] fix feedback bug --- rdagent/components/coder/CoSTEER/evaluators.py | 1 + rdagent/components/coder/CoSTEER/knowledge_management.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index dda96d41b..ffb3d533d 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -41,6 +41,7 @@ class CoSTEERSingleFeedback(Feedback): # value_feedback, shape_feedback, value_generated_flag code: str final_decision: bool + final_decision_based_on_gt: bool | None = None def __str__(self) -> str: return f"""------------------Execution------------------ diff --git a/rdagent/components/coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/CoSTEER/knowledge_management.py index 93120aec7..7d05e32bb 100644 --- a/rdagent/components/coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/CoSTEER/knowledge_management.py @@ -425,8 +425,8 @@ def former_trace_query( current_index = 1 while current_index < len(former_trace_knowledge): if ( - not former_trace_knowledge[current_index].feedback.value_generated_flag - and former_trace_knowledge[current_index - 1].feedback.value_generated_flag + not former_trace_knowledge[current_index].feedback.return_checking + and former_trace_knowledge[current_index - 1].feedback.return_checking ): former_trace_knowledge.pop(current_index) else: From 67be799e40d67e2f27e1a48052cc46160d44d9c5 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 08:05:39 +0000 Subject: [PATCH 041/304] fix json_mode bug --- .../coder/data_science/raw_data_loader/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index f51ce5057..6147cb79a 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -62,11 +62,11 @@ def implement_one_task( spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) - data_loader_spec = json.loads(spec_session.build_chat_completion(user_prompt=data_loader_prompt))["spec"] - feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt))["spec"] - model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt))["spec"] - ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt))["spec"] - workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt))["spec"] + data_loader_spec = json.loads(spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True))["spec"] + feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, json_mode=True))["spec"] + model_spec = 
json.loads(spec_session.build_chat_completion(user_prompt=model_prompt, json_mode=True))["spec"] + ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))["spec"] + workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))["spec"] # 2. code system_prompt = T(".prompts:data_loader_coder.system").r() From d24ba791a542ffb90383b3d8909ede571dd208f3 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 12 Dec 2024 09:33:22 +0000 Subject: [PATCH 042/304] feature processing --- .../coder/data_science/feature/__init__.py | 92 +++++++++++++++++++ .../coder/data_science/feature/eval.py | 61 ++++++++++++ .../feature/eval_tests/feature_test.py | 40 ++++++++ .../coder/data_science/feature/exp.py | 46 ++++++++++ .../coder/data_science/feature/prompts.yaml | 40 ++++++++ .../coder/data_science/feature/test.py | 43 +++++++++ .../data_science/feature_process/__init__.py | 19 ---- .../coder/data_science/feature_process/exp.py | 26 ------ .../data_science/experiment/experiment.py | 2 +- .../data_science/proposal/exp_gen.py | 2 +- 10 files changed, 324 insertions(+), 47 deletions(-) create mode 100644 rdagent/components/coder/data_science/feature/__init__.py create mode 100644 rdagent/components/coder/data_science/feature/eval.py create mode 100644 rdagent/components/coder/data_science/feature/eval_tests/feature_test.py create mode 100644 rdagent/components/coder/data_science/feature/exp.py create mode 100644 rdagent/components/coder/data_science/feature/prompts.yaml create mode 100644 rdagent/components/coder/data_science/feature/test.py delete mode 100644 rdagent/components/coder/data_science/feature_process/__init__.py delete mode 100644 rdagent/components/coder/data_science/feature_process/exp.py diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py new file mode 100644 index 000000000..5bfdf93f0 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -0,0 +1,92 @@ +# from rdagent.components.coder.CoSTEER import CoSTEER +# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +# from rdagent.core.scenario import Scenario + + +# class FeatureCoSTEER(CoSTEER): +# def __init__( +# self, +# scen: Scenario, +# *args, +# **kwargs, +# ) -> None: +# eva = CoSTEERMultiEvaluator( +# FeatureCoSTEEREvaluator(scen=scen), scen=scen +# ) # Please specify whether you agree running your eva in parallel or not +# es = FeatureMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + +# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) + +import json + +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) +from rdagent.components.coder.data_science.raw_data_loader.eval import ( + DataLoaderCoSTEEREvaluator, +) +from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl 
import T + +class FeatureMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: FeatureTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + ) -> dict[str, str]: + # return a workspace with "load_data.py", "spec/load_data.md" inside + # assign the implemented code to the new workspace. + competition_info = self.scen.competition_descriptions + + # 2. code + system_prompt = T(".prompts:feature.system").r() + user_prompt = T(".prompts:feature.user").r( + competition_info=competition_info, feature_spec=target_task.spec + ) + + feature_code = json.loads( + APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True) + )["code"] + + return { + "feat01.py": feature_code, + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. + """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_code(**code_list[index]) + return evo + +class FeatureCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + DataLoaderCoSTEEREvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + es = FeatureMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py new file mode 100644 index 000000000..f363e27f4 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -0,0 +1,61 @@ +from rdagent.core.evolving_framework import QueriedKnowledge + +import json +from dataclasses import dataclass +from os import system +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERMultiFeedback, + CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, +) + +from rdagent.utils.env import DSDockerConf, DockerEnv +from rdagent.core.experiment import FBWorkspace, Task, Workspace +from pathlib import Path +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + +DIRNAME = Path(__file__).absolute().resolve().parent + +FeatureEvalFeedback = CoSTEERSingleFeedback + +class FeatureCoSTEEREvaluator(CoSTEEREvaluator): + + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> CoSTEERSingleFeedbackDeprecated: + + target_task_information = target_task.get_task_information() + if (queried_knowledge is not None and + target_task_information in queried_knowledge.success_task_to_knowledge_dict): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return CoSTEERSingleFeedbackDeprecated( + execution_feedback="This task has failed too many times, skip implementation.", + 
shape_feedback="This task has failed too many times, skip implementation.", + value_feedback="This task has failed too many times, skip implementation.", + code_feedback="This task has failed too many times, skip implementation.", + final_feedback="This task has failed too many times, skip implementation.", + final_decision=False, + ) + + de = DockerEnv(conf=DSDockerConf()) + + # TODO: do we need to clean the generated tempory content? + fname = "feature_test.py" + with (DIRNAME / "eval_tests" / "feature_test.py").open("r") as f: + test_code = f.read() + implementation.inject_code(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") + + system_prompt = T(".prompts:feature.system").r(test_code=test_code) + user_prompt = T(".prompts:feature.user").r(stdout=stdout) + + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return FeatureEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py new file mode 100644 index 000000000..ef80fcdd7 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -0,0 +1,40 @@ +""" +A qualified data loader should support following features +- successfully run +- len(test) == len(test_ids) == submission length +- len(train) == len(y) + +Please make sure the stdout is rich enough to support informative feedback +""" +import pickle +import logging +from feat01 import feature_eng + + + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Load data +from load_data import load_from_raw_data +from sklearn.model_selection import train_test_split +X, y, X_test, test_ids = load_from_raw_data() + +X, y, X_param = feat_eng(X, y) +X_test, _, _ = feat_eng(X_test, param=X_param) + + +# Validate the conditions mentioned in the docstring +assert len(X_test) == len(test_ids), "Mismatch in length of test images and test IDs" +assert len(X) == len(y), "Mismatch in length of training images and labels" +# Check for missing values +assert not X.isnull().values.any(), "Missing values found in training data" +assert not X_test.isnull().values.any(), "Missing values found in test data" +assert not y.isnull().values.any(), "Missing values found in labels" + +logging.info("Data loader test passed successfully. 
Length of test images matches length of test IDs.") + +with open('data.pkl', 'wb') as f: + pickle.dump((X, y, X_test, test_ids), f) + + diff --git a/rdagent/components/coder/data_science/feature/exp.py b/rdagent/components/coder/data_science/feature/exp.py new file mode 100644 index 000000000..fec5ac681 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/exp.py @@ -0,0 +1,46 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.utils import cache_with_pickle + + +class FeatureTask(CoSTEERTask): + def __init__( + self, + name: str, + description: str, + spec: str, + variables: dict = {}, + implementation: bool = False, + **kwargs, + ) -> None: + self.variables: dict = variables + self.spec: str = spec + self.implementation: bool = implementation + super().__init__(name=name, description=description, **kwargs) + + def get_task_information(self): + return f"""name: {self.name} +description: {self.description} +variables: {str(self.variables)} +spec: {self.spec}""" + + def get_task_information_and_implementation_result(self): + return { + "name": self.factor_name, + "description": self.factor_description, + "variables": str(self.variables), + "spec": self.spec, + "implementation": str(self.implementation), + } + + @staticmethod + def from_dict(dict): + return FeatureTask(**dict) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" \ No newline at end of file diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml new file mode 100644 index 000000000..ae5a29b36 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -0,0 +1,40 @@ +feature: + system: |- + You are a Python data scientist working on a new project. This project involves implementing feature engineering techniques to prepare data for machine learning models, and this project code will be written by GPT. + Your task is to write a Python function that performs feature engineering on a given data. + You should follow the provided specifications to complete this task. + + Please response the code in the following json format. Here is an example structure for the JSON output: + { + "code": "The Python code as a string." + } + ``` + user: |- + ---------Competition Information--------- + {{ competition_info }} + + ---------Feature Processing Specification--------- + {{ feature_spec }} + + +feature_eval: + system: |- + You are data scientist. + You are testing the feature processing with the following code + ```python + {{test_code}} + ``` + You'll be given the stdout of your testing scripts. 
+ Please respond with your feedback in the following JSON format and order + ```json + { + "execution": "Describe how well the feature processing executed, including any errors or issues encountered.", + "return_checking": "Detail the checks performed on the data after feature processing, including data integrity and correctness.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "final_decision": + } + ``` + user: |- + ``` + {{stdout}} + ``` \ No newline at end of file diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py new file mode 100644 index 000000000..e353f5801 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/test.py @@ -0,0 +1,43 @@ +""" +Helper functions for testing the feature coder(CoSTEER-based) component. +- Does the developer loop work correctly + +It is NOT: +- it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) +""" + +from rdagent.components.coder.data_science.feature import FeatureCoSTEER + +from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.scenarios.data_science.experiment.experiment import FeatureExperiment +from rdagent.scenarios.data_science.scen import DataScienceScen + +import pickle +# from rdagent.components.coder.data_science.feature.es import ModelMultiProcessEvolvingStrategy + + + +def develop_one_competition(competition: str): # -> experiment + scen = DataScienceScen(competition=competition) + feature_coder = FeatureCoSTEER(scen) + + with open('/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md', 'r') as file: + feat_spec = file.read() + + # Create the experiment + ft = FeatureTask(name="FeatureTask", description=scen.competition_descriptions, spec=feat_spec) + exp = FeatureExperiment( + sub_tasks=[ft], + ) + + with open('/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py', 'r') as file: + load_data_code = file.read() + exp.experiment_workspace.inject_code(**{"load_data.py": load_data_code}) + + + # Develop the experiment + exp = feature_coder.develop(exp) + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") diff --git a/rdagent/components/coder/data_science/feature_process/__init__.py b/rdagent/components/coder/data_science/feature_process/__init__.py deleted file mode 100644 index 68a1ee6b7..000000000 --- a/rdagent/components/coder/data_science/feature_process/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class FeatureCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# FeatureCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = FeatureMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/feature_process/exp.py b/rdagent/components/coder/data_science/feature_process/exp.py deleted file mode 100644 index 7ea67a140..000000000 --- 
a/rdagent/components/coder/data_science/feature_process/exp.py +++ /dev/null @@ -1,26 +0,0 @@ -import pickle -import site -import traceback -from pathlib import Path -from typing import Dict, Optional - -from rdagent.components.coder.CoSTEER.task import CoSTEERTask -from rdagent.core.utils import cache_with_pickle - - -class FeatureTask(CoSTEERTask): - def __init__( - self, - name: str, - description: str, - spec: str, - **kwargs, - ) -> None: - pass - - @staticmethod - def from_dict(dict): - return FeatureTask(**dict) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" \ No newline at end of file diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 86017c357..ab75e81c1 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,7 +1,7 @@ from rdagent.core.experiment import Experiment, FBWorkspace from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask -from rdagent.components.coder.data_science.feature_process.exp import FeatureTask +from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index f14c1f00e..313acc80d 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -3,7 +3,7 @@ import json from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask -from rdagent.components.coder.data_science.feature_process.exp import FeatureTask +from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask From 7d7cab8f4dfd2e318363052c972eb10919030be2 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 09:51:32 +0000 Subject: [PATCH 043/304] fix execute data volume problem --- .../components/coder/data_science/raw_data_loader/eval.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 31cb6db68..161669fcc 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -16,7 +16,7 @@ from rdagent.utils.agent.tpl import T from rdagent.utils.env import DSDockerConf, DockerEnv from pathlib import Path - +from rdagent.app.data_science.conf import DS_RD_SETTING DIRNAME = Path(__file__).absolute().resolve().parent DataLoaderEvalFeedback = CoSTEERSingleFeedback @@ -47,7 +47,11 @@ def evaluate( final_decision=False, ) - de = DockerEnv(conf=DSDockerConf()) + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/{DS_RD_SETTING.competition}": "/kaggle/input" + } + de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated tempory content? 
fname = "data_loader_test.py" From 57293b1e8a4049c2632065e5ecfe4bdef77dbcbe Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 12 Dec 2024 23:50:06 +0000 Subject: [PATCH 044/304] proposal related --- .../data_science/proposal/exp_gen.py | 114 ++++++++++++------ .../data_science/proposal/prompts.yaml | 71 +++++++++-- rdagent/scenarios/kaggle/prompts.yaml | 10 -- 3 files changed, 138 insertions(+), 57 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 313acc80d..5f56ed681 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -10,7 +10,6 @@ from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment, FeatureExperiment, ModelExperiment, EnsembleExperiment, WorkflowExperiment -from rdagent.components.proposal import LLMHypothesis2Experiment, LLMHypothesisGen from rdagent.core.experiment import Experiment from rdagent.core.proposal import ExpGen, Trace, Hypothesis from rdagent.core.scenario import Scenario @@ -47,38 +46,6 @@ def __str__(self) -> str: Concise Knowledge: {self.concise_knowledge} """ - -class DSHypothesisGen(LLMHypothesisGen): - def get_next_action(self, trace): - pass - - def prepare_context(self, trace): - hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) - - # TODO: how to generate sota solution - sota_solution = "" - hypothesis_specification = T(".prompts:hypothesis_specification").r(sota_solution=sota_solution) - - return { - "hypothesis_and_feedback": hypothesis_and_feedback, - # TODO: "RAG": "", - "hypothesis_output_format": T(".prompts:output_format.hypothesis").r(), - "hypothesis_specification": hypothesis_specification, - }, True - - def convert_response(self, response): - response_dict = json.loads(response) - return DSHypothesis( - hypothesis=response_dict.get("hypothesis", "Hypothesis not provided"), - reason=response_dict.get("reason", "Reason not provided"), - concise_reason=response_dict.get("concise_reason", "Concise reason not provided"), - concise_observation=response_dict.get("concise_observation", "Concise observation not provided"), - concise_justification=response_dict.get("concise_justification", "Concise justification not provided"), - concise_knowledge=response_dict.get("concise_knowledge", "Concise knowledge not provided"), - component=response_dict.get("component", "Component not provided"), - ) - - class DSExpGen(ExpGen): """Data Science Task Generator.""" @@ -93,16 +60,89 @@ def is_complete(): return set(ORDER) == successful_components if is_complete(): - # proposal + design - hypothesis: DSHypothesis = DSHypothesisGen(scen=self.scen).gen(trace) + # base info scenario = trace.scen.get_scenario_all_desc() + hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) + # 1. 
hypothesis gen + # TODO: how to generate sota solution + sota_solution = "" + system_prompt = T(".prompts:hypothesis_gen.system").r( + targets="data science project", + scenario=scenario, + hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), + hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), + ) + user_prompt = T(".prompts:hypothesis_gen.user").r( + targets="data science project", + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)) + hypothesis = DSHypothesis( + hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), + reason=resp_dict.get("reason", "Reason not provided"), + concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"), + concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"), + concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"), + concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), + component=resp_dict.get("component", "Component not provided"), + ) + + # 2. gen experiment if hypothesis.component == "DataLoadSpec": pass elif hypothesis.component == "FeatureEng": - pass + # TODO: RAG + feature_task_output_format = T(".prompts:output_format.feature").r() + + system_prompt = T(".prompts:hypothesis2task.system").r( + targets="Feature Engineering", + scenario=scenario, + task_output_format=feature_task_output_format, + ) + user_prompt = T(".prompts:hypothesis2task.user").r( + targets="Feature Engineering", + target_hypothesis=str(hypothesis), + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + tasks = [] + for fn in resp_dict: + ft = FeatureTask( + name=fn, + description=resp_dict[fn].get("description", "Factor description not provided"), + formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), + variables=resp_dict[fn].get("variables", "Variables not provided"), + ) + + return FeatureExperiment(sub_tasks=tasks, hypothesis=hypothesis) elif hypothesis.component == "Model": - pass + model_task_output_format = T(".prompts:output_format.model").r() + + system_prompt = T(".prompts:hypothesis2task.system").r( + targets="Models", + scenario=scenario, + task_output_format=model_task_output_format, + ) + user_prompt = T(".prompts:hypothesis2task.user").r( + targets="Models", + target_hypothesis=str(hypothesis), + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + tasks = [] + mt = ModelTask( + name=resp_dict.get("model_name", "Model name not provided"), + description=resp_dict.get("description", "Model description not provided"), + architecture=resp_dict.get("architecture", "Model architecture not provided"), + hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), + base_code="", + ) + + return ModelExperiment(sub_tasks=tasks, hypothesis=hypothesis) elif hypothesis.component == "Ensemble": pass elif hypothesis.component == "Workflow": diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 0fbdc7292..520c31af1 
100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -1,3 +1,52 @@ +hypothesis_gen: + system: |- + The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. + The {{targets}} are used in the following scenario: + {{scenario}} + The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you. Your task is to check whether a similar hypothesis has already been generated. + If one exists and you agree with it, feel free to use it. If you disagree, please generate an improved version. + {% if hypothesis_specification %} + To assist you in formulating new hypotheses, the user has provided some additional information: {{hypothesis_specification}}. + **Important:** If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. + {% endif %} + Please generate the output using the following format and specifications: + {{ hypothesis_output_format }} + + user: |- + {% if hypothesis_and_feedback|length == 0 %}It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet. + {% else %}It is not the first round, the user has made several hypothesis on this scenario and did several evaluation on them. + The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree): + {{ hypothesis_and_feedback }} + {% endif %} + {% if RAG %} + To assist you in generating new {{targets}}, we have provided the following information: {{RAG}}. + **Note:** The provided RAG is for reference only. + You must carefully assess whether the RAG aligns with the {{targets}}. + If it does not, it should not be used. Exercise caution and make your own judgment. + {% endif %} + Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. + +hypothesis2task: + system: |- + The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. + The {{targets}} are used in certain scenario, the scenario is as follows: + {{ scenario }} + The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{targets}} for. + 2. The hypothesis generated in the previous steps and their corresponding feedbacks. + 3. Former proposed {{targets}} on similar hypothesis. + 4. Some additional information to help you generate new {{targets}}. + Please generate the output following the format below: + {{ task_output_format }} + + user: |- + The user has made several hypothesis on this scenario and did several evaluation on them. + The target hypothesis you are targeting to generate {{targets}} for is as follows: + {{ target_hypothesis }} + The former hypothesis and the corresponding feedbacks are as follows: + {{ hypothesis_and_feedback }} + Please generate the new {{targets}} based on the information above. + hypothesis_and_feedback: |- {% for hypothesis, experiment, feedback in trace.hist[-10:] %} Hypothesis {{ loop.index }}: {{ hypothesis }} @@ -24,22 +73,27 @@ output_format: "concise_knowledge": "One line summary. 
Transferable knowledge based on theoretical principles. Use conditional grammar. eg. "If...., ..; When..., .; and etc" Make sure that you state things clearly without ambiguity. Eg. avoid saying "previous hypothesis", because one wouldn't know what that is." } data_loader: |- - + According to the hypothesis, please help user design one data loader task. + The output should follow JSON format. The schema is as follows: + { + "description": "description of the overall data loader for the data science workflow", + # Don't add ellipsis (...) or any filler text that might cause JSON parsing errors here! + } feature: |- According to the hypothesis, please help user design one or more feature engineering tasks. The output should follow JSON format. The schema is as follows: { - "factor or group name 1": { - "description": "description of factor or group name 1", - "formulation": "latex formulation of factor or group name 1", + "feature name 1": { + "description": "description of feature name 1", + "formulation": "latex formulation of feature or group name 1", "variables": { "variable or function name 1": "description of variable or function 1", "variable or function name 2": "description of variable or function 2" } }, - "factor or group name 2": { - "description": "description of factor or group name 2", - "formulation": "latex formulation of factor or group name 2", + "feature name 2": { + "description": "description of feature name 2", + "formulation": "latex formulation of feature or group name 2", "variables": { "variable or function name 1": "description of variable or function 1", "variable or function name 2": "description of variable or function 2" @@ -49,7 +103,6 @@ output_format: } model: |- According to the hypothesis, please help user design one model task. - We only build one model from four main model types: ["XGBoost", "RandomForest", "LightGBM", "NN"]. The output should follow JSON format. The schema is as follows: { "model_name": "model_name", @@ -60,8 +113,6 @@ output_format: "hyperparameter_name_2": "value of hyperparameter 2", "hyperparameter_name_3": "value of hyperparameter 3" }, - "model_type": "Please select only **one** model type from the following four options: XGBoost, RandomForest, LightGBM, or NN. The selected model must be unique and used as the **primary model**. You may choose an auxiliary model for support or optimization on specific tasks if necessary, but the primary model must come from the provided options." - } Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. ensemble: |- diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index ec5072efb..c73ae099f 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -175,16 +175,6 @@ model_experiment_output_format: |- } Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. -data_loader_experiment_output_format: |- - According to the hypothesis, please help user design one data loader task. - The output should follow JSON format. The schema is as follows: - { - "data loader name": { - "description": "description of the overall data loader for the data science workflow", - } - # Don't add ellipsis (...) or any filler text that might cause JSON parsing errors here! - } - kg_feedback_generation_user: |- We are in a process of finding and validating hypotheses to build a powerful model. Each round aims to confirm or reject hypotheses based on results. 
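Note on the proposal flow introduced in this patch: `DSExpGen.gen` now performs hypothesis generation inline as a render-prompt, chat-completion, parse-JSON round trip. The sketch below restates that round trip as a standalone helper for reference only; `gen_ds_hypothesis` is an illustrative name that does not appear in the patch, and it assumes the code sits where the relative `.prompts` template path resolves (alongside the proposal `prompts.yaml`) and that `build_messages_and_create_chat_completion` returns a JSON string when called with `json_mode=True`, as the surrounding hunks rely on.

```python
import json

from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.data_science.proposal.exp_gen import DSHypothesis
from rdagent.utils.agent.tpl import T


def gen_ds_hypothesis(scenario: str, hypothesis_and_feedback: str) -> DSHypothesis:
    """Illustrative helper: render the hypothesis_gen prompts, query the LLM, parse the JSON reply."""
    system_prompt = T(".prompts:hypothesis_gen.system").r(
        targets="data science project",
        scenario=scenario,
        hypothesis_output_format=T(".prompts:output_format.hypothesis").r(),
        hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=""),
    )
    user_prompt = T(".prompts:hypothesis_gen.user").r(
        targets="data science project",
        hypothesis_and_feedback=hypothesis_and_feedback,
    )
    resp_dict = json.loads(
        APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
    )
    # Map the JSON fields onto DSHypothesis exactly as DSExpGen.gen does above.
    return DSHypothesis(
        hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"),
        reason=resp_dict.get("reason", "Reason not provided"),
        concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"),
        concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"),
        concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"),
        concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"),
        component=resp_dict.get("component", "Component not provided"),
    )
```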
From 3a746fa4e5c4e8228983ecbb74002335b037f5c4 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 13 Dec 2024 00:08:47 +0000 Subject: [PATCH 045/304] hypothesis2experiment base --- .../data_science/proposal/exp_gen.py | 66 +++++++++++++++++-- .../data_science/proposal/prompts.yaml | 11 +++- 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 5f56ed681..e293aae11 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -91,11 +91,28 @@ def is_complete(): # 2. gen experiment if hypothesis.component == "DataLoadSpec": - pass + data_loader_task_output_format = T(".prompts:output_format.data_loader").r() + system_prompt = T(".prompts:hypothesis2task.system").r( + targets="Data loader and specification generation", + scenario=scenario, + task_output_format=data_loader_task_output_format, + ) + usre_prompt = T(".prompts:hypothesis2task.user").r( + targets="Data loader and specification generation", + target_hypothesis=str(hypothesis), + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + dt = DataLoaderTask( + name="Data loader and specification generation", + description=resp_dict.get("description", "Data loader and specification generation description not provided"), + ) + + return DataLoaderExperiment(sub_tasks=[dt], hypothesis=hypothesis) elif hypothesis.component == "FeatureEng": # TODO: RAG feature_task_output_format = T(".prompts:output_format.feature").r() - system_prompt = T(".prompts:hypothesis2task.system").r( targets="Feature Engineering", scenario=scenario, @@ -133,7 +150,6 @@ def is_complete(): ) resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) - tasks = [] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), @@ -142,11 +158,49 @@ def is_complete(): base_code="", ) - return ModelExperiment(sub_tasks=tasks, hypothesis=hypothesis) + return ModelExperiment(sub_tasks=[mt], hypothesis=hypothesis) elif hypothesis.component == "Ensemble": - pass + ensemble_task_output_format = T(".prompts:output_format.ensemble").r() + + system_prompt = T(".prompts:hypothesis2task.system").r( + targets="Ensemble", + scenario=scenario, + task_output_format=ensemble_task_output_format, + ) + user_prompt = T(".prompts:hypothesis2task.user").r( + targets="Ensemble", + target_hypothesis=str(hypothesis), + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + et = EnsembleTask( + name="Ensemble", + description=resp_dict.get("description", "Ensemble description not provided"), + ) + + return EnsembleExperiment(sub_tasks=[et], hypothesis=hypothesis) elif hypothesis.component == "Workflow": - pass + workflow_task_output_format = T(".prompts:output_format.workflow").r() + + system_prompt = T(".prompts:hypothesis2task.system").r( + targets="Workflow", + scenario=scenario, + task_output_format=workflow_task_output_format, + ) + user_prompt = T(".prompts:hypothesis2task.user").r( + targets="Workflow", + 
target_hypothesis=str(hypothesis), + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + wt = WorkflowTask( + name="Workflow", + description=resp_dict.get("description", "Workflow description not provided"), + ) + + return WorkflowExperiment(sub_tasks=[wt], hypothesis=hypothesis) else: for o in ORDER: if o in successful_components: diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 520c31af1..e8dc4d7d8 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -116,5 +116,14 @@ output_format: } Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. ensemble: |- - + According to the hypothesis, please help user design one ensemble task. + The output should follow JSON format. The schema is as follows: + { + "description": "A detailed description of the ensemble", + } workflow: |- + According to the hypothesis, please help user design one workflow task. + The output should follow JSON format. The schema is as follows: + { + "description": "A detailed description of the workflow", + } From 68e4c1f66cb6bdb296191da9e640431c8c7aaa67 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 13 Dec 2024 00:20:15 +0000 Subject: [PATCH 046/304] only hypothesis gen and task gen --- .../data_science/proposal/exp_gen.py | 20 +++++++++---------- .../data_science/proposal/prompts.yaml | 13 +++++++++++- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index e293aae11..a58494212 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -92,12 +92,12 @@ def is_complete(): # 2. 
gen experiment if hypothesis.component == "DataLoadSpec": data_loader_task_output_format = T(".prompts:output_format.data_loader").r() - system_prompt = T(".prompts:hypothesis2task.system").r( + system_prompt = T(".prompts:task_gen.system").r( targets="Data loader and specification generation", scenario=scenario, task_output_format=data_loader_task_output_format, ) - usre_prompt = T(".prompts:hypothesis2task.user").r( + usre_prompt = T(".prompts:task_gen.user").r( targets="Data loader and specification generation", target_hypothesis=str(hypothesis), hypothesis_and_feedback=hypothesis_and_feedback, @@ -113,12 +113,12 @@ def is_complete(): elif hypothesis.component == "FeatureEng": # TODO: RAG feature_task_output_format = T(".prompts:output_format.feature").r() - system_prompt = T(".prompts:hypothesis2task.system").r( + system_prompt = T(".prompts:task_gen.system").r( targets="Feature Engineering", scenario=scenario, task_output_format=feature_task_output_format, ) - user_prompt = T(".prompts:hypothesis2task.user").r( + user_prompt = T(".prompts:task_gen.user").r( targets="Feature Engineering", target_hypothesis=str(hypothesis), hypothesis_and_feedback=hypothesis_and_feedback, @@ -138,12 +138,12 @@ def is_complete(): elif hypothesis.component == "Model": model_task_output_format = T(".prompts:output_format.model").r() - system_prompt = T(".prompts:hypothesis2task.system").r( + system_prompt = T(".prompts:task_gen.system").r( targets="Models", scenario=scenario, task_output_format=model_task_output_format, ) - user_prompt = T(".prompts:hypothesis2task.user").r( + user_prompt = T(".prompts:task_gen.user").r( targets="Models", target_hypothesis=str(hypothesis), hypothesis_and_feedback=hypothesis_and_feedback, @@ -162,12 +162,12 @@ def is_complete(): elif hypothesis.component == "Ensemble": ensemble_task_output_format = T(".prompts:output_format.ensemble").r() - system_prompt = T(".prompts:hypothesis2task.system").r( + system_prompt = T(".prompts:task_gen.system").r( targets="Ensemble", scenario=scenario, task_output_format=ensemble_task_output_format, ) - user_prompt = T(".prompts:hypothesis2task.user").r( + user_prompt = T(".prompts:task_gen.user").r( targets="Ensemble", target_hypothesis=str(hypothesis), hypothesis_and_feedback=hypothesis_and_feedback, @@ -183,12 +183,12 @@ def is_complete(): elif hypothesis.component == "Workflow": workflow_task_output_format = T(".prompts:output_format.workflow").r() - system_prompt = T(".prompts:hypothesis2task.system").r( + system_prompt = T(".prompts:task_gen.system").r( targets="Workflow", scenario=scenario, task_output_format=workflow_task_output_format, ) - user_prompt = T(".prompts:hypothesis2task.user").r( + user_prompt = T(".prompts:task_gen.user").r( targets="Workflow", target_hypothesis=str(hypothesis), hypothesis_and_feedback=hypothesis_and_feedback, diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index e8dc4d7d8..4f62cbfe9 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -26,26 +26,37 @@ hypothesis_gen: {% endif %} Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. 
-hypothesis2task: +task_gen: system: |- + {% if hypothesis is not None %} The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. + {% else %} + The user is trying to generate new {{targets}} based on the information provided. + {% endif %} The {{targets}} are used in certain scenario, the scenario is as follows: {{ scenario }} + + {% if hypothesis is not None %} The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: 1. The target hypothesis you are targeting to generate {{targets}} for. 2. The hypothesis generated in the previous steps and their corresponding feedbacks. 3. Former proposed {{targets}} on similar hypothesis. 4. Some additional information to help you generate new {{targets}}. + {% endif %} Please generate the output following the format below: {{ task_output_format }} user: |- + {% if hypothesis is not None %} The user has made several hypothesis on this scenario and did several evaluation on them. The target hypothesis you are targeting to generate {{targets}} for is as follows: {{ target_hypothesis }} The former hypothesis and the corresponding feedbacks are as follows: {{ hypothesis_and_feedback }} Please generate the new {{targets}} based on the information above. + {% else %} + Please generate the new {{targets}} task. + {% endif %} hypothesis_and_feedback: |- {% for hypothesis, experiment, feedback in trace.hist[-10:] %} From a5704085fef0ea277c0b455c33eab9a8d8ec7b8f Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 13 Dec 2024 00:45:19 +0000 Subject: [PATCH 047/304] proposal related --- .../data_science/proposal/exp_gen.py | 40 ++++++++++++++----- .../data_science/proposal/prompts.yaml | 2 +- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index a58494212..320361172 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -95,11 +95,12 @@ def is_complete(): system_prompt = T(".prompts:task_gen.system").r( targets="Data loader and specification generation", scenario=scenario, + hypothesis=hypothesis, task_output_format=data_loader_task_output_format, ) - usre_prompt = T(".prompts:task_gen.user").r( + user_prompt = T(".prompts:task_gen.user").r( targets="Data loader and specification generation", - target_hypothesis=str(hypothesis), + hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, ) @@ -116,11 +117,12 @@ def is_complete(): system_prompt = T(".prompts:task_gen.system").r( targets="Feature Engineering", scenario=scenario, + hypothesis=hypothesis, task_output_format=feature_task_output_format, ) user_prompt = T(".prompts:task_gen.user").r( targets="Feature Engineering", - target_hypothesis=str(hypothesis), + hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, ) @@ -141,11 +143,12 @@ def is_complete(): system_prompt = T(".prompts:task_gen.system").r( targets="Models", scenario=scenario, + hypothesis=hypothesis, task_output_format=model_task_output_format, ) user_prompt = T(".prompts:task_gen.user").r( targets="Models", - target_hypothesis=str(hypothesis), + hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, ) @@ -165,11 +168,12 @@ def is_complete(): system_prompt = T(".prompts:task_gen.system").r( targets="Ensemble", scenario=scenario, + hypothesis=hypothesis, task_output_format=ensemble_task_output_format, ) 
user_prompt = T(".prompts:task_gen.user").r( targets="Ensemble", - target_hypothesis=str(hypothesis), + hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, ) @@ -186,11 +190,12 @@ def is_complete(): system_prompt = T(".prompts:task_gen.system").r( targets="Workflow", scenario=scenario, + hypothesis=hypothesis, task_output_format=workflow_task_output_format, ) user_prompt = T(".prompts:task_gen.user").r( targets="Workflow", - target_hypothesis=str(hypothesis), + hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, ) @@ -207,12 +212,25 @@ def is_complete(): # we already have the component, then skip continue elif o == "DataLoadSpec": - dlt = DataLoaderTask(name="DataLoaderTask", description="") - exp = DataLoaderExperiment( - sub_tasks=[dlt], + data_loader_task_output_format = T(".prompts:output_format.data_loader").r() + system_prompt = T(".prompts:task_gen.system").r( + targets="Data loader and specification generation", + scenario=scenario, + hypothesis=None, + task_output_format=data_loader_task_output_format, ) - self.complete_component.add(o) - return exp + user_prompt = T(".prompts:task_gen.user").r( + targets="Data loader and specification generation", + hypothesis=None, + ) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + dt = DataLoaderTask( + name="Data loader and specification generation", + description=resp_dict.get("description", "Data loader and specification generation description not provided"), + ) + + return DataLoaderExperiment(sub_tasks=[dt], hypothesis=hypothesis) elif o == "FeatureEng": ft = FeatureTask(name="FeatureTask", description="") exp = FeatureExperiment( diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 4f62cbfe9..034e17a09 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -50,7 +50,7 @@ task_gen: {% if hypothesis is not None %} The user has made several hypothesis on this scenario and did several evaluation on them. The target hypothesis you are targeting to generate {{targets}} for is as follows: - {{ target_hypothesis }} + {{ hypothesis }} The former hypothesis and the corresponding feedbacks are as follows: {{ hypothesis_and_feedback }} Please generate the new {{targets}} based on the information above. 
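Note on the `hypothesis=None` handling added in this patch: the same `task_gen` templates now serve both the first round (no hypothesis yet) and later rounds, with the Jinja conditionals dropping the hypothesis-specific sections when nothing is passed. A minimal sketch of the first-round bootstrap case for the data-loader component follows; `bootstrap_data_loader_task` is an illustrative name, not part of the patch, and the same assumptions apply as before about the relative `.prompts` template path and `json_mode=True` returning a JSON string.

```python
import json

from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.oai.llm_utils import APIBackend
from rdagent.utils.agent.tpl import T


def bootstrap_data_loader_task(scenario: str) -> DataLoaderTask:
    """Illustrative helper: first-round task generation, before any hypothesis exists."""
    system_prompt = T(".prompts:task_gen.system").r(
        targets="Data loader and specification generation",
        scenario=scenario,
        hypothesis=None,  # the template omits hypothesis-specific guidance in this case
        task_output_format=T(".prompts:output_format.data_loader").r(),
    )
    user_prompt = T(".prompts:task_gen.user").r(
        targets="Data loader and specification generation",
        hypothesis=None,
    )
    resp_dict = json.loads(
        APIBackend().build_messages_and_create_chat_completion(
            user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
        )
    )
    return DataLoaderTask(
        name="Data loader and specification generation",
        description=resp_dict.get("description", "Data loader description not provided"),
    )
```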
From 3324601d7813d7fec4c06fd1e8d09b38cbd3f99d Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 13 Dec 2024 02:37:42 +0000 Subject: [PATCH 048/304] exp_gen base code --- .../data_science/proposal/exp_gen.py | 96 +++++++++++++++---- 1 file changed, 79 insertions(+), 17 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 320361172..ebb1cc25b 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -230,33 +230,95 @@ def is_complete(): description=resp_dict.get("description", "Data loader and specification generation description not provided"), ) - return DataLoaderExperiment(sub_tasks=[dt], hypothesis=hypothesis) + exp = DataLoaderExperiment(sub_tasks=[dt]) + return exp elif o == "FeatureEng": - ft = FeatureTask(name="FeatureTask", description="") - exp = FeatureExperiment( - sub_tasks=[ft], + feature_task_output_format = T(".prompts:output_format.feature").r() + system_prompt = T(".prompts:task_gen.system").r( + targets="Feature Engineering", + scenario=scenario, + hypothesis=None, + task_output_format=feature_task_output_format, + ) + user_prompt = T(".prompts:task_gen.user").r( + targets="Feature Engineering", + hypothesis=None, ) - self.complete_component.add(o) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + tasks = [] + for fn in resp_dict: + ft = FeatureTask( + name=fn, + description=resp_dict[fn].get("description", "Factor description not provided"), + formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), + variables=resp_dict[fn].get("variables", "Variables not provided"), + ) + tasks.append(ft) + exp = FeatureExperiment(sub_tasks=tasks) return exp elif o == "Model": - mt = ModelTask(name="ModelTask", description="") - exp = ModelExperiment( - sub_tasks=[mt], + model_task_output_format = T(".prompts:output_format.model").r() + system_prompt = T(".prompts:task_gen.system").r( + targets="Models", + scenario=scenario, + hypothesis=None, + task_output_format=model_task_output_format, + ) + user_prompt = T(".prompts:task_gen.user").r( + targets="Models", + hypothesis=None, ) - self.complete_component.add(o) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + mt = ModelTask( + name=resp_dict.get("model_name", "Model name not provided"), + description=resp_dict.get("description", "Model description not provided"), + architecture=resp_dict.get("architecture", "Model architecture not provided"), + hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), + base_code="", + ) + exp = ModelExperiment(sub_tasks=[mt]) return exp elif o == "Ensemble": - et = EnsembleTask(name="EnsembleTask", description="") - exp = EnsembleExperiment( - sub_tasks=[et], + ensemble_task_output_format = T(".prompts:output_format.ensemble").r() + system_prompt = T(".prompts:task_gen.system").r( + targets="Ensemble", + scenario=scenario, + hypothesis=None, + task_output_format=ensemble_task_output_format, + ) + user_prompt = T(".prompts:task_gen.user").r( + targets="Ensemble", + hypothesis=None, ) - self.complete_component.add(o) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + et = EnsembleTask( + 
name="Ensemble", + description=resp_dict.get("description", "Ensemble description not provided"), + ) + exp = EnsembleExperiment(sub_tasks=[et]) return exp elif o == "Workflow": - wt = WorkflowTask(name="WorkflowTask", description="") - exp = WorkflowExperiment( - sub_tasks=[wt], + workflow_task_output_format = T(".prompts:output_format.workflow").r() + system_prompt = T(".prompts:task_gen.system").r( + targets="Workflow", + scenario=scenario, + hypothesis=None, + task_output_format=workflow_task_output_format, + ) + user_prompt = T(".prompts:task_gen.user").r( + targets="Workflow", + hypothesis=None, ) - self.complete_component.add(o) + + resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + wt = WorkflowTask( + name="Workflow", + description=resp_dict.get("description", "Workflow description not provided"), + ) + exp = WorkflowExperiment(sub_tasks=[wt]) return exp + return super().gen(trace) From 8d73ea3dd31283b69a0bf90cf9c7fbe76dc4e46e Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 13 Dec 2024 04:17:13 +0000 Subject: [PATCH 049/304] dependency_codes inject --- .../data_science/proposal/exp_gen.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index ebb1cc25b..a3f9c82f3 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -59,6 +59,12 @@ def is_complete(): """is all components complete""" return set(ORDER) == successful_components + def last_successful_component(com: COMPONENT) -> Experiment: + for h, exp, hf in reversed(trace.hist): + if hf.decision and h.component == com: + return exp + raise RuntimeError(f"No successful {com} component generated yet.") + if is_complete(): # base info scenario = trace.scen.get_scenario_all_desc() @@ -136,7 +142,10 @@ def is_complete(): variables=resp_dict[fn].get("variables", "Variables not provided"), ) - return FeatureExperiment(sub_tasks=tasks, hypothesis=hypothesis) + exp = FeatureExperiment(sub_tasks=tasks, hypothesis=hypothesis) + dependency_exp = last_successful_component("DataLoadSpec") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp elif hypothesis.component == "Model": model_task_output_format = T(".prompts:output_format.model").r() @@ -161,7 +170,10 @@ def is_complete(): base_code="", ) - return ModelExperiment(sub_tasks=[mt], hypothesis=hypothesis) + exp = ModelExperiment(sub_tasks=[mt], hypothesis=hypothesis) + dependency_exp = last_successful_component("FeatureEng") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp elif hypothesis.component == "Ensemble": ensemble_task_output_format = T(".prompts:output_format.ensemble").r() @@ -183,7 +195,10 @@ def is_complete(): description=resp_dict.get("description", "Ensemble description not provided"), ) - return EnsembleExperiment(sub_tasks=[et], hypothesis=hypothesis) + exp = EnsembleExperiment(sub_tasks=[et], hypothesis=hypothesis) + dependency_exp = last_successful_component("Model") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp elif hypothesis.component == "Workflow": workflow_task_output_format = T(".prompts:output_format.workflow").r() @@ -205,7 +220,10 @@ def is_complete(): 
description=resp_dict.get("description", "Workflow description not provided"), ) - return WorkflowExperiment(sub_tasks=[wt], hypothesis=hypothesis) + exp = WorkflowExperiment(sub_tasks=[wt], hypothesis=hypothesis) + dependency_exp = last_successful_component("Ensemble") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp else: for o in ORDER: if o in successful_components: @@ -256,6 +274,8 @@ def is_complete(): ) tasks.append(ft) exp = FeatureExperiment(sub_tasks=tasks) + dependency_exp = last_successful_component("DataLoadSpec") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Model": model_task_output_format = T(".prompts:output_format.model").r() @@ -279,6 +299,8 @@ def is_complete(): base_code="", ) exp = ModelExperiment(sub_tasks=[mt]) + dependency_exp = last_successful_component("FeatureEng") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Ensemble": ensemble_task_output_format = T(".prompts:output_format.ensemble").r() @@ -299,6 +321,8 @@ def is_complete(): description=resp_dict.get("description", "Ensemble description not provided"), ) exp = EnsembleExperiment(sub_tasks=[et]) + dependency_exp = last_successful_component("Model") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Workflow": workflow_task_output_format = T(".prompts:output_format.workflow").r() @@ -319,6 +343,8 @@ def is_complete(): description=resp_dict.get("description", "Workflow description not provided"), ) exp = WorkflowExperiment(sub_tasks=[wt]) + dependency_exp = last_successful_component("Ensemble") + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp return super().gen(trace) From 00526fff4cebc6d07800f4ae682b9a566f4ed29c Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 13 Dec 2024 06:36:37 +0000 Subject: [PATCH 050/304] proposal completed(not test) --- .../components/coder/CoSTEER/evaluators.py | 8 +- .../coder/data_science/ensemble/exp.py | 2 +- .../coder/data_science/feature/__init__.py | 12 +- .../coder/data_science/feature/eval.py | 16 +- .../feature/eval_tests/feature_test.py | 13 +- .../coder/data_science/feature/exp.py | 4 +- .../coder/data_science/feature/test.py | 16 +- .../coder/data_science/model/__init__.py | 9 +- .../components/coder/data_science/model/es.py | 12 +- .../coder/data_science/model/eva_utils.py | 18 +- .../coder/data_science/model/eval.py | 10 +- .../coder/data_science/model/exp.py | 11 +- .../coder/data_science/model/test.py | 27 +-- .../data_science/raw_data_loader/__init__.py | 20 +- .../data_science/raw_data_loader/eval.py | 18 +- .../eval_tests/data_loader_test.py | 12 +- .../coder/data_science/workflow/exp.py | 2 +- rdagent/core/evaluation.py | 2 + .../data_science/experiment/experiment.py | 15 +- .../data_science/proposal/exp_gen.py | 185 ++++++++++++------ rdagent/scenarios/data_science/scen/scen.py | 2 +- .../aerial-cactus-identification/model01.py | 2 +- 22 files changed, 248 insertions(+), 168 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index ffb3d533d..bf6b82197 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -11,7 +11,6 @@ from rdagent.core.utils import multiprocessing_wrapper 
from rdagent.log import rdagent_logger as logger - # TODO: # 1. It seems logically sound, but we currently lack a scenario to apply it. # 2. If it proves to be useful, relocate it to a more general location. @@ -54,6 +53,7 @@ def __str__(self) -> str: This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}. """ + class CoSTEERSingleFeedbackDeprecated(CoSTEERSingleFeedback): """This class is a base class for all code generator feedback to single implementation""" @@ -79,19 +79,19 @@ def __init__( # TODO: # Not general enough. So we should not put them in the general costeer feedback # Instead, we should create subclass for it. - self.shape_feedback = shape_feedback # Not general enough. So + self.shape_feedback = shape_feedback # Not general enough. So # TODO: @property @property def execution(self): return self.execution_feedback - + @property def return_checking(self): if self.value_generated_flag: return f"value feedback: {self.value_feedback}\n\nshape feedback: {self.shape_feedback}" return None - + @property def code(self): return self.code_feedback diff --git a/rdagent/components/coder/data_science/ensemble/exp.py b/rdagent/components/coder/data_science/ensemble/exp.py index f232cb801..720e85aa3 100644 --- a/rdagent/components/coder/data_science/ensemble/exp.py +++ b/rdagent/components/coder/data_science/ensemble/exp.py @@ -23,4 +23,4 @@ def from_dict(dict): return EnsembleTask(**dict) def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" \ No newline at end of file + return f"<{self.__class__.__name__} {self.name}>" diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 5bfdf93f0..6ca4ca891 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -29,14 +29,15 @@ from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) +from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.raw_data_loader.eval import ( DataLoaderCoSTEEREvaluator, ) -from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T + class FeatureMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( self, @@ -49,12 +50,12 @@ def implement_one_task( # 2. 
code system_prompt = T(".prompts:feature.system").r() - user_prompt = T(".prompts:feature.user").r( - competition_info=competition_info, feature_spec=target_task.spec - ) + user_prompt = T(".prompts:feature.user").r(competition_info=competition_info, feature_spec=target_task.spec) feature_code = json.loads( - APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True) + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) )["code"] return { @@ -77,6 +78,7 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): evo.sub_workspace_list[index].inject_code(**code_list[index]) return evo + class FeatureCoSTEER(CoSTEER): def __init__( self, diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index f363e27f4..de8e55b30 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -1,25 +1,25 @@ -from rdagent.core.evolving_framework import QueriedKnowledge - import json from dataclasses import dataclass from os import system +from pathlib import Path + from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, CoSTEERSingleFeedback, CoSTEERSingleFeedbackDeprecated, ) - -from rdagent.utils.env import DSDockerConf, DockerEnv +from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import FBWorkspace, Task, Workspace -from pathlib import Path from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf DIRNAME = Path(__file__).absolute().resolve().parent FeatureEvalFeedback = CoSTEERSingleFeedback + class FeatureCoSTEEREvaluator(CoSTEEREvaluator): def evaluate( @@ -32,8 +32,10 @@ def evaluate( ) -> CoSTEERSingleFeedbackDeprecated: target_task_information = target_task.get_task_information() - if (queried_knowledge is not None and - target_task_information in queried_knowledge.success_task_to_knowledge_dict): + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: return CoSTEERSingleFeedbackDeprecated( diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index ef80fcdd7..2bedf04b3 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -6,18 +6,19 @@ Please make sure the stdout is rich enough to support informative feedback """ -import pickle -import logging -from feat01 import feature_eng +import logging +import pickle +from feat01 import feature_eng # Setup logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") # Load data from load_data import load_from_raw_data from sklearn.model_selection import train_test_split + X, y, X_test, test_ids = load_from_raw_data() X, y, X_param = feat_eng(X, y) @@ -34,7 +35,5 @@ logging.info("Data loader test 
passed successfully. Length of test images matches length of test IDs.") -with open('data.pkl', 'wb') as f: +with open("data.pkl", "wb") as f: pickle.dump((X, y, X_test, test_ids), f) - - diff --git a/rdagent/components/coder/data_science/feature/exp.py b/rdagent/components/coder/data_science/feature/exp.py index fec5ac681..b092b8585 100644 --- a/rdagent/components/coder/data_science/feature/exp.py +++ b/rdagent/components/coder/data_science/feature/exp.py @@ -22,7 +22,7 @@ def __init__( self.spec: str = spec self.implementation: bool = implementation super().__init__(name=name, description=description, **kwargs) - + def get_task_information(self): return f"""name: {self.name} description: {self.description} @@ -43,4 +43,4 @@ def from_dict(dict): return FeatureTask(**dict) def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" \ No newline at end of file + return f"<{self.__class__.__name__} {self.name}>" diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index e353f5801..1032d14a8 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -6,34 +6,36 @@ - it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) """ -from rdagent.components.coder.data_science.feature import FeatureCoSTEER +import pickle +from rdagent.components.coder.data_science.feature import FeatureCoSTEER from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.scenarios.data_science.experiment.experiment import FeatureExperiment from rdagent.scenarios.data_science.scen import DataScienceScen -import pickle # from rdagent.components.coder.data_science.feature.es import ModelMultiProcessEvolvingStrategy - def develop_one_competition(competition: str): # -> experiment scen = DataScienceScen(competition=competition) feature_coder = FeatureCoSTEER(scen) - with open('/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md', 'r') as file: + with open( + "/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r" + ) as file: feat_spec = file.read() - + # Create the experiment ft = FeatureTask(name="FeatureTask", description=scen.competition_descriptions, spec=feat_spec) exp = FeatureExperiment( sub_tasks=[ft], ) - with open('/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py', 'r') as file: + with open( + "/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r" + ) as file: load_data_code = file.read() exp.experiment_workspace.inject_code(**{"load_data.py": load_data_code}) - # Develop the experiment exp = feature_coder.develop(exp) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 89e8fdaed..973741d1d 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -1,14 +1,17 @@ from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator - -from rdagent.components.coder.data_science.model.eval import ModelGeneralCaseSpecEvaluator from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) +from 
rdagent.components.coder.data_science.model.es import ( + ModelMultiProcessEvolvingStrategy, +) +from rdagent.components.coder.data_science.model.eval import ( + ModelGeneralCaseSpecEvaluator, +) from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.core.scenario import Scenario -from rdagent.components.coder.data_science.model.es import ModelMultiProcessEvolvingStrategy # from rdagent.utils.agent.tpl import T # T(".prompts:model_generator.user").r() diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py index 9b27b801a..b233471a7 100644 --- a/rdagent/components/coder/data_science/model/es.py +++ b/rdagent/components/coder/data_science/model/es.py @@ -2,24 +2,22 @@ from pathlib import Path from jinja2 import Environment, StrictUndefined + from rdagent.components.coder.CoSTEER.evolving_strategy import ( MultiProcessEvolvingStrategy, ) - from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, CoSTEERQueriedKnowledgeV2, ) -from rdagent.components.coder.data_science.model.exp import ( - ModelTask, - ModelFBWorkspace, -) +from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend coder_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( self, @@ -101,7 +99,7 @@ def implement_one_task( def Model(): pass """ - + def assign_code_list_to_evo(self, code_list, evo): """ Assign the code list to the evolving item. @@ -116,4 +114,4 @@ def assign_code_list_to_evo(self, code_list, evo): evo.sub_workspace_list[index] = ModelFBWorkspace(target_task=evo.sub_tasks[index]) # TODO: avoid hardcode of file name evo.sub_workspace_list[index].inject_code(**{"model01.py": code_list[index]}) - return evo \ No newline at end of file + return evo diff --git a/rdagent/components/coder/data_science/model/eva_utils.py b/rdagent/components/coder/data_science/model/eva_utils.py index 1224e9a75..1d4dac331 100644 --- a/rdagent/components/coder/data_science/model/eva_utils.py +++ b/rdagent/components/coder/data_science/model/eva_utils.py @@ -14,30 +14,23 @@ evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + def expected_shape_evaluate( - prediction: np.ndarray, + prediction: np.ndarray, spec_message: str, model_execution_feedback: str, ) -> str: if prediction is None: return "No output generated from the model. Skip value evaluation" elif spec_message is None: - return ( - "No spec provided. Shape evaluation not impractical", - ) + return ("No spec provided. Shape evaluation not impractical",) else: pre_shape = prediction.shape system_prompt = ( Environment(undefined=StrictUndefined) .from_string(evaluate_prompts["evaluator_shape_feedback"]["system"]) - .render( - spec=( - spec_message - if spec_message is not None - else "No spec description provided." 
- ) - ) + .render(spec=(spec_message if spec_message is not None else "No spec description provided.")) ) execution_feedback_to_render = model_execution_feedback @@ -63,7 +56,7 @@ def expected_shape_evaluate( execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] else: break - + critic_response = APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, system_prompt=system_prompt, @@ -146,7 +139,6 @@ def evaluate( assert isinstance(target_task, ModelTask) assert isinstance(implementation, ModelFBWorkspace) - system_prompt = ( Environment(undefined=StrictUndefined) .from_string(evaluate_prompts["evaluator_final_feedback"]["system"]) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 05b511bb6..f97cfe391 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -56,13 +56,13 @@ def evaluate( final_decision=False, ) # assert isinstance(target_task, ModelTask) - + batch_size = 8 assert isinstance(implementation, ModelFBWorkspace) - model_execution_feedback, pred_list= implementation.execute( + model_execution_feedback, pred_list = implementation.execute( batch_size=batch_size, ) - shape_feedback = "" + shape_feedback = "" if pred_list is None: shape_feedback += "No output generated from the model. No shape evaluation conducted." else: @@ -80,9 +80,9 @@ def evaluate( model_execution_feedback=model_execution_feedback, ) - shape_feedback += f"Validation Output: {val_shape_feedback}\n" + shape_feedback += f"Validation Output: {val_shape_feedback}\n" shape_feedback += f"Test Output: {test_shape_feedback}\n" - value_feedback = "The value feedback is ignored, and the value decision is automatically set as true." + value_feedback = "The value feedback is ignored, and the value decision is automatically set as true." code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( target_task=target_task, implementation=implementation, diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index 1e7820b45..317e3d519 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -21,7 +21,7 @@ def __init__( hyperparameters: Dict[str, str], formulation: str = None, variables: Dict[str, str] = None, - model_type: Optional[str] = None, + model_type: Optional[str] = None, spec: str, **kwargs, ) -> None: @@ -58,7 +58,7 @@ def __repr__(self) -> str: class ModelFBWorkspace(FBWorkspace): def execute( self, - batch_size: int = 8, + batch_size: int = 8, ): super().execute() try: @@ -72,7 +72,7 @@ def execute( log, results = de.dump_python_code_run_and_get_results( code=dump_code, - dump_file_names=["execution_feedback_str.pkl", "pred_list.pkl"], + dump_file_names=["execution_feedback_str.pkl", "pred_list.pkl"], local_path=str(self.workspace_path), env={}, code_dump_file_py_name="model_test", @@ -83,11 +83,10 @@ def execute( except Exception as e: execution_feedback_str = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" - pred_list = None + pred_list = None if len(execution_feedback_str) > 2000: execution_feedback_str = ( execution_feedback_str[:1000] + "....hidden long error message...." 
+ execution_feedback_str[-1000:] ) - return execution_feedback_str, pred_list - + return execution_feedback_str, pred_list diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 04a4e025f..4555ff08c 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -4,14 +4,17 @@ from pathlib import Path +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.data_science.model import ModelCoSTEER -from rdagent.components.coder.data_science.model.eval import ModelGeneralCaseSpecEvaluator - -from rdagent.components.coder.data_science.model.exp import ModelTask, ModelFBWorkspace +from rdagent.components.coder.data_science.model.es import ( + ModelMultiProcessEvolvingStrategy, +) +from rdagent.components.coder.data_science.model.eval import ( + ModelGeneralCaseSpecEvaluator, +) +from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment from rdagent.scenarios.data_science.scen import DataScienceScen -from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -from rdagent.components.coder.data_science.model.es import ModelMultiProcessEvolvingStrategy # Take tasks, spec.md and feat as input, generate a feedback as output @@ -21,10 +24,10 @@ def develop_one_competition(competition: str): # Create the task mt = ModelTask( - name="ModelTask", - description="A CNN Model", - architecture="\hat{y}_u = CNN(X_u)", - variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", + name="ModelTask", + description="A CNN Model", + architecture="\hat{y}_u = CNN(X_u)", + variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", hyperparameters="...", base_code="", spec="", @@ -37,13 +40,13 @@ def develop_one_competition(competition: str): for file_name in injected_file_names: file_path = tpl_ex_path / file_name modelexp.inject_code(**{file_name: file_path.read_text()}) - + mt.spec += modelexp.code_dict["spec/model.md"] mt.base_code += modelexp.code_dict["model01.py"] exp = ModelExperiment( sub_tasks=[mt], ) - + # Test the evaluator: """eva = ModelGeneralCaseSpecEvaluator(scen=scen) exp.feedback = eva.evaluate(target_task=mt, queried_knowledge=None, implementation=modelexp, gt_implementation=None) @@ -54,7 +57,7 @@ def develop_one_competition(competition: str): new_code = es.implement_one_task(target_task=mt, queried_knowledge=None) print(new_code)""" - # Run the experiment + # Run the experiment for file_name in injected_file_names: file_path = tpl_ex_path / file_name exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 6147cb79a..ecb7bc53c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -62,11 +62,19 @@ def implement_one_task( spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) - data_loader_spec = json.loads(spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True))["spec"] - feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, 
json_mode=True))["spec"] + data_loader_spec = json.loads( + spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True) + )["spec"] + feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, json_mode=True))[ + "spec" + ] model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt, json_mode=True))["spec"] - ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))["spec"] - workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))["spec"] + ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))[ + "spec" + ] + workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))[ + "spec" + ] # 2. code system_prompt = T(".prompts:data_loader_coder.system").r() @@ -75,7 +83,9 @@ def implement_one_task( ) data_loader_code = json.loads( - APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True) + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) )["code"] return { diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 161669fcc..39c59f893 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -3,6 +3,9 @@ import json from dataclasses import dataclass from os import system +from pathlib import Path + +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, @@ -14,9 +17,8 @@ from rdagent.core.experiment import FBWorkspace, Task, Workspace from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -from rdagent.utils.env import DSDockerConf, DockerEnv -from pathlib import Path -from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.utils.env import DockerEnv, DSDockerConf + DIRNAME = Path(__file__).absolute().resolve().parent DataLoaderEvalFeedback = CoSTEERSingleFeedback @@ -34,8 +36,10 @@ def evaluate( ) -> CoSTEERSingleFeedbackDeprecated: target_task_information = target_task.get_task_information() - if (queried_knowledge is not None and - target_task_information in queried_knowledge.success_task_to_knowledge_dict): + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: return CoSTEERSingleFeedbackDeprecated( @@ -48,9 +52,7 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = { - f"{DS_RD_SETTING.local_data_path}/{DS_RD_SETTING.competition}": "/kaggle/input" - } + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{DS_RD_SETTING.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated tempory content? 
diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py
index aec0faa10..d20f5c375 100644
--- a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py
+++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py
@@ -6,14 +6,14 @@
 Please make sure the stdout is rich enough to support informative feedback
 """
 
-import pickle
-import logging
-from load_data import load_data
 
+import logging
+import pickle
+
+from load_data import load_data
 
 # Setup logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 X, y, X_test, test_ids = load_data()
 
@@ -23,7 +23,5 @@
 logging.info("Data loader test passed successfully. Length of test images matches length of test IDs.")
 
-with open('data.pkl', 'wb') as f:
+with open("data.pkl", "wb") as f:
     pickle.dump((X, y, X_test, test_ids), f)
-
-
diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py
index 5ae63f752..23db871b2 100644
--- a/rdagent/components/coder/data_science/workflow/exp.py
+++ b/rdagent/components/coder/data_science/workflow/exp.py
@@ -23,4 +23,4 @@ def from_dict(dict):
         return WorkflowTask(**dict)
 
     def __repr__(self) -> str:
-        return f"<{self.__class__.__name__} {self.name}>"
\ No newline at end of file
+        return f"<{self.__class__.__name__} {self.name}>"
diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py
index 71d32d920..65d2c14d1 100644
--- a/rdagent/core/evaluation.py
+++ b/rdagent/core/evaluation.py
@@ -13,6 +13,7 @@ class Feedback:
     It will be more like a **dataclass**.
     The building process of feedback should be in the evaluator
     """
+
     pass
 
 
@@ -25,6 +26,7 @@ class Evaluator(ABC):
       1. raw information including stdout & workspace (feedback itself will handle this)
       2. advanced/summarized feedback information.
(evaluate will handle this) """ + def __init__( self, scen: Scenario, diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index ab75e81c1..95bbd038c 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,11 +1,9 @@ -from rdagent.core.experiment import Experiment, FBWorkspace - -from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask -from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask - +from rdagent.core.experiment import Experiment, FBWorkspace # KG_MODEL_TYPE_XGBOOST = "XGBoost" # KG_MODEL_TYPE_RANDOMFOREST = "RandomForest" @@ -32,23 +30,26 @@ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() + class ModelExperiment(Experiment[ModelTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, **kwargs) -> None: # TODO: use previeous step workspace + def __init__(self, *args, **kwargs) -> None: # TODO: use previeous step workspace super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() + class FeatureExperiment(Experiment[FeatureTask, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() + class EnsembleExperiment(Experiment[EnsembleTask, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() + class WorkflowExperiment(Experiment[WorkflowTask, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() - diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index a3f9c82f3..05ba903f2 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -1,19 +1,23 @@ +import json from argparse import ONE_OR_MORE from typing import Literal -import json -from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask -from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask - -from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment, FeatureExperiment, ModelExperiment, EnsembleExperiment, WorkflowExperiment - from rdagent.core.experiment import Experiment -from rdagent.core.proposal import ExpGen, Trace, Hypothesis +from rdagent.core.proposal import ExpGen, Hypothesis, Trace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend +from 
rdagent.scenarios.data_science.experiment.experiment import ( + DataLoaderExperiment, + EnsembleExperiment, + FeatureExperiment, + ModelExperiment, + WorkflowExperiment, +) from rdagent.utils.agent.tpl import T COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] @@ -46,6 +50,7 @@ def __str__(self) -> str: Concise Knowledge: {self.concise_knowledge} """ + class DSExpGen(ExpGen): """Data Science Task Generator.""" @@ -54,7 +59,7 @@ def gen(self, trace: Trace) -> Experiment: for h, _, hf in trace.hist: if hf.decision: successful_components.add(h.component) - + def is_complete(): """is all components complete""" return set(ORDER) == successful_components @@ -64,12 +69,12 @@ def last_successful_component(com: COMPONENT) -> Experiment: if hf.decision and h.component == com: return exp raise RuntimeError(f"No successful {com} component generated yet.") - + if is_complete(): # base info scenario = trace.scen.get_scenario_all_desc() hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) - + # 1. hypothesis gen # TODO: how to generate sota solution sota_solution = "" @@ -78,13 +83,15 @@ def last_successful_component(com: COMPONENT) -> Experiment: scenario=scenario, hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - ) + ) user_prompt = T(".prompts:hypothesis_gen.user").r( targets="data science project", hypothesis_and_feedback=hypothesis_and_feedback, - ) + ) - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)) + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + ) hypothesis = DSHypothesis( hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), reason=resp_dict.get("reason", "Reason not provided"), @@ -94,7 +101,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), component=resp_dict.get("component", "Component not provided"), ) - + # 2. 
gen experiment if hypothesis.component == "DataLoadSpec": data_loader_task_output_format = T(".prompts:output_format.data_loader").r() @@ -103,19 +110,25 @@ def last_successful_component(com: COMPONENT) -> Experiment: scenario=scenario, hypothesis=hypothesis, task_output_format=data_loader_task_output_format, - ) + ) user_prompt = T(".prompts:task_gen.user").r( targets="Data loader and specification generation", hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + ) dt = DataLoaderTask( name="Data loader and specification generation", - description=resp_dict.get("description", "Data loader and specification generation description not provided"), + description=resp_dict.get( + "description", "Data loader and specification generation description not provided" + ), ) - + return DataLoaderExperiment(sub_tasks=[dt], hypothesis=hypothesis) elif hypothesis.component == "FeatureEng": # TODO: RAG @@ -125,14 +138,20 @@ def last_successful_component(com: COMPONENT) -> Experiment: scenario=scenario, hypothesis=hypothesis, task_output_format=feature_task_output_format, - ) + ) user_prompt = T(".prompts:task_gen.user").r( targets="Feature Engineering", hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + ) + dependency_exp = last_successful_component("DataLoadSpec") + spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] tasks = [] for fn in resp_dict: ft = FeatureTask( @@ -140,88 +159,106 @@ def last_successful_component(com: COMPONENT) -> Experiment: description=resp_dict[fn].get("description", "Factor description not provided"), formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), variables=resp_dict[fn].get("variables", "Variables not provided"), - ) - + spec=spec, + ) + exp = FeatureExperiment(sub_tasks=tasks, hypothesis=hypothesis) - dependency_exp = last_successful_component("DataLoadSpec") exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Model": model_task_output_format = T(".prompts:output_format.model").r() - + system_prompt = T(".prompts:task_gen.system").r( targets="Models", scenario=scenario, hypothesis=hypothesis, task_output_format=model_task_output_format, - ) + ) user_prompt = T(".prompts:task_gen.user").r( targets="Models", hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + ) + dependency_exp = last_successful_component("FeatureEng") + spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] mt = ModelTask( name=resp_dict.get("model_name", 
"Model name not provided"), description=resp_dict.get("description", "Model description not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), + spec=spec, base_code="", ) - + exp = ModelExperiment(sub_tasks=[mt], hypothesis=hypothesis) - dependency_exp = last_successful_component("FeatureEng") exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Ensemble": ensemble_task_output_format = T(".prompts:output_format.ensemble").r() - + system_prompt = T(".prompts:task_gen.system").r( targets="Ensemble", scenario=scenario, hypothesis=hypothesis, task_output_format=ensemble_task_output_format, - ) + ) user_prompt = T(".prompts:task_gen.user").r( targets="Ensemble", hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + ) + dependency_exp = last_successful_component("Model") + spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), + spec=spec, ) exp = EnsembleExperiment(sub_tasks=[et], hypothesis=hypothesis) - dependency_exp = last_successful_component("Model") exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Workflow": workflow_task_output_format = T(".prompts:output_format.workflow").r() - + system_prompt = T(".prompts:task_gen.system").r( targets="Workflow", scenario=scenario, hypothesis=hypothesis, task_output_format=workflow_task_output_format, - ) + ) user_prompt = T(".prompts:task_gen.user").r( targets="Workflow", hypothesis=hypothesis, hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + ) + dependency_exp = last_successful_component("Ensemble") + spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), + spec=spec, ) exp = WorkflowExperiment(sub_tasks=[wt], hypothesis=hypothesis) - dependency_exp = last_successful_component("Ensemble") exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp else: @@ -241,13 +278,19 @@ def last_successful_component(com: COMPONENT) -> Experiment: targets="Data loader and specification generation", hypothesis=None, ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) dt = DataLoaderTask( name="Data loader and specification generation", - 
description=resp_dict.get("description", "Data loader and specification generation description not provided"), + description=resp_dict.get( + "description", "Data loader and specification generation description not provided" + ), ) - + exp = DataLoaderExperiment(sub_tasks=[dt]) return exp elif o == "FeatureEng": @@ -262,8 +305,14 @@ def last_successful_component(com: COMPONENT) -> Experiment: targets="Feature Engineering", hypothesis=None, ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + dependency_exp = last_successful_component("DataLoadSpec") + spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] tasks = [] for fn in resp_dict: ft = FeatureTask( @@ -271,10 +320,10 @@ def last_successful_component(com: COMPONENT) -> Experiment: description=resp_dict[fn].get("description", "Factor description not provided"), formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), variables=resp_dict[fn].get("variables", "Variables not provided"), - ) + spec=spec, + ) tasks.append(ft) exp = FeatureExperiment(sub_tasks=tasks) - dependency_exp = last_successful_component("DataLoadSpec") exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Model": @@ -289,17 +338,23 @@ def last_successful_component(com: COMPONENT) -> Experiment: targets="Models", hypothesis=None, ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + dependency_exp = last_successful_component("FeatureEng") + spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), + spec=spec, base_code="", ) exp = ModelExperiment(sub_tasks=[mt]) - dependency_exp = last_successful_component("FeatureEng") exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Ensemble": @@ -314,14 +369,20 @@ def last_successful_component(com: COMPONENT) -> Experiment: targets="Ensemble", hypothesis=None, ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + dependency_exp = last_successful_component("Model") + spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), + spec=spec, ) exp = EnsembleExperiment(sub_tasks=[et]) - dependency_exp = last_successful_component("Model") 
exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Workflow": @@ -336,15 +397,21 @@ def last_successful_component(com: COMPONENT) -> Experiment: targets="Workflow", hypothesis=None, ) - - resp_dict = json.loads(APIBackend().build_messages_and_create_chat_completion(user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True)) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + dependency_exp = last_successful_component("Ensemble") + spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), + spec=spec, ) exp = WorkflowExperiment(sub_tasks=[wt]) - dependency_exp = last_successful_component("Ensemble") exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp - + return super().gen(trace) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 7471efea8..17965b52d 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -22,7 +22,7 @@ class DataScienceScen(Scenario): def __init__(self, competition: str) -> None: self.competition = competition self.competition_descriptions = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) - + leaderboard = leaderboard_scores(competition) self.evaluation_metric_direction = float(leaderboard[0]) > float(leaderboard[-1]) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py index 34ebee258..26d845353 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py @@ -150,4 +150,4 @@ def model_workflow( else: test_pred = None - return val_pred, test_pred \ No newline at end of file + return val_pred, test_pred From 6980e1da361543731714fd0807eb8f13d285e36e Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Fri, 13 Dec 2024 06:44:26 +0000 Subject: [PATCH 051/304] rewrite ds model evaluate --- .../components/coder/data_science/model/es.py | 6 +- .../coder/data_science/model/eval.py | 84 +++++------ .../model_execute.py} | 12 +- .../coder/data_science/model/prompts.yaml | 137 ++++-------------- .../coder/data_science/model/test.py | 6 +- .../aerial-cactus-identification/model01.py | 4 +- 6 files changed, 75 insertions(+), 174 deletions(-) rename rdagent/components/coder/data_science/model/{model_execute_template.txt => eval_tests/model_execute.py} (79%) diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py index b233471a7..e06fc80fb 100644 --- a/rdagent/components/coder/data_science/model/es.py +++ b/rdagent/components/coder/data_science/model/es.py @@ -10,7 +10,9 @@ CoSTEERQueriedKnowledge, CoSTEERQueriedKnowledgeV2, ) -from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask +from rdagent.components.coder.data_science.model.exp import ModelTask + +from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend @@ -111,7 +113,7 @@ def assign_code_list_to_evo(self, code_list, 
evo): if code_list[index] is None: continue if evo.sub_workspace_list[index] is None: - evo.sub_workspace_list[index] = ModelFBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) # TODO: avoid hardcode of file name evo.sub_workspace_list[index].inject_code(**{"model01.py": code_list[index]}) return evo diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index f97cfe391..a50dfe4ec 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -2,10 +2,11 @@ Beyond previous tests - """ - +import json from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, + CoSTEERSingleFeedback, CoSTEERSingleFeedbackDeprecated, ) from rdagent.components.coder.data_science.model.eva_utils import ( @@ -13,11 +14,17 @@ ModelFinalEvaluator, expected_shape_evaluate, ) -from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.experiment import Task, Workspace +from rdagent.core.experiment import Task, Workspace, FBWorkspace +from rdagent.utils.env import DSDockerConf, DockerEnv +from rdagent.oai.llm_utils import APIBackend +from pathlib import Path +from rdagent.utils.agent.tpl import T + +DIRNAME = Path(__file__).absolute().resolve().parent -ModelSingleFeedback = CoSTEERSingleFeedbackDeprecated + +ModelSingleFeedback = CoSTEERSingleFeedback ModelMultiFeedback = CoSTEERMultiFeedback @@ -35,11 +42,11 @@ class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): def evaluate( self, target_task: Task, - implementation: Workspace, - gt_implementation: Workspace, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, queried_knowledge: QueriedKnowledge = None, **kwargs, - ) -> ModelSingleFeedback: + ) -> CoSTEERSingleFeedbackDeprecated: target_task_information = target_task.get_task_information() if ( queried_knowledge is not None @@ -58,54 +65,29 @@ def evaluate( # assert isinstance(target_task, ModelTask) batch_size = 8 - assert isinstance(implementation, ModelFBWorkspace) - model_execution_feedback, pred_list = implementation.execute( + assert isinstance(implementation, FBWorkspace) + """model_execution_feedback, pred_list= implementation.execute( batch_size=batch_size, + )""" + de = DockerEnv(conf=DSDockerConf()) + fname = "model_execute.py" + with (DIRNAME / "eval_tests" / "model_execute.py").open("r") as f: + test_code = f.read() + implementation.inject_code(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") + system_prompt = T(".prompts:model_eval.system").r( + test_code=test_code, + scenario="No scenario information yet.", + spec=target_task.spec, ) - shape_feedback = "" - if pred_list is None: - shape_feedback += "No output generated from the model. No shape evaluation conducted." 
- else: - val_pred_array, test_pred_array, hypers = pred_list - # spec_message = implementation.code_dict["spec/model.md"] - spec_message = target_task.spec - val_shape_feedback = expected_shape_evaluate( - val_pred_array, - spec_message, - model_execution_feedback=model_execution_feedback, - ) - test_shape_feedback = expected_shape_evaluate( - test_pred_array, - spec_message, - model_execution_feedback=model_execution_feedback, - ) - - shape_feedback += f"Validation Output: {val_shape_feedback}\n" - shape_feedback += f"Test Output: {test_shape_feedback}\n" - value_feedback = "The value feedback is ignored, and the value decision is automatically set as true." - code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate( - target_task=target_task, - implementation=implementation, - model_execution_feedback=model_execution_feedback, - ) - final_feedback, final_decision = ModelFinalEvaluator(scen=self.scen).evaluate( - target_task=target_task, - implementation=implementation, - model_execution_feedback=model_execution_feedback, - model_shape_feedback=shape_feedback, - model_code_feedback=code_feedback, + user_prompt = T(".prompts:model_eval.user").r( + stdout=stdout, + code=implementation.code_dict["model01.py"], ) + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return ModelSingleFeedback(**json.loads(resp)) - return ModelSingleFeedback( - execution_feedback=model_execution_feedback, - shape_feedback=shape_feedback, - value_feedback=value_feedback, - code_feedback=code_feedback, - final_feedback=final_feedback, - final_decision=final_decision, - value_generated_flag=(pred_list is not None), - final_decision_based_on_gt=False, - ) + """feedback""" class XXX2SpecEval: diff --git a/rdagent/components/coder/data_science/model/model_execute_template.txt b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py similarity index 79% rename from rdagent/components/coder/data_science/model/model_execute_template.txt rename to rdagent/components/coder/data_science/model/eval_tests/model_execute.py index 6d890deb9..a01387696 100644 --- a/rdagent/components/coder/data_science/model/model_execute_template.txt +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -30,6 +30,8 @@ test_X=test_X, hyper_params={} ) +#val_pred = np.random.rand(8, 1) +#test_pred = np.random.rand(8, 1) execution_feedback_str = "Execution successful.\n" if val_pred is not None: @@ -39,10 +41,6 @@ if test_pred is not None: execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" else: - execution_feedback_str += "Test predictions are None.\n" - -# Save the outputs -pred_list = [val_pred, test_pred, hypers] -pickle.dump(pred_list, open("pred_list.pkl", "wb")) -pickle.dump(execution_feedback_str, open("execution_feedback_str.pkl", "wb")) - + execution_feedback_str += "Test predictions are None.\n" '' + +print(execution_feedback_str) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index d831ffa40..7df80fc9d 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -1,44 +1,3 @@ -extract_model_formulation_system: |- - offer description of the proposed model in this paper, write a latex formula with variable as well as the architecture of the model. 
the format should be like - { - "model_name (The name of the model)": { - "description": "A detailed description of the model", - "formulation": "A LaTeX formula representing the model's formulation", - "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", - "variables": { - "\\hat{y}_u": "The predicted output for node u", - "variable_name_2": "Description of variable 2", - "variable_name_3": "Description of variable 3" - }, - "hyperparameters": { - "hyperparameter_name_1": "value of hyperparameter 1", - "hyperparameter_name_2": "value of hyperparameter 2", - "hyperparameter_name_3": "value of hyperparameter 3" - }, - "model_type": "Tabular or TimeSeries or Graph or XGBoost" # Should be one of "Tabular", "TimeSeries", "Graph", or "XGBoost" - } - } - Eg. - { - "ABC Model": { - "description": "A detailed description of the model", - "formulation": "A LaTeX formula representing the model's formulation", - "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", - "variables": { - "\\hat{y}_u": "The predicted output for node u", - "variable_name_2": "Description of variable 2", - "variable_name_3": "Description of variable 3" - }, - "hyperparameters": { - "hyperparameter_name_1": "value of hyperparameter 1", - "hyperparameter_name_2": "value of hyperparameter 2", - "hyperparameter_name_3": "value of hyperparameter 3" - }, - "model_type": "Tabular or TimeSeries or Graph or RandomForest or XGBoost" # If torch & Neural network models are required, the choice should be one of "Tabular", "TimeSeries", or "Graph" - } - } - such format content should be begin with ```json and end with ``` and the content should be in json format. - evolving_strategy_model_coder: system: |- User is trying to implement some pytorch models in the following scenario: @@ -102,80 +61,36 @@ evolving_strategy_model_coder: {% endfor %} {% endif %} -evaluator_shape_feedback: - system: |- - User is trying to evaluate whether a model output shape is correct or not. The correct message about the ground truth shape is given in spec.md as below: - {{ spec }} - - The user will provide you the actual output of the model. The model is a part for solving a task in an given scenario. This model takes train dataset as input. Valid and test dataset are optional. The model workflow will generate prediction output of valid and test dataset. - The user will provide the execution result message. - - Your job is to compare the output user provide and the message from spec.md to evaluate whether the user's model output is correct. - - In your response you should give a clear judgement and also point out the expected shape and actual shape of the model output. - Here is an example structure for the output: - Expected prediction shape: (8, 1). The actual output shape: (8, 1). The shape of the output is correct. - - user: |- - --------------Actual Output Shape:--------------- - {{ pre_shape }} - --------------Execution feedback:--------------- - {{ model_execution_feedback }} - -evaluator_code_feedback: +model_eval: system: |- + You are data scientist. User is trying to implement some models in the following scenario: {{ scenario }} User will provide you the information of the model. - - Your job is to check whether user's code is align with the model information and the scenario. - The user will provide the source python code and the execution error message if execution failed. 
-    The user might provide you the ground truth code for you to provide the critic. You should not leak the ground truth code to the user in any form but you can use it to provide the critic.
-
-    User has also compared the output generated by the user's code and the ground truth code. The user will provide you some analysis results comparing two output. You may find some error in the code which caused the difference between the two output.
-
-    If the ground truth code is provided, your critic should only consider checking whether the user's code is align with the ground truth code since the ground truth is definitely correct.
-    If the ground truth code is not provided, your critic should consider checking whether the user's code is reasonable and correct to the description and to the scenario.
-
-    Notice that your critics are not for user to debug the code. They are sent to the coding agent to correct the code. So don't give any following items for the user to check like "Please check the code line XXX".
-
-    You suggestion should not include any code, just some clear and short suggestions. Please point out very critical issues in your response, ignore non-important issues to avoid confusion. If no big issue found in the code, you can response "No critics found".
-
-    You should provide the suggestion to each of your critic to help the user improve the code. Please response the critic in the following format. Here is an example structure for the output:
-    critic 1: The critic message to critic 1
-    critic 2: The critic message to critic 2
-
-  user: |-
-    --------------Model information:---------------
-    {{ model_information }}
-    --------------Python code:---------------
-    {{ code }}
-    --------------Execution feedback:---------------
-    {{ model_execution_feedback }}
-
-
-evaluator_final_feedback:
-  system: |-
-    User is trying to implement a model in the following scenario:
-    {{ scenario }}
-    User has finished evaluation and got some feedback from the evaluator.
-    The evaluator run the code and get the output and provide several feedback regarding user's code and code output. You should analyze the feedback and considering the scenario and model description to give a final decision about the evaluation result. The final decision concludes whether the model is implemented correctly and if not, detail feedback containing reason and suggestion if the final decision is False.
-
-    The implementation final decision is considered in the following logic:
-    1. If the value and the ground truth value are exactly the same under a small tolerance, the implementation is considered correct.
-    2. If no ground truth value is not provided, the implementation is considered correct if the code execution is successful and the code feedback is align with the scenario and model description.
-
-    Please response the critic in the json format. Here is an example structure for the JSON output, please strictly follow the format:
+    The information about how to implement the model is given in spec.md as below:
+    {{ spec }}
+    You are testing the model with the following code:
+    ```python
+    {{test_code}}
+    ```
+    You should evaluate the code given by the user. You should check whether the user implemented it correctly, including whether the shape of the model's output is aligned with the request, the quality of the code, and anything else you think necessary.
+    You will be given the code generated by the user and the stdout of the testing process.
+ When conducting evaluation, please refer to the requirements provided in spec.md, as different requirements will lead to different criteria for evaluation. + For example, in some cases, the model's output may be required to have predictions for both the valid and test sets, while in other cases, only one of them may be required. Some cases may also require the model's hyperparameters to be preserved and outputted. + Please respond with your feedback in the following JSON format and order + ```json { - "final_decision": True, - "final_feedback": "The final feedback message", + "execution": "Describe whether the model execute successfully, including any errors or issues encountered.", + "return_checking": "Checks about the generated value, including whether the value generated. Especially compare the shape of model output and the requirement in spec.md.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "final_decision": } + ``` + user: |- - --------------Model information:--------------- - {{ model_information }} - --------------Model Execution feedback:--------------- - {{ model_execution_feedback }} - --------------Model shape feedback:--------------- - {{ model_shape_feedback }} - --------------Model Code feedback:--------------- - {{ model_code_feedback }} \ No newline at end of file + --------------Code generated by user:--------------- + {{ code }} + --------------stdoutput:--------------- + ''' + {{ stdout }} + ''' diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 4555ff08c..a65625c4d 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -15,6 +15,10 @@ from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.data_science.model.es import ModelMultiProcessEvolvingStrategy +from rdagent.core.experiment import FBWorkspace + # Take tasks, spec.md and feat as input, generate a feedback as output @@ -36,7 +40,7 @@ def develop_one_competition(competition: str): tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition injected_file_names = ["spec/model.md", "load_data.py", "feat01.py", "model01.py"] - modelexp = ModelFBWorkspace() + modelexp = FBWorkspace() for file_name in injected_file_names: file_path = tpl_ex_path / file_name modelexp.inject_code(**{file_name: file_path.read_text()}) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py index 26d845353..115e20ca3 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py @@ -25,7 +25,7 @@ def model_workflow( val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params, -) -> tuple[np.ndarray | None, np.ndarray | None]: +) -> tuple[np.ndarray | None, np.ndarray | None, dict]: """ Manages the workflow of a machine learning model, including training, validation, and testing. 
@@ -150,4 +150,4 @@ def model_workflow( else: test_pred = None - return val_pred, test_pred + return val_pred, test_pred, hyper_params \ No newline at end of file From 1b051e34677c547b7a672d065a776bbaa6ebb4b8 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 13 Dec 2024 07:56:07 +0000 Subject: [PATCH 052/304] fix data bug --- rdagent/components/coder/data_science/raw_data_loader/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 39c59f893..f352b4774 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -52,7 +52,7 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{DS_RD_SETTING.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated tempory content? From a37071e0ae2fb0729e353427ac19df96259c6281 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 13 Dec 2024 08:09:03 +0000 Subject: [PATCH 053/304] small code refinement on conf and other --- rdagent/app/data_science/conf.py | 52 +------------------ rdagent/app/data_science/loop.py | 2 +- rdagent/app/kaggle/conf.py | 26 ++++------ rdagent/app/kaggle/loop.py | 2 +- .../data_science/proposal/exp_gen.py | 2 +- .../kaggle/docker/mle_bench_docker/Dockerfile | 1 + rdagent/scenarios/kaggle/kaggle_crawler.py | 6 ++- 7 files changed, 21 insertions(+), 70 deletions(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 33cf0b575..156956dbe 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -1,8 +1,8 @@ -from rdagent.components.workflow.conf import BasePropSetting +from rdagent.app.kaggle.conf import KaggleBasePropSetting from rdagent.core.conf import ExtendedSettingsConfigDict -class DataScienceBasePropSetting(BasePropSetting): +class DataScienceBasePropSetting(KaggleBasePropSetting): model_config = ExtendedSettingsConfigDict(env_prefix="DS_", protected_namespaces=()) # Main components @@ -44,53 +44,5 @@ class DataScienceBasePropSetting(BasePropSetting): summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGExperiment2Feedback" """Summarizer class""" - # Configs - ## Base - competition: str = "" - """Kaggle competition name, e.g., 'sf-crime'""" - - template_path: str = "rdagent/scenarios/kaggle/experiment/templates" # TODO: we may not need this - """Kaggle competition base templates path""" - - local_data_path: str = "" - """Folder storing Kaggle competition data""" - - if_using_mle_data: bool = False - - ## Workflow - evolving_n: int = 10 - """Number of evolutions""" - - auto_submit: bool = False - """Automatically upload and submit each experiment result to Kaggle platform""" - - ### shared components in the workflow - # Conditionally set the knowledge_base based on the use of graph RAG - knowledge_base: str = "" - """Knowledge base class, uses 'KGKnowledgeGraph' when advanced graph-based RAG is enabled, otherwise empty.""" - - domain_knowledge_path: str = ( - "/data/userdata/share/kaggle/domain_knowledge" # TODO: It should be sth like knowledge_base_kwargs - ) - """Folder storing domain knowledge files in .case format""" - - knowledge_base_path: str = "kg_graph.pkl" - """Advanced version of 
graph-based RAG""" - - rag_path: str = "git_ignore_folder/kaggle_vector_base.pkl" - """Base version of vector-based RAG""" - - ## proposal - # (TODO: should goto sub config of proposal) - # Move to hypothesis_gen as a sub config instead of global config - if_action_choosing_based_on_UCB: bool = False - """Enable decision mechanism based on UCB algorithm""" - - if_using_vector_rag: bool = False - """Enable basic vector-based RAG""" - - if_using_graph_rag: bool = False - """Enable advanced graph-based RAG""" - DS_RD_SETTING = DataScienceBasePropSetting() diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 6bde5f352..11449ef96 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -155,7 +155,7 @@ def main(path=None, step_n=None, competition=None): DS_RD_SETTING.competition = competition if DS_RD_SETTING.competition: - download_data(competition=DS_RD_SETTING.competition, local_path=DS_RD_SETTING.local_data_path) + download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING) else: logger.error("Please specify competition name.") if path is None: diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py index 0d45d35a8..b0c9e7165 100644 --- a/rdagent/app/kaggle/conf.py +++ b/rdagent/app/kaggle/conf.py @@ -1,8 +1,7 @@ -from rdagent.components.workflow.conf import BasePropSetting -from rdagent.core.conf import ExtendedSettingsConfigDict +from rdagent.core.conf import ExtendedBaseSettings, ExtendedSettingsConfigDict -class KaggleBasePropSetting(BasePropSetting): +class KaggleBasePropSetting(ExtendedBaseSettings): model_config = ExtendedSettingsConfigDict(env_prefix="KG_", protected_namespaces=()) # 1) overriding the default @@ -45,12 +44,21 @@ class KaggleBasePropSetting(BasePropSetting): local_data_path: str = "" """Folder storing Kaggle competition data""" + if_using_mle_data: bool = False + auto_submit: bool = False + """Automatically upload and submit each experiment result to Kaggle platform""" + # Conditionally set the knowledge_base based on the use of graph RAG + knowledge_base: str = "" + """Knowledge base class, uses 'KGKnowledgeGraph' when advanced graph-based RAG is enabled, otherwise empty.""" if_action_choosing_based_on_UCB: bool = False """Enable decision mechanism based on UCB algorithm""" domain_knowledge_path: str = "/data/userdata/share/kaggle/domain_knowledge" """Folder storing domain knowledge files in .case format""" + knowledge_base_path: str = "kg_graph.pkl" + """Advanced version of graph-based RAG""" + rag_path: str = "git_ignore_folder/kaggle_vector_base.pkl" """Base version of vector-based RAG""" @@ -60,20 +68,8 @@ class KaggleBasePropSetting(BasePropSetting): if_using_graph_rag: bool = False """Enable advanced graph-based RAG""" - # Conditionally set the knowledge_base based on the use of graph RAG - knowledge_base: str = "" - """Knowledge base class, uses 'KGKnowledgeGraph' when advanced graph-based RAG is enabled, otherwise empty.""" - - knowledge_base_path: str = "kg_graph.pkl" - """Advanced version of graph-based RAG""" - - auto_submit: bool = False - """Automatically upload and submit each experiment result to Kaggle platform""" - mini_case: bool = False """Enable mini-case study for experiments""" - if_using_mle_data: bool = False - KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting() diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index fa822d354..c727425fa 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -132,7 +132,7 @@ 
def main(path=None, step_n=None, competition=None): """ if competition: KAGGLE_IMPLEMENT_SETTING.competition = competition - download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path) + download_data(competition=competition, settings=KAGGLE_IMPLEMENT_SETTING) if KAGGLE_IMPLEMENT_SETTING.if_using_graph_rag: KAGGLE_IMPLEMENT_SETTING.knowledge_base = ( "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph" diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 05ba903f2..054a83073 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -70,9 +70,9 @@ def last_successful_component(com: COMPONENT) -> Experiment: return exp raise RuntimeError(f"No successful {com} component generated yet.") + scenario = trace.scen.get_scenario_all_desc() if is_complete(): # base info - scenario = trace.scen.get_scenario_all_desc() hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) # 1. hypothesis gen diff --git a/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile b/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile index 227969351..f82f29979 100644 --- a/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile +++ b/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile @@ -7,6 +7,7 @@ RUN apt-get clean && apt-get update && apt-get install -y \ git \ build-essential \ git-lfs \ + unzip \ && rm -rf /var/lib/apt/lists/* RUN git clone https://github.com/openai/mle-bench.git diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 02f680afd..069ca655a 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -16,6 +16,7 @@ from selenium.webdriver.common.by import By from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.core.conf import ExtendedBaseSettings from rdagent.core.exception import KaggleError from rdagent.core.prompts import Prompts from rdagent.log import rdagent_logger as logger @@ -101,8 +102,9 @@ def kaggle_description_css_selectors() -> tuple[str, str]: return descriptions -def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.local_data_path) -> None: - if KAGGLE_IMPLEMENT_SETTING.if_using_mle_data: +def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPLEMENT_SETTING) -> None: + local_path = settings.local_data_path + if settings.if_using_mle_data: zipfile_path = f"{local_path}/zip_files" zip_competition_path = Path(zipfile_path) / competition if ( From f2ed789ed175b580cb04e5f67933671f3a7846dd Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Fri, 13 Dec 2024 08:13:03 +0000 Subject: [PATCH 054/304] load data in ds model --- .../coder/data_science/model/eval.py | 5 +++- .../model/eval_tests/model_execute.py | 24 +++++++++---------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index a50dfe4ec..b4165b662 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -20,6 +20,7 @@ from rdagent.oai.llm_utils import APIBackend from pathlib import Path from rdagent.utils.agent.tpl import T +from rdagent.app.data_science.conf import DS_RD_SETTING DIRNAME = Path(__file__).absolute().resolve().parent @@ -69,7 +70,9 @@ def 
evaluate( """model_execution_feedback, pred_list= implementation.execute( batch_size=batch_size, )""" - de = DockerEnv(conf=DSDockerConf()) + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + de = DockerEnv(conf=ds_docker_conf) fname = "model_execute.py" with (DIRNAME / "eval_tests" / "model_execute.py").open("r") as f: test_code = f.read() diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index a01387696..a6ba73777 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -7,31 +7,29 @@ import traceback import numpy as np -from model01 import model_workflow +from model01 import model_workflow +from load_data import load_from_raw_data +X, y, test_X, test_ids = load_from_raw_data() -# train_X = np.load("train_X.npy") -# train_y = np.load("train_y.npy") -# val_X = np.load("val_X.npy") -# val_y = np.load("val_y.npy") -# test_X = np.load("test_X.npy") -train_X = np.random.rand(8, 64, 64, 3) + +"""train_X = np.random.rand(8, 64, 64, 3) train_y = np.random.rand(8, 1) val_X = np.random.rand(8, 64, 64, 3) val_y = np.random.rand(8, 1) -test_X = np.random.rand(8, 64, 64, 3) +test_X = np.random.rand(8, 64, 64, 3)""" # Call model_workflow val_pred, test_pred, hypers = model_workflow( - X=train_X, - y=train_y, - val_X=val_X, - val_y=val_y, + X=X, + y=y, + val_X=None, + val_y=None, test_X=test_X, hyper_params={} ) #val_pred = np.random.rand(8, 1) -#test_pred = np.random.rand(8, 1) +test_pred = np.random.rand(8, 1) execution_feedback_str = "Execution successful.\n" if val_pred is not None: From 9bbf9a3bf298c76e918f07b4798316ab34ed2ec6 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Fri, 13 Dec 2024 08:22:23 +0000 Subject: [PATCH 055/304] fix some bugs --- .../coder/data_science/feature/__init__.py | 26 ++----------------- .../coder/data_science/feature/eval.py | 11 +++++--- .../feature/eval_tests/feature_test.py | 18 ++++++++++--- .../coder/data_science/feature/prompts.yaml | 1 + .../coder/data_science/feature/test.py | 5 ---- .../spec/feature.md | 2 +- 6 files changed, 25 insertions(+), 38 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 6ca4ca891..2a0300aab 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -1,23 +1,3 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class FeatureCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# FeatureCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = FeatureMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) - import json from rdagent.components.coder.CoSTEER import CoSTEER @@ -30,9 +10,7 @@ CoSTEERQueriedKnowledge, ) from 
rdagent.components.coder.data_science.feature.exp import FeatureTask -from rdagent.components.coder.data_science.raw_data_loader.eval import ( - DataLoaderCoSTEEREvaluator, -) +from rdagent.components.coder.data_science.feature.eval import FeatureCoSTEEREvaluator from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T @@ -87,7 +65,7 @@ def __init__( **kwargs, ) -> None: eva = CoSTEERMultiEvaluator( - DataLoaderCoSTEEREvaluator(scen=scen), scen=scen + FeatureCoSTEEREvaluator(scen=scen), scen=scen ) # Please specify whether you agree running your eva in parallel or not es = FeatureMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index de8e55b30..2ab83b5b9 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -10,10 +10,11 @@ CoSTEERSingleFeedbackDeprecated, ) from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.experiment import FBWorkspace, Task, Workspace +from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.app.data_science.conf import DS_RD_SETTING DIRNAME = Path(__file__).absolute().resolve().parent @@ -47,7 +48,9 @@ def evaluate( final_decision=False, ) - de = DockerEnv(conf=DSDockerConf()) + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated tempory content? 
fname = "feature_test.py" @@ -56,8 +59,8 @@ def evaluate( implementation.inject_code(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - system_prompt = T(".prompts:feature.system").r(test_code=test_code) - user_prompt = T(".prompts:feature.user").r(stdout=stdout) + system_prompt = T(".prompts:feature_eval.system").r(test_code=test_code) + user_prompt = T(".prompts:feature_eval.user").r(stdout=stdout) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return FeatureEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index 2bedf04b3..6d02835ea 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -10,7 +10,10 @@ import logging import pickle -from feat01 import feature_eng +import pandas as pd +import numpy as np + +from feat01 import feat_eng # Setup logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") @@ -29,9 +32,16 @@ assert len(X_test) == len(test_ids), "Mismatch in length of test images and test IDs" assert len(X) == len(y), "Mismatch in length of training images and labels" # Check for missing values -assert not X.isnull().values.any(), "Missing values found in training data" -assert not X_test.isnull().values.any(), "Missing values found in test data" -assert not y.isnull().values.any(), "Missing values found in labels" +if isinstance(X, pd.DataFrame): + assert not X.isnull().values.any(), "Missing values found in training data" + assert not X_test.isnull().values.any(), "Missing values found in test data" + assert not y.isnull().values.any(), "Missing values found in labels" +elif isinstance(X, np.ndarray): + assert not np.isnan(X).any(), "Missing values found in training data" + assert not np.isnan(X_test).any(), "Missing values found in test data" + assert not np.isnan(y).any(), "Missing values found in labels" +else: + raise TypeError("Unsupported data type for X and y") logging.info("Data loader test passed successfully. Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index ae5a29b36..9d3287b94 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -2,6 +2,7 @@ feature: system: |- You are a Python data scientist working on a new project. This project involves implementing feature engineering techniques to prepare data for machine learning models, and this project code will be written by GPT. Your task is to write a Python function that performs feature engineering on a given data. + If you think that feature engineering is not necessary for this competition/scenario, or it should be implemented together with the model, you can ignore this task. You should follow the provided specifications to complete this task. Please response the code in the following json format. 
Here is an example structure for the JSON output: diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 1032d14a8..6bdab1d63 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -6,16 +6,11 @@ - it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) """ -import pickle - from rdagent.components.coder.data_science.feature import FeatureCoSTEER from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.scenarios.data_science.experiment.experiment import FeatureExperiment from rdagent.scenarios.data_science.scen import DataScienceScen -# from rdagent.components.coder.data_science.feature.es import ModelMultiProcessEvolvingStrategy - - def develop_one_competition(competition: str): # -> experiment scen = DataScienceScen(competition=competition) feature_coder = FeatureCoSTEER(scen) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md index 433dd4c04..60139d6af 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md @@ -4,7 +4,7 @@ - Implement a function for feature engineering with the following signature: ```python -def feature_eng(X: np.ndarray, y: np.ndarray | None = None, X_fit: np.ndarray | None = None, y_fit: np.ndarray | None = None, param: object | None = None) -> tuple[np.ndarray, np.ndarray | None, object]: +def feat_eng(X: np.ndarray, y: np.ndarray | None = None, X_fit: np.ndarray | None = None, y_fit: np.ndarray | None = None, param: object | None = None) -> tuple[np.ndarray, np.ndarray | None, object]: """ Perform feature engineering on the input data. 
From 93b66565c5f2b70ae67afc48dba2880765a41a42 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Sun, 15 Dec 2024 12:47:55 +0000 Subject: [PATCH 056/304] a debug llm tool app --- rdagent/app/data_science/loop.py | 186 +++++++++--------- .../coder/data_science/feature/__init__.py | 2 +- .../coder/data_science/feature/eval.py | 2 +- .../feature/eval_tests/feature_test.py | 3 +- .../coder/data_science/feature/test.py | 1 + .../components/coder/data_science/model/es.py | 1 - .../coder/data_science/model/eval.py | 14 +- .../model/eval_tests/model_execute.py | 52 +++-- .../coder/data_science/model/test.py | 5 +- rdagent/core/evolving_agent.py | 67 +++---- rdagent/log/logger.py | 15 ++ rdagent/log/ui/llm_st.py | 137 +++++++++++++ rdagent/oai/llm_utils.py | 6 +- .../aerial-cactus-identification/model01.py | 2 +- rdagent/utils/agent/tpl.py | 15 +- rdagent/utils/workflow.py | 27 +-- 16 files changed, 347 insertions(+), 188 deletions(-) create mode 100644 rdagent/log/ui/llm_st.py diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 11449ef96..8bc17667c 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -28,52 +28,50 @@ class DataScienceRDLoop(RDLoop): @measure_time def __init__(self, PROP_SETTING: BasePropSetting): - - with logger.tag("init"): - scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) - logger.log_object(scen, tag="scenario") - - ### shared components in the workflow # TODO: check if - knowledge_base = ( - import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen) - if PROP_SETTING.knowledge_base != "" - else None - ) - logger.log_object(knowledge_base, tag="knowledge_base") - - # 1) task generation from scratch - # self.scratch_gen: tuple[HypothesisGen, Hypothesis2Experiment] = DummyHypothesisGen(scen), - - # 2) task generation from a complete solution - self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) - self.data_loader_coder: DataLoaderCoSTEER = import_class(PROP_SETTING.data_loader_coder)(scen) - # self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) - # logger.log_object(self.hypothesis_gen, tag="hypothesis generator") - # self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() - # logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment") - - # TODO: we need more coder - # self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen) - # logger.log_object(self.feature_coder, tag="feature coder") - # self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)( - # scen - # ) - # logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder") - # self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) - # logger.log_object(self.model_coder, tag="model coder") - - # TODO: now we only need on runner - # self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) - # logger.log_object(self.feature_runner, tag="feature runner") - # self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) - # logger.log_object(self.model_runner, tag="model runner") - - # self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) - # logger.log_object(self.summarizer, tag="summarizer") - - # self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) - self.trace = Trace(scen=scen) - super(RDLoop, self).__init__() 
+ scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) + logger.log_object(scen, tag="scenario") + + ### shared components in the workflow # TODO: check if + knowledge_base = ( + import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen) + if PROP_SETTING.knowledge_base != "" + else None + ) + logger.log_object(knowledge_base, tag="knowledge_base") + + # 1) task generation from scratch + # self.scratch_gen: tuple[HypothesisGen, Hypothesis2Experiment] = DummyHypothesisGen(scen), + + # 2) task generation from a complete solution + self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) + self.data_loader_coder: DataLoaderCoSTEER = import_class(PROP_SETTING.data_loader_coder)(scen) + # self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) + # logger.log_object(self.hypothesis_gen, tag="hypothesis generator") + # self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() + # logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment") + + # TODO: we need more coder + # self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen) + # logger.log_object(self.feature_coder, tag="feature coder") + # self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)( + # scen + # ) + # logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder") + # self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) + # logger.log_object(self.model_coder, tag="model coder") + + # TODO: now we only need on runner + # self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) + # logger.log_object(self.feature_runner, tag="feature runner") + # self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) + # logger.log_object(self.model_runner, tag="model runner") + + # self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + # logger.log_object(self.summarizer, tag="summarizer") + + # self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) + self.trace = Trace(scen=scen) + super(RDLoop, self).__init__() @measure_time def direct_exp_gen(self, prev_out: dict[str, Any]): @@ -83,18 +81,17 @@ def direct_exp_gen(self, prev_out: dict[str, Any]): @measure_time def coding(self, prev_out: dict[str, Any]): - with logger.tag("d"): # develop - exp = self.data_loader_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # if prev_out["direct_exp_gen"]["propose"].action in [ - # KG_ACTION_FEATURE_ENGINEERING, - # KG_ACTION_FEATURE_PROCESSING, - # ]: - # exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: - # exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # else: - # exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # logger.log_object(exp.sub_workspace_list, tag="coder result") + exp = self.data_loader_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + # if prev_out["direct_exp_gen"]["propose"].action in [ + # KG_ACTION_FEATURE_ENGINEERING, + # KG_ACTION_FEATURE_PROCESSING, + # ]: + # exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + # elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: + # exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) 
+ # else: + # exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + # logger.log_object(exp.sub_workspace_list, tag="coder result") return exp @measure_time @@ -102,43 +99,42 @@ def running(self, prev_out: dict[str, Any]): if not self.exp_gen.is_complete(): raise NextLoopExcpetion() - with logger.tag("ef"): # evaluate and feedback - if prev_out["direct_exp_gen"]["propose"].action in [ - KG_ACTION_FEATURE_ENGINEERING, - KG_ACTION_FEATURE_PROCESSING, - ]: - exp = self.feature_runner.develop(prev_out["coding"]) - else: - exp = self.model_runner.develop(prev_out["coding"]) - logger.log_object(exp, tag="runner result") - if DS_RD_SETTING.competition in [ - "optiver-realized-volatility-prediction", - "covid19-global-forecasting-week-1", - ]: - try: - python_files_to_notebook(DS_RD_SETTING.competition, exp.experiment_workspace.workspace_path) - except Exception as e: - logger.error(f"Merge python files to one file failed: {e}") - if DS_RD_SETTING.auto_submit: - csv_path = exp.experiment_workspace.workspace_path / "submission.csv" - try: - subprocess.run( - [ - "kaggle", - "competitions", - "submit", - "-f", - str(csv_path.absolute()), - "-m", - str(csv_path.parent.absolute()), - DS_RD_SETTING.competition, - ], - check=True, - ) - except subprocess.CalledProcessError as e: - logger.error(f"Auto submission failed: \n{e}") - except Exception as e: - logger.error(f"Other exception when use kaggle api:\n{e}") + if prev_out["direct_exp_gen"]["propose"].action in [ + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + ]: + exp = self.feature_runner.develop(prev_out["coding"]) + else: + exp = self.model_runner.develop(prev_out["coding"]) + logger.log_object(exp, tag="runner result") + if DS_RD_SETTING.competition in [ + "optiver-realized-volatility-prediction", + "covid19-global-forecasting-week-1", + ]: + try: + python_files_to_notebook(DS_RD_SETTING.competition, exp.experiment_workspace.workspace_path) + except Exception as e: + logger.error(f"Merge python files to one file failed: {e}") + if DS_RD_SETTING.auto_submit: + csv_path = exp.experiment_workspace.workspace_path / "submission.csv" + try: + subprocess.run( + [ + "kaggle", + "competitions", + "submit", + "-f", + str(csv_path.absolute()), + "-m", + str(csv_path.parent.absolute()), + DS_RD_SETTING.competition, + ], + check=True, + ) + except subprocess.CalledProcessError as e: + logger.error(f"Auto submission failed: \n{e}") + except Exception as e: + logger.error(f"Other exception when use kaggle api:\n{e}") return exp diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 2a0300aab..ee5644fe3 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -9,8 +9,8 @@ from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) -from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.feature.eval import FeatureCoSTEEREvaluator +from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index 2ab83b5b9..a083fbed4 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ 
b/rdagent/components/coder/data_science/feature/eval.py @@ -3,6 +3,7 @@ from os import system from pathlib import Path +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, @@ -14,7 +15,6 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf -from rdagent.app.data_science.conf import DS_RD_SETTING DIRNAME = Path(__file__).absolute().resolve().parent diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index 6d02835ea..5a594d909 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -10,9 +10,8 @@ import logging import pickle -import pandas as pd import numpy as np - +import pandas as pd from feat01 import feat_eng # Setup logging diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 6bdab1d63..48eecd50b 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -11,6 +11,7 @@ from rdagent.scenarios.data_science.experiment.experiment import FeatureExperiment from rdagent.scenarios.data_science.scen import DataScienceScen + def develop_one_competition(competition: str): # -> experiment scen = DataScienceScen(competition=competition) feature_coder = FeatureCoSTEER(scen) diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py index e06fc80fb..9423a2868 100644 --- a/rdagent/components/coder/data_science/model/es.py +++ b/rdagent/components/coder/data_science/model/es.py @@ -11,7 +11,6 @@ CoSTEERQueriedKnowledgeV2, ) from rdagent.components.coder.data_science.model.exp import ModelTask - from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index b4165b662..7a93cd653 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -2,7 +2,11 @@ Beyond previous tests - """ + import json +from pathlib import Path + +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, @@ -15,12 +19,10 @@ expected_shape_evaluate, ) from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.experiment import Task, Workspace, FBWorkspace -from rdagent.utils.env import DSDockerConf, DockerEnv +from rdagent.core.experiment import FBWorkspace, Task, Workspace from rdagent.oai.llm_utils import APIBackend -from pathlib import Path from rdagent.utils.agent.tpl import T -from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.utils.env import DockerEnv, DSDockerConf DIRNAME = Path(__file__).absolute().resolve().parent @@ -70,8 +72,8 @@ def evaluate( """model_execution_feedback, pred_list= implementation.execute( batch_size=batch_size, )""" - ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf = DSDockerConf() + 
ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) fname = "model_execute.py" with (DIRNAME / "eval_tests" / "model_execute.py").open("r") as f: diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index a6ba73777..045ca91eb 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -2,43 +2,37 @@ adapt for cv models """ -import os -import pickle -import traceback - -import numpy as np -from model01 import model_workflow +import os +import pickle +import traceback + +import numpy as np from load_data import load_from_raw_data +from model01 import model_workflow + X, y, test_X, test_ids = load_from_raw_data() - + """train_X = np.random.rand(8, 64, 64, 3) train_y = np.random.rand(8, 1) val_X = np.random.rand(8, 64, 64, 3) val_y = np.random.rand(8, 1) test_X = np.random.rand(8, 64, 64, 3)""" - - -# Call model_workflow -val_pred, test_pred, hypers = model_workflow( - X=X, - y=y, - val_X=None, - val_y=None, - test_X=test_X, - hyper_params={} -) -#val_pred = np.random.rand(8, 1) + + +# Call model_workflow +val_pred, test_pred, hypers = model_workflow(X=X, y=y, val_X=None, val_y=None, test_X=test_X, hyper_params={}) +# val_pred = np.random.rand(8, 1) test_pred = np.random.rand(8, 1) - -execution_feedback_str = "Execution successful.\n" -if val_pred is not None: - execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" -else: - execution_feedback_str += "Validation predictions are None.\n" -if test_pred is not None: - execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" -else: - execution_feedback_str += "Test predictions are None.\n" '' + +execution_feedback_str = "Execution successful.\n" +if val_pred is not None: + execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" +else: + execution_feedback_str += "Validation predictions are None.\n" +if test_pred is not None: + execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" +else: + execution_feedback_str += "Test predictions are None.\n" "" print(execution_feedback_str) diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index a65625c4d..575f08cb6 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -13,12 +13,9 @@ ModelGeneralCaseSpecEvaluator, ) from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask +from rdagent.core.experiment import FBWorkspace from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment from rdagent.scenarios.data_science.scen import DataScienceScen -from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -from rdagent.components.coder.data_science.model.es import ModelMultiProcessEvolvingStrategy -from rdagent.core.experiment import FBWorkspace - # Take tasks, spec.md and feat as input, generate a feedback as output diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index e196d0c0a..6c5d923e5 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -58,43 +58,44 @@ def multistep_evolve( eva: Evaluator | Feedback, filter_final_evo: bool = False, ) -> EvolvableSubjects: - for _ in 
tqdm(range(self.max_loop), "Implementing"): - # 1. knowledge self-evolving - if self.knowledge_self_gen and self.rag is not None: - self.rag.generate_knowledge(self.evolving_trace) - # 2. RAG - queried_knowledge = None - if self.with_knowledge and self.rag is not None: - # TODO: Putting the evolving trace in here doesn't actually work - queried_knowledge = self.rag.query(evo, self.evolving_trace) + for evo_loop_id in tqdm(range(self.max_loop), "Implementing"): + with logger.tag(f"evo_loop_{evo_loop_id}"): + # 1. knowledge self-evolving + if self.knowledge_self_gen and self.rag is not None: + self.rag.generate_knowledge(self.evolving_trace) + # 2. RAG + queried_knowledge = None + if self.with_knowledge and self.rag is not None: + # TODO: Putting the evolving trace in here doesn't actually work + queried_knowledge = self.rag.query(evo, self.evolving_trace) - # 3. evolve - evo = self.evolving_strategy.evolve( - evo=evo, - evolving_trace=self.evolving_trace, - queried_knowledge=queried_knowledge, - ) - # TODO: Due to design issues, we have chosen to ignore this mypy error. - logger.log_object(evo.sub_workspace_list, tag="evolving code") # type: ignore[attr-defined] - for sw in evo.sub_workspace_list: # type: ignore[attr-defined] - logger.info(f"evolving code workspace: {sw}") + # 3. evolve + evo = self.evolving_strategy.evolve( + evo=evo, + evolving_trace=self.evolving_trace, + queried_knowledge=queried_knowledge, + ) + # TODO: Due to design issues, we have chosen to ignore this mypy error. + logger.log_object(evo.sub_workspace_list, tag="evolving code") # type: ignore[attr-defined] + for sw in evo.sub_workspace_list: # type: ignore[attr-defined] + logger.info(f"evolving code workspace: {sw}") - # 4. Pack evolve results - es = EvoStep(evo, queried_knowledge) + # 4. Pack evolve results + es = EvoStep(evo, queried_knowledge) - # 5. Evaluation - if self.with_feedback: - es.feedback = ( - # TODO: Due to the irregular design of rdagent.core.evaluation.Evaluator, - # it fails mypy's test here, so we'll ignore this error for now. - eva - if isinstance(eva, Feedback) - else eva.evaluate(evo, queried_knowledge=queried_knowledge) # type: ignore[arg-type, call-arg] - ) - logger.log_object(es.feedback, tag="evolving feedback") + # 5. Evaluation + if self.with_feedback: + es.feedback = ( + # TODO: Due to the irregular design of rdagent.core.evaluation.Evaluator, + # it fails mypy's test here, so we'll ignore this error for now. + eva + if isinstance(eva, Feedback) + else eva.evaluate(evo, queried_knowledge=queried_knowledge) # type: ignore[arg-type, call-arg] + ) + logger.log_object(es.feedback, tag="evolving feedback") - # 6. update trace - self.evolving_trace.append(es) + # 6. 
update trace + self.evolving_trace.append(es) if self.with_feedback and filter_final_evo: evo = self.filter_evolvable_subjects_by_feedback(evo, self.evolving_trace[-1].feedback) return evo diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py index 87caf32c6..ec194f463 100644 --- a/rdagent/log/logger.py +++ b/rdagent/log/logger.py @@ -1,3 +1,4 @@ +import json import os import sys from contextlib import contextmanager @@ -113,6 +114,20 @@ def log_object(self, obj: object, *, tag: str = "") -> None: caller_info = get_caller_info() tag = f"{self._tag}.{tag}.{self.get_pids()}".strip(".") + if "debug_" in tag: + debug_log_path = self.log_trace_path / "debug_llm.json" + debug_data = {"tag": tag, "obj": obj} + if debug_log_path.exists(): + with debug_log_path.open("r+", encoding="utf-8") as f: + existing_data = json.load(f) + existing_data.append(debug_data) + f.seek(0) + json.dump(existing_data, f, ensure_ascii=False, indent=4) + else: + with debug_log_path.open("w", encoding="utf-8") as f: + json.dump([debug_data], f, ensure_ascii=False, indent=4) + return + logp = self.storage.log(obj, name=tag, save_type="pkl") file_handler_id = logger.add( diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py new file mode 100644 index 000000000..052ace2eb --- /dev/null +++ b/rdagent/log/ui/llm_st.py @@ -0,0 +1,137 @@ +import argparse +import json +from pathlib import Path + +import streamlit as st +from streamlit import session_state + +st.set_page_config(layout="wide", page_title="debug_llm", page_icon="🎓", initial_sidebar_state="expanded") + +# 获取log_path参数 +parser = argparse.ArgumentParser(description="RD-Agent Streamlit App") +parser.add_argument("--log_dir", type=str, help="Path to the log directory") +args = parser.parse_args() +if args.log_dir: + main_log_path = Path(args.log_dir) + if not main_log_path.exists(): + st.error(f"Log dir `{main_log_path}` does not exist!") + st.stop() +else: + main_log_path = Path("./log") + +if "data" not in session_state: + session_state.data = [] +if "log_path" not in session_state: + session_state.log_path = None + +eset = set() + + +def load_data(): + try: + with open(f"{main_log_path}/{session_state.log_path}/debug_llm.json", "r") as f: + session_state.data = json.load(f) + except Exception as e: + session_state.data = [{"error": str(e)}] + + +# Add a button to refresh the data +with st.sidebar: + lc1, lc2 = st.columns([1, 2], vertical_alignment="center") + with lc1: + st.markdown(":blue[**Log Path**]") + with lc2: + manually = st.toggle("Manual Input") + if manually: + st.text_input("log path", key="log_path", label_visibility="collapsed") + else: + folders = sorted( + [folder for folder in main_log_path.iterdir() if folder.is_dir()], + key=lambda folder: folder.stat().st_mtime, + reverse=True, + ) + folders = [folder.name for folder in folders] + + st.selectbox(f"**Select from `{main_log_path.absolute()}`**", folders, key="log_path") + + if st.button("Refresh Data"): + load_data() + st.rerun() + + expand_all = st.toggle("Expand All", key="expand_all") + + +def show_text(text, lang=None): + if lang is not None: + st.code(text, language=lang, wrap_lines=True) + elif "```py" in text: + st.code(text, language="python", wrap_lines=True) + else: + st.code(text, language="html", wrap_lines=True) + + +def highlight_prompts_uri(uri): + parts = uri.split(":") + return f"**{parts[0]}:**:green[**{parts[1]}**]" + + +# Display the data +for d in session_state.data: + tag = d["tag"] + obj = d["obj"] + if "evo_loop_" in tag: + tags = tag.split(".") + for t in 
tags: + if "evo_loop_" in t: + etag = t + break + if etag not in eset: + eset.add(etag) + st.subheader(f"**{etag}**", anchor=etag, divider="rainbow") + if "debug_tpl" in tag: + uri = obj["uri"] + tpl = obj["template"] + cxt = obj["context"] + rd = obj["rendered"] + + with st.expander(highlight_prompts_uri(uri), expanded=expand_all, icon="⚙️"): + t1, t2, t3 = st.tabs([":blue[**Template**]", ":orange[**Context**]", ":green[**Rendered**]"]) + with t1: + show_text(tpl, lang="django") + with t2: + st.json(cxt) + with t3: + show_text(rd) + elif "debug_llm" in tag: + system = obj.get("system", None) + user = obj["user"] + resp = obj["resp"] + + with st.expander(f"**LLM**", expanded=expand_all, icon="🤖"): + t1, t2, t3 = st.tabs([":green[**Response**]", ":blue[**User**]", ":orange[**System**]"]) + with t3: + if system is None: + st.text("In session, no system prompt") + else: + show_text(system) + with t2: + show_text(user) + with t1: + try: + rdict = json.loads(resp) + if "code" in rdict: + code = rdict["code"] + st.code(code, language="python", wrap_lines=True) + rdict.pop("code") + elif "spec" in rdict: + spec = rdict["spec"] + st.markdown(spec) + rdict.pop("spec") + st.write(":red[**Other parts (except for the code or spec) in response dict:**]") + st.json(rdict) + except: + st.json(resp) + +with st.sidebar: + et_toc = "\n".join(f"- [**{etag}**](#{etag})" for etag in sorted(eset)) + st.markdown(et_toc, unsafe_allow_html=True) diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index abe9e4ed6..2182d0ef3 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -215,6 +215,7 @@ def build_chat_completion(self, user_prompt: str, **kwargs: Any) -> str: chat_completion=True, **kwargs, ) + logger.log_object({"user": user_prompt, "resp": response}, tag="debug_llm") messages.append( { @@ -493,12 +494,15 @@ def build_messages_and_create_chat_completion( former_messages, shrink_multiple_break=shrink_multiple_break, ) - return self._try_create_chat_completion_or_embedding( + + resp = self._try_create_chat_completion_or_embedding( messages=messages, chat_completion=True, chat_cache_prefix=chat_cache_prefix, **kwargs, ) + logger.log_object({"system": system_prompt, "user": user_prompt, "resp": resp}, tag="debug_llm") + return resp def create_embedding(self, input_content: str | list[str], **kwargs: Any) -> list[Any] | Any: input_content_list = [input_content] if isinstance(input_content, str) else input_content diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py index 115e20ca3..da2af05a0 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py @@ -150,4 +150,4 @@ def model_workflow( else: test_pred = None - return val_pred, test_pred, hyper_params \ No newline at end of file + return val_pred, test_pred, hyper_params diff --git a/rdagent/utils/agent/tpl.py b/rdagent/utils/agent/tpl.py index f3a5bd940..6b4e24c2a 100644 --- a/rdagent/utils/agent/tpl.py +++ b/rdagent/utils/agent/tpl.py @@ -12,6 +12,7 @@ from jinja2 import Environment, StrictUndefined from rdagent.core.utils import SingletonBaseClass +from rdagent.log import rdagent_logger as logger DIRNAME = Path(__file__).absolute().resolve().parent PROJ_PATH = DIRNAME.parent.parent @@ -37,6 +38,7 @@ def __init__(self, uri: str): the loaded content will be saved in `self.template` """ + self.uri = uri # Inspect the calling 
stack to get the caller's directory stack = inspect.stack() caller_frame = stack[1] @@ -49,6 +51,7 @@ def __init__(self, uri: str): if path_part.startswith("."): yaml_file_path = caller_dir / f"{path_part[1:].replace('.', '/')}.yaml" + self.uri = f"{str(caller_dir.relative_to(PROJ_PATH)).replace('/', '.')}{uri}" else: yaml_file_path = (PROJ_PATH / path_part.replace(".", "/")).with_suffix(".yaml") @@ -66,7 +69,17 @@ def r(self, **context: Any): """ Render the template with the given context. """ - return Environment(undefined=StrictUndefined).from_string(self.template).render(**context) + rendered = Environment(undefined=StrictUndefined).from_string(self.template).render(**context) + logger.log_object( + obj={ + "uri": self.uri, + "template": self.template, + "context": context, + "rendered": rendered, + }, + tag="debug_tpl", + ) + return rendered T = RDAT # shortcuts diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index ece190625..8ee89a962 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -106,19 +106,20 @@ def run(self, step_n: int | None = None): start = datetime.datetime.now(datetime.timezone.utc) name = self.steps[si] - func = getattr(self, name) - try: - self.loop_prev_out[name] = func(self.loop_prev_out) - # TODO: Fix the error logger.exception(f"Skip loop {li} due to {e}") - except self.skip_loop_error as e: - logger.warning(f"Skip loop {li} due to {e}") - self.loop_idx += 1 - self.step_idx = 0 - continue - except CoderError as e: - logger.warning(f"Traceback loop {li} due to {e}") - self.step_idx = 0 - continue + with logger.tag(name): + func = getattr(self, name) + try: + self.loop_prev_out[name] = func(self.loop_prev_out) + # TODO: Fix the error logger.exception(f"Skip loop {li} due to {e}") + except self.skip_loop_error as e: + logger.warning(f"Skip loop {li} due to {e}") + self.loop_idx += 1 + self.step_idx = 0 + continue + except CoderError as e: + logger.warning(f"Traceback loop {li} due to {e}") + self.step_idx = 0 + continue end = datetime.datetime.now(datetime.timezone.utc) From b283cb18bb84104c0978ce018bf38fa03367f422 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 16 Dec 2024 03:00:12 +0000 Subject: [PATCH 057/304] model task base_code added --- rdagent/scenarios/data_science/proposal/exp_gen.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 054a83073..aced71998 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -68,7 +68,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: for h, exp, hf in reversed(trace.hist): if hf.decision and h.component == com: return exp - raise RuntimeError(f"No successful {com} component generated yet.") + return None scenario = trace.scen.get_scenario_all_desc() if is_complete(): @@ -346,13 +346,18 @@ def last_successful_component(com: COMPONENT) -> Experiment: ) dependency_exp = last_successful_component("FeatureEng") spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] + if last_model_exp:=last_successful_component("Model"): + # TODO: model only have one (named "model.py")? 
+ base_code = last_model_exp.experiment_workspace.code_dict["model.py"] + else: + base_code = "" mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), spec=spec, - base_code="", + base_code=base_code, ) exp = ModelExperiment(sub_tasks=[mt]) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) From 1728a90fc40ae15a92b6c9caeec3655f68173ce4 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Mon, 16 Dec 2024 06:34:41 +0000 Subject: [PATCH 058/304] return code dict in ds model evolvingstrategy --- .../components/coder/data_science/model/es.py | 19 +- .../coder/data_science/model/eva_utils.py | 196 ------------------ .../coder/data_science/model/eval.py | 7 +- .../coder/data_science/model/exp.py | 37 ---- 4 files changed, 11 insertions(+), 248 deletions(-) delete mode 100644 rdagent/components/coder/data_science/model/eva_utils.py diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py index 9423a2868..5240bfc86 100644 --- a/rdagent/components/coder/data_science/model/es.py +++ b/rdagent/components/coder/data_science/model/es.py @@ -11,7 +11,6 @@ CoSTEERQueriedKnowledgeV2, ) from rdagent.components.coder.data_science.model.exp import ModelTask -from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend @@ -24,7 +23,7 @@ def implement_one_task( self, target_task: ModelTask, queried_knowledge: CoSTEERQueriedKnowledge | None = None, - ) -> str: + ) -> dict[str, str]: model_information_str = target_task.get_task_information() queried_similar_successful_knowledge = ( @@ -86,7 +85,7 @@ def implement_one_task( elif len(queried_similar_successful_knowledge_to_render) > 1: queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge_to_render[1:] - code = json.loads( + model_code = json.loads( # APIBackend(use_chat_cache=CoSTEER_SETTINGS.coder_use_cache).build_messages_and_create_chat_completion( APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, @@ -94,14 +93,16 @@ def implement_one_task( json_mode=True, ), )["code"] - return code + return{ + "model01.py":model_code, + } """ import pandas as pd def Model(): pass """ - def assign_code_list_to_evo(self, code_list, evo): + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): """ Assign the code list to the evolving item. 
@@ -112,7 +113,7 @@ def assign_code_list_to_evo(self, code_list, evo): if code_list[index] is None: continue if evo.sub_workspace_list[index] is None: - evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) - # TODO: avoid hardcode of file name - evo.sub_workspace_list[index].inject_code(**{"model01.py": code_list[index]}) - return evo + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_code(**code_list[index]) + return evo \ No newline at end of file diff --git a/rdagent/components/coder/data_science/model/eva_utils.py b/rdagent/components/coder/data_science/model/eva_utils.py deleted file mode 100644 index 1d4dac331..000000000 --- a/rdagent/components/coder/data_science/model/eva_utils.py +++ /dev/null @@ -1,196 +0,0 @@ -import json -from pathlib import Path -from typing import Tuple - -import numpy as np -from jinja2 import Environment, StrictUndefined - -from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask -from rdagent.core.evaluation import Evaluator -from rdagent.core.experiment import Task, Workspace -from rdagent.core.prompts import Prompts -from rdagent.oai.llm_conf import LLM_SETTINGS -from rdagent.oai.llm_utils import APIBackend - -evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - - -def expected_shape_evaluate( - prediction: np.ndarray, - spec_message: str, - model_execution_feedback: str, -) -> str: - if prediction is None: - return "No output generated from the model. Skip value evaluation" - elif spec_message is None: - return ("No spec provided. Shape evaluation not impractical",) - else: - pre_shape = prediction.shape - - system_prompt = ( - Environment(undefined=StrictUndefined) - .from_string(evaluate_prompts["evaluator_shape_feedback"]["system"]) - .render(spec=(spec_message if spec_message is not None else "No spec description provided.")) - ) - - execution_feedback_to_render = model_execution_feedback - - for _ in range(10): # 10 times to split the content is enough - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - evaluate_prompts["evaluator_shape_feedback"]["user"], - ) - .render( - pre_shape=pre_shape, - model_execution_feedback=execution_feedback_to_render, - ) - ) - if ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - ) - > LLM_SETTINGS.chat_token_limit - ): - execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] - else: - break - - critic_response = APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=False, - ) - - return critic_response - - -class ModelCodeEvaluator(Evaluator): - def evaluate( - self, - target_task: Task, - implementation: Workspace, - model_execution_feedback: str = "", - ): - assert isinstance(target_task, ModelTask) - assert isinstance(implementation, ModelFBWorkspace) - - model_task_information = target_task.get_task_information() - # TODO: avoid hardcode of file name - code = implementation.code_dict["model01.py"] - - system_prompt = ( - Environment(undefined=StrictUndefined) - .from_string(evaluate_prompts["evaluator_code_feedback"]["system"]) - .render( - scenario=( - # TODO: Here replaced with ds scen information - # self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type) - # if self.scen 
is not None - # else "No scenario description." - "No scenario description." - ) - ) - ) - - execution_feedback_to_render = model_execution_feedback - for _ in range(10): # 10 times to split the content is enough - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - evaluate_prompts["evaluator_code_feedback"]["user"], - ) - .render( - model_information=model_task_information, - code=code, - model_execution_feedback=execution_feedback_to_render, - ) - ) - if ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - ) - > LLM_SETTINGS.chat_token_limit - ): - execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] - else: - break - - critic_response = APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=False, - ) - - return critic_response, None - - -class ModelFinalEvaluator(Evaluator): - def evaluate( - self, - target_task: Task, - implementation: Workspace, - model_execution_feedback: str, - model_shape_feedback: str, - model_code_feedback: str, - ): - assert isinstance(target_task, ModelTask) - assert isinstance(implementation, ModelFBWorkspace) - - system_prompt = ( - Environment(undefined=StrictUndefined) - .from_string(evaluate_prompts["evaluator_final_feedback"]["system"]) - .render( - scenario=( - # TODO: Here replaced with ds scen information - # self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type) - # if self.scen is not None - # else "No scenario description." - "No scenario description." - ) - ) - ) - - execution_feedback_to_render = model_execution_feedback - - for _ in range(10): # 10 times to split the content is enough - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - evaluate_prompts["evaluator_final_feedback"]["user"], - ) - .render( - model_information=target_task.get_task_information(), - model_execution_feedback=execution_feedback_to_render, - model_shape_feedback=model_shape_feedback, - model_code_feedback=model_code_feedback, - ) - ) - if ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - ) - > LLM_SETTINGS.chat_token_limit - ): - execution_feedback_to_render = execution_feedback_to_render[len(execution_feedback_to_render) // 2 :] - else: - break - - final_evaluation_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=True, - ), - ) - if isinstance(final_evaluation_dict["final_decision"], str) and final_evaluation_dict[ - "final_decision" - ].lower() in ("true", "false"): - final_evaluation_dict["final_decision"] = bool(final_evaluation_dict["final_decision"]) - return ( - final_evaluation_dict["final_feedback"], - final_evaluation_dict["final_decision"], - ) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 7a93cd653..738cc08d8 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -13,13 +13,8 @@ CoSTEERSingleFeedback, CoSTEERSingleFeedbackDeprecated, ) -from rdagent.components.coder.data_science.model.eva_utils import ( - ModelCodeEvaluator, - ModelFinalEvaluator, - expected_shape_evaluate, -) from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.experiment import FBWorkspace, Task, Workspace 
+from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index 317e3d519..8e88dd30d 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -53,40 +53,3 @@ def from_dict(dict): def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" - - -class ModelFBWorkspace(FBWorkspace): - def execute( - self, - batch_size: int = 8, - ): - super().execute() - try: - de = DockerEnv(DSDockerConf()) - de.prepare() - - # self.code_dict["spec.md"] - # TODO: generate dataset automatically - - dump_code = (Path(__file__).parent / "model_execute_template.txt").read_text() - - log, results = de.dump_python_code_run_and_get_results( - code=dump_code, - dump_file_names=["execution_feedback_str.pkl", "pred_list.pkl"], - local_path=str(self.workspace_path), - env={}, - code_dump_file_py_name="model_test", - ) - if results is None: - raise RuntimeError(f"Error in running the model code: {log}") - [execution_feedback_str, pred_list] = results - - except Exception as e: - execution_feedback_str = f"Execution error: {e}\nTraceback: {traceback.format_exc()}" - pred_list = None - - if len(execution_feedback_str) > 2000: - execution_feedback_str = ( - execution_feedback_str[:1000] + "....hidden long error message...." + execution_feedback_str[-1000:] - ) - return execution_feedback_str, pred_list From a23731f70ab17e6db0c90188414953f97687b001 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 16 Dec 2024 07:20:29 +0000 Subject: [PATCH 059/304] fix ds_scen description_template --- .../scenarios/data_science/scen/prompts.yaml | 24 +++++++++++++++++++ rdagent/scenarios/data_science/scen/scen.py | 5 ++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index d9f14d250..06b8e1926 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -4,3 +4,27 @@ scen_desc: -| ------The expected output & submission format specifications------ {{scen.submission_specifications}} + +description_template: -| + system: |- + You are an assistant that extracts structured information from unstructured text. + The user will provide you a Kaggle competition description, and you need to extract specific details from it. + For the dataset, the competition may not include detailed information about the dataset. The user has read the dataset and provide you the relevant information. Please include it in your response. + Please answer in Json format with the following schema: + { + "Competition Type": "The type of competition, e.g., 'Classification', 'Regression', 'Clustering', 'Prediction", "Time-Series Forecasting", + "Competition Description": "A brief description of the competition", + "Target Description": "A description of the target variable to be predicted", + "Competition Features": "Two-line description of the overall features involved within the competition as background." + "Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output." 
+ "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1." + "Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred." + } + Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together. + + + user: |- + Competition Description: + {{ competition_descriptions }} + Evaluation_metric_direction: + {{ evaluation_metric_direction }} \ No newline at end of file diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 17965b52d..1921f351c 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -29,10 +29,9 @@ def __init__(self, competition: str) -> None: self._analysis_competition_description() def _analysis_competition_description(self): - sys_prompt = T("scenarios.kaggle.experiment.prompts:kg_description_template.system").r() - user_prompt = T("scenarios.kaggle.experiment.prompts:kg_description_template.user").r( + sys_prompt = T(".prompts:description_template.system").r() + user_prompt = T(".prompts:description_template.user").r( competition_descriptions=self.competition_descriptions, - raw_data_information=self.source_data, evaluation_metric_direction=self.evaluation_metric_direction, ) From 1178618207b511ff518523ee087ea407feb00149 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 16 Dec 2024 07:26:54 +0000 Subject: [PATCH 060/304] redundent prompts.yaml --- .../coder/data_science/raw_data_loader/prompts.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index fd08bdb7c..7b224a539 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -1,12 +1,3 @@ -data_loader_execute_code: |- - # execute and cache the preprocessed data - import pickle - from load_data import load_data - data = load_data() - with open('data.pkl', 'wb') as f: - pickle.dump(data, f) - - spec: system: |- You are a Python data scientist working on a new kaggle competition project. This project will be used to analyze data and build models to predict future outcomes, and this project codes will be written by GPT. 
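The description template earlier in this patch tells the model it "can use some regex to group them together" for very similar, one-hot style column names, but leaves the grouping itself abstract. A minimal sketch of what such grouping could look like; the column names are made up for illustration and are not taken from any competition:

```python
# Hedged sketch: collapse one-hot style column names onto their shared prefix.
import re
from collections import defaultdict

columns = ["color_red", "color_blue", "color_green", "age", "city_NY", "city_LA"]

groups = defaultdict(list)
for col in columns:
    m = re.match(r"^(.*)_[^_]+$", col)  # strip a trailing category token, if any
    groups[m.group(1) if m else col].append(col)

# groups -> {"color": [3 columns], "age": ["age"], "city": [2 columns]}
```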
From 505cc4f4fc76ebfaeed990655cda731372502f02 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Mon, 16 Dec 2024 07:33:09 +0000 Subject: [PATCH 061/304] fix some bugs --- .../coder/data_science/model/eval_tests/model_execute.py | 4 +++- rdagent/components/coder/data_science/model/test.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index 045ca91eb..57f8221d0 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -9,8 +9,10 @@ import numpy as np from load_data import load_from_raw_data from model01 import model_workflow +from sklearn.model_selection import train_test_split X, y, test_X, test_ids = load_from_raw_data() +train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) """train_X = np.random.rand(8, 64, 64, 3) @@ -21,7 +23,7 @@ # Call model_workflow -val_pred, test_pred, hypers = model_workflow(X=X, y=y, val_X=None, val_y=None, test_X=test_X, hyper_params={}) +val_pred, test_pred, hypers = model_workflow(X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=test_X, hyper_params={}) # val_pred = np.random.rand(8, 1) test_pred = np.random.rand(8, 1) diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 575f08cb6..186215564 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -12,7 +12,7 @@ from rdagent.components.coder.data_science.model.eval import ( ModelGeneralCaseSpecEvaluator, ) -from rdagent.components.coder.data_science.model.exp import ModelFBWorkspace, ModelTask +from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.core.experiment import FBWorkspace from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment from rdagent.scenarios.data_science.scen import DataScienceScen From f18478dfe458c7e875840f82da4dc0b18544d526 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 16 Dec 2024 08:07:54 +0000 Subject: [PATCH 062/304] feature test change --- .../coder/data_science/feature/eval_tests/feature_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index 5a594d909..029de0f52 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -19,7 +19,6 @@ # Load data from load_data import load_from_raw_data -from sklearn.model_selection import train_test_split X, y, X_test, test_ids = load_from_raw_data() @@ -43,6 +42,3 @@ raise TypeError("Unsupported data type for X and y") logging.info("Data loader test passed successfully. 
Length of test images matches length of test IDs.") - -with open("data.pkl", "wb") as f: - pickle.dump((X, y, X_test, test_ids), f) From a108072b91a13668ac7b8f0e46704dfb90054d37 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 16 Dec 2024 08:35:15 +0000 Subject: [PATCH 063/304] fix a bug --- .../scenarios/data_science/scen/prompts.yaml | 55 +++++++++++++++++-- rdagent/scenarios/data_science/scen/scen.py | 2 +- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index 06b8e1926..bc5e4052c 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -5,7 +5,7 @@ scen_desc: -| ------The expected output & submission format specifications------ {{scen.submission_specifications}} -description_template: -| +description_template: system: |- You are an assistant that extracts structured information from unstructured text. The user will provide you a Kaggle competition description, and you need to extract specific details from it. @@ -21,10 +21,57 @@ description_template: -| "Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred." } Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together. - - user: |- Competition Description: {{ competition_descriptions }} Evaluation_metric_direction: - {{ evaluation_metric_direction }} \ No newline at end of file + {{ evaluation_metric_direction }} + +competition_background: |- + You are solving a data science tasks and the type of the competition is {{ competition_type }}. + The competition description is: {{competition_description}}. + + We provide an overall script in file: train.py. The user will run the train.py script along with several feature and model scripts to train several model to get a good performance on this task. + + The train.py script is as follows: + ```python + {{ train_script }} + ``` + + The final output of our pipeline is from a ensemble of up to four models. Each model is trained on a different subset of the data. + The four model types are: XGBoost, RandomForest, LightGBM and Neural Network (A Pytorch model). + About the Neural Network model, You can try different architectures and hyperparameters to improve the performance. You can even use a pytorch model to ensemble the other three types of models. Try to open your mind on the NN model. + + The data is extracted from the competition dataset, focusing on relevant attributes in {{ competition_features }}. + + The user firstly designs and implements a feature book for each model. The feature book is a combination of several features and feature groups. + The feature book is built from: + - Raw features: The raw features are the original features from the dataset. + - generated features: The generated features are the features that are calculated based on the raw features according to some formulations. The calculation should be align with some physical or logical meaning. Don't just simply apply some numeric operations to the raw features. + - feature groups: The feature groups are preprocessed group of features from the raw features like normalization, one hot encoding, etc. 
+ The feature or feature group is defined in the following parts: + - Name: The name of the feature or feature group. + - Description: A description of the feature or feature group. + - Formulation: The formulation of the feature or feature group. + - Variables: The variable list used in the formulation. Notice: The variable should be a specific feature in the dataset. Please make sure the feature name is exactly the same as the feature name in the dataset. + + For each model, the user will design and implement the model in a separate script. + The model is defined in the following parts: + - Name: The name of the model. + - Description: A description of the model. + - Architecture: The detailed architecture of the model, such as neural network layers or tree structures. + - ModelType: The type of the model, which should be one of ["XGBoost", "RandomForest", "LightGBM", "NN"]. + The model should provide clear and detailed documentation of its architecture and hyperparameters. + + The user tries to optimize the performance iteratively by employing one of the feature related or model related action items: + - Feature related: + - "Feature engineering": The user will design several new tasks and implement several new features. The new feature might only affect the model using all the feature book. + - "Feature processing": The user will design a new task to process the feature book like normalization or one hot encoding to improve the model performance. Any processing with help of a deep model is not included in this task. + - Model related: + - "Model feature selection": The user will modify one model to select the part of the features from the feature book to improve the model performance. + - "Model tuning": The user will tune the hyperparameters of XGBoost, RandomForest or LightGBM or build or improve the NN model to improve the model performance. + Notice: You can automatically optimize the hyperparameters of the model using some library when training the model. Since we don't have a lot of time to train the model, please use a small number of trials to optimize the hyperparameters. + Our validation set split is not deterministic, so when you are using hyperparameter tuning, you can merge training and validation and use cross validation method to tune the hyperparameters. + One you have determine the best model parameter, you should retrain the model on all training and validation set to get the final model. + + For each loop, you need to help user decide which action item to choose and provide the corresponding code to implement the action item. 
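As an aside on the tuning guidance above (merge the training and validation data, tune with cross-validation using only a few trials, then retrain on everything), here is a minimal sketch of that pattern with scikit-learn; the estimator, parameter grid, and random data are placeholders for illustration, not part of the template:

```python
# Hedged sketch: small cross-validated search, then a final refit on all data.
# `X`, `y`, and the tiny parameter grid are illustrative stand-ins.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = np.random.rand(200, 8), np.random.randint(0, 2, 200)  # merged train + validation

search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={"n_estimators": [100, 300], "max_depth": [None, 8]},
    cv=3,  # keep the number of folds/trials small, as the prompt suggests
)
search.fit(X, y)

# Retrain on all available data with the selected hyperparameters.
final_model = RandomForestClassifier(random_state=42, **search.best_params_).fit(X, y)
```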
\ No newline at end of file diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 1921f351c..79e0de7df 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -68,7 +68,7 @@ def get_competition_full_desc(self) -> str: @property def background(self) -> str: - background_template = T("scenarios.kaggle.experiment.prompts:kg_background") + background_template = T(".prompts:competition_background") background_prompt = background_template.r( competition_type=self.competition_type, competition_description=self.competition_description, From 636a1d578bf043556325cb07cd0670247f37a961 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 16 Dec 2024 08:37:30 +0000 Subject: [PATCH 064/304] fix prompts.yaml --- rdagent/scenarios/data_science/scen/prompts.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index 06b8e1926..203fd1aa6 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -1,11 +1,11 @@ -scen_desc: -| +scen_desc: |- ------Background of the scenario------ {{scen.background}} ------The expected output & submission format specifications------ {{scen.submission_specifications}} -description_template: -| +description_template: system: |- You are an assistant that extracts structured information from unstructured text. The user will provide you a Kaggle competition description, and you need to extract specific details from it. From d7b6ca4a320f14fd8164a72f80fb3a743ade00d1 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 16 Dec 2024 08:51:47 +0000 Subject: [PATCH 065/304] remove feature test local path --- rdagent/components/coder/data_science/feature/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 48eecd50b..6306d0fb4 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -17,7 +17,7 @@ def develop_one_competition(competition: str): # -> experiment feature_coder = FeatureCoSTEER(scen) with open( - "/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r" + "./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r" ) as file: feat_spec = file.read() @@ -28,7 +28,7 @@ def develop_one_competition(competition: str): # -> experiment ) with open( - "/home/v-yuanteli/RD-Agent/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r" + "./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r" ) as file: load_data_code = file.read() exp.experiment_workspace.inject_code(**{"load_data.py": load_data_code}) From 5c4490f5d49326fca51e42c3166d21f1061ef715 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 16 Dec 2024 09:25:56 +0000 Subject: [PATCH 066/304] refine the structure of scene --- .../scenarios/data_science/scen/prompts.yaml | 71 ++++--------------- rdagent/scenarios/data_science/scen/scen.py | 44 +++++------- 2 files changed, 32 insertions(+), 83 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index a4f7fd7c6..cdb68ef77 100644 --- 
a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -5,73 +5,30 @@ scen_desc: |- ------The expected output & submission format specifications------ {{scen.submission_specifications}} -description_template: +competition_description_template: system: |- - You are an assistant that extracts structured information from unstructured text. + You are a data science assistant that extracts structured information from unstructured text. The user will provide you a Kaggle competition description, and you need to extract specific details from it. For the dataset, the competition may not include detailed information about the dataset. The user has read the dataset and provide you the relevant information. Please include it in your response. Please answer in Json format with the following schema: { - "Competition Type": "The type of competition, e.g., 'Classification', 'Regression', 'Clustering', 'Prediction", "Time-Series Forecasting", - "Competition Description": "A brief description of the competition", - "Target Description": "A description of the target variable to be predicted", - "Competition Features": "Two-line description of the overall features involved within the competition as background." + "Competition Task Type": "The type of competition task, e.g., 'Classification', 'Regression', 'Clustering', 'Recommendation", "Time-Series Forecasting", + "Competition Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'", + "Competition Brief Description": "A brief description of the competition", + "Competition Target Description": "A description of the target variable to be predicted", "Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output." "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1." - "Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred." } - Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together. user: |- Competition Description: - {{ competition_descriptions }} - Evaluation_metric_direction: - {{ evaluation_metric_direction }} + {{ competition_raw_description }} competition_background: |- - You are solving a data science tasks and the type of the competition is {{ competition_type }}. - The competition description is: {{competition_description}}. - - We provide an overall script in file: train.py. The user will run the train.py script along with several feature and model scripts to train several model to get a good performance on this task. + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + You are dedicated to producing accurate, efficient, and innovative solutions. 
- The train.py script is as follows: - ```python - {{ train_script }} - ``` - - The final output of our pipeline is from a ensemble of up to four models. Each model is trained on a different subset of the data. - The four model types are: XGBoost, RandomForest, LightGBM and Neural Network (A Pytorch model). - About the Neural Network model, You can try different architectures and hyperparameters to improve the performance. You can even use a pytorch model to ensemble the other three types of models. Try to open your mind on the NN model. - - The data is extracted from the competition dataset, focusing on relevant attributes in {{ competition_features }}. - - The user firstly designs and implements a feature book for each model. The feature book is a combination of several features and feature groups. - The feature book is built from: - - Raw features: The raw features are the original features from the dataset. - - generated features: The generated features are the features that are calculated based on the raw features according to some formulations. The calculation should be align with some physical or logical meaning. Don't just simply apply some numeric operations to the raw features. - - feature groups: The feature groups are preprocessed group of features from the raw features like normalization, one hot encoding, etc. - The feature or feature group is defined in the following parts: - - Name: The name of the feature or feature group. - - Description: A description of the feature or feature group. - - Formulation: The formulation of the feature or feature group. - - Variables: The variable list used in the formulation. Notice: The variable should be a specific feature in the dataset. Please make sure the feature name is exactly the same as the feature name in the dataset. - - For each model, the user will design and implement the model in a separate script. - The model is defined in the following parts: - - Name: The name of the model. - - Description: A description of the model. - - Architecture: The detailed architecture of the model, such as neural network layers or tree structures. - - ModelType: The type of the model, which should be one of ["XGBoost", "RandomForest", "LightGBM", "NN"]. - The model should provide clear and detailed documentation of its architecture and hyperparameters. - - The user tries to optimize the performance iteratively by employing one of the feature related or model related action items: - - Feature related: - - "Feature engineering": The user will design several new tasks and implement several new features. The new feature might only affect the model using all the feature book. - - "Feature processing": The user will design a new task to process the feature book like normalization or one hot encoding to improve the model performance. Any processing with help of a deep model is not included in this task. - - Model related: - - "Model feature selection": The user will modify one model to select the part of the features from the feature book to improve the model performance. - - "Model tuning": The user will tune the hyperparameters of XGBoost, RandomForest or LightGBM or build or improve the NN model to improve the model performance. - Notice: You can automatically optimize the hyperparameters of the model using some library when training the model. Since we don't have a lot of time to train the model, please use a small number of trials to optimize the hyperparameters. 
- Our validation set split is not deterministic, so when you are using hyperparameter tuning, you can merge training and validation and use cross validation method to tune the hyperparameters. - One you have determine the best model parameter, you should retrain the model on all training and validation set to get the final model. - - For each loop, you need to help user decide which action item to choose and provide the corresponding code to implement the action item. \ No newline at end of file + The task type for this competition is {{ competition_task_type }}. + The data type used in this competition is {{ competition_data_type }}. + Briefly, the competition involves: {{ competition_brief_description }}. + #TODO: Add more details about the competition? \ No newline at end of file diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 79e0de7df..a519f1ef2 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -21,18 +21,17 @@ class DataScienceScen(Scenario): def __init__(self, competition: str) -> None: self.competition = competition - self.competition_descriptions = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) + self.competition_raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) leaderboard = leaderboard_scores(competition) - self.evaluation_metric_direction = float(leaderboard[0]) > float(leaderboard[-1]) + self.competition_metric_direction = float(leaderboard[0]) > float(leaderboard[-1]) self._analysis_competition_description() def _analysis_competition_description(self): - sys_prompt = T(".prompts:description_template.system").r() - user_prompt = T(".prompts:description_template.user").r( - competition_descriptions=self.competition_descriptions, - evaluation_metric_direction=self.evaluation_metric_direction, + sys_prompt = T(".prompts:competition_description_template.system").r() + user_prompt = T(".prompts:competition_description_template.user").r( + competition_raw_description=self.competition_raw_description, ) response_analysis = APIBackend().build_messages_and_create_chat_completion( @@ -42,41 +41,34 @@ def _analysis_competition_description(self): ) response_json_analysis = json.loads(response_analysis) - self.competition_type = response_json_analysis.get("Competition Type", "No type provided") - self.competition_description = response_json_analysis.get("Competition Description", "No description provided") - self.target_description = response_json_analysis.get("Target Description", "No target provided") - self.competition_features = response_json_analysis.get("Competition Features", "No features provided") + self.competition_task_type = response_json_analysis.get("Competition Task Type", "No type provided") + self.competition_data_type = response_json_analysis.get("Competition Data Type", "No data type provided") + self.competition_brief_description = response_json_analysis.get("Competition Brief Description", "No brief description provided") + self.competition_target_description = response_json_analysis.get("Competition Target Description", "No target description provided") self.submission_specifications = response_json_analysis.get( "Submission Specifications", "No submission requirements provided" ) self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) - self.evaluation_desc = response_json_analysis.get( - "Evaluation Description", "No evaluation specification provided." 
- ) def get_competition_full_desc(self) -> str: - evaluation_direction = "higher the better" if self.evaluation_metric_direction else "lower the better" - return f"""Competition Type: {self.competition_type} - Competition Description: {self.competition_description} - Target Description: {self.target_description} - Competition Features: {self.competition_features} + return f"""Competition Task Type: {self.competition_task_type} + Competition Data Type: {self.competition_data_type} + Competition Brief Description: {self.competition_brief_description} + Competition Target Description: {self.competition_target_description} Submission Specifications: {self.submission_specifications} Model Output Channel: {self.model_output_channel} - Evaluation Descriptions: {self.evaluation_desc} - Is the evaluation metric the higher the better: {evaluation_direction} """ @property def background(self) -> str: background_template = T(".prompts:competition_background") background_prompt = background_template.r( - competition_type=self.competition_type, - competition_description=self.competition_description, - target_description=self.target_description, - competition_features=self.competition_features, + competition_task_type=self.competition_task_type, + competition_data_type=self.competition_data_type, + competition_brief_description=self.competition_brief_description, + target_description=self.competition_target_description, submission_specifications=self.submission_specifications, - evaluation_desc=self.evaluation_desc, - evaluate_bool=self.evaluation_metric_direction, + evaluate_bool=self.competition_metric_direction, ) return background_prompt From 6cf001ed51da18d1a555684d49acae788bb9b07f Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 16 Dec 2024 10:02:51 +0000 Subject: [PATCH 067/304] fix some bugs --- .../data_science/raw_data_loader/__init__.py | 4 +- .../scenarios/data_science/scen/prompts.yaml | 21 +++++---- rdagent/scenarios/data_science/scen/scen.py | 45 +++++++++++-------- 3 files changed, 41 insertions(+), 29 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index ecb7bc53c..5964e6b23 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -1,6 +1,6 @@ """ -Loop should not large change excclude +Loop should not large change exclude - Action Choice[current data loader & spec] - other should share - Propose[choice] => Task[Choice] => CoSTEER => @@ -50,7 +50,7 @@ def implement_one_task( ) -> dict[str, str]: # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. - competition_info = self.scen.competition_descriptions + competition_info = self.scen.get_scenario_all_desc() # 1. specifications system_prompt = T(".prompts:spec.system").r(competition_info=competition_info) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index cdb68ef77..b6bde18ce 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -1,9 +1,10 @@ -scen_desc: |- +scenario_description: |- ------Background of the scenario------ {{scen.background}} ------The expected output & submission format specifications------ {{scen.submission_specifications}} + The evaluation metric used is directed as: {{scen.metric_direction}}. 
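One subtlety about the scenario_description template above: later in this patch it is rendered with a plain dict (scen=self.to_dict), and dotted lookups such as {{scen.background}} still resolve because Jinja2 falls back to item access when attribute access fails. A minimal sketch of that behaviour, with placeholder values rather than real scenario fields:

```python
# Hedged sketch: Jinja2 resolves `scen.background` against a plain dict via item lookup.
from jinja2 import Environment, StrictUndefined

tpl = "------Background of the scenario------\n{{scen.background}}\nMetric direction: {{scen.metric_direction}}"
scen = {"background": "toy background text", "metric_direction": "maximize"}
print(Environment(undefined=StrictUndefined).from_string(tpl).render(scen=scen))
```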
competition_description_template: system: |- @@ -12,10 +13,11 @@ competition_description_template: For the dataset, the competition may not include detailed information about the dataset. The user has read the dataset and provide you the relevant information. Please include it in your response. Please answer in Json format with the following schema: { - "Competition Task Type": "The type of competition task, e.g., 'Classification', 'Regression', 'Clustering', 'Recommendation", "Time-Series Forecasting", - "Competition Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'", - "Competition Brief Description": "A brief description of the competition", - "Competition Target Description": "A description of the target variable to be predicted", + "Task Type": "The type of competition task, e.g., 'Classification', 'Regression', 'Clustering', 'Recommendation", "Time-Series Forecasting", + "Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'", + "Brief Description": "A brief description of the competition", + "Data Description": "A detailed description of the dataset used in the competition, including its source, structure, and any relevant characteristics", + "Target Description": "A description of the target variable to be predicted", "Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output." "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1." } @@ -28,7 +30,8 @@ competition_background: |- Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. You are dedicated to producing accurate, efficient, and innovative solutions. - The task type for this competition is {{ competition_task_type }}. - The data type used in this competition is {{ competition_data_type }}. - Briefly, the competition involves: {{ competition_brief_description }}. - #TODO: Add more details about the competition? \ No newline at end of file + The task type for this competition is {{ task_type }}. + The data type used in this competition is {{ data_type }}. + Briefly, the competition involves: {{ brief_description }}. + The dataset used in this competition is: {{ data_description }}. + Your goal in this competition is to: {{target_description }}. 
\ No newline at end of file diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index a519f1ef2..930bb56be 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -21,17 +21,17 @@ class DataScienceScen(Scenario): def __init__(self, competition: str) -> None: self.competition = competition - self.competition_raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) + self.raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) leaderboard = leaderboard_scores(competition) - self.competition_metric_direction = float(leaderboard[0]) > float(leaderboard[-1]) + self.metric_direction = "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" self._analysis_competition_description() def _analysis_competition_description(self): sys_prompt = T(".prompts:competition_description_template.system").r() user_prompt = T(".prompts:competition_description_template.user").r( - competition_raw_description=self.competition_raw_description, + competition_raw_description=self.raw_description, ) response_analysis = APIBackend().build_messages_and_create_chat_completion( @@ -41,20 +41,22 @@ def _analysis_competition_description(self): ) response_json_analysis = json.loads(response_analysis) - self.competition_task_type = response_json_analysis.get("Competition Task Type", "No type provided") - self.competition_data_type = response_json_analysis.get("Competition Data Type", "No data type provided") - self.competition_brief_description = response_json_analysis.get("Competition Brief Description", "No brief description provided") - self.competition_target_description = response_json_analysis.get("Competition Target Description", "No target description provided") + self.task_type = response_json_analysis.get("Task Type", "No type provided") + self.data_type = response_json_analysis.get("Data Type", "No data type provided") + self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided") + self.data_description = response_json_analysis.get("Data Description", "No data description provided") + self.target_description = response_json_analysis.get("Target Description", "No target description provided") self.submission_specifications = response_json_analysis.get( "Submission Specifications", "No submission requirements provided" ) self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) def get_competition_full_desc(self) -> str: - return f"""Competition Task Type: {self.competition_task_type} - Competition Data Type: {self.competition_data_type} - Competition Brief Description: {self.competition_brief_description} - Competition Target Description: {self.competition_target_description} + return f"""Task Type: {self.task_type} + Data Type: {self.data_type} + Brief Description: {self.brief_description} + Data Description: {self.data_description} + Target Description: {self.target_description} Submission Specifications: {self.submission_specifications} Model Output Channel: {self.model_output_channel} """ @@ -63,12 +65,11 @@ def get_competition_full_desc(self) -> str: def background(self) -> str: background_template = T(".prompts:competition_background") background_prompt = background_template.r( - competition_task_type=self.competition_task_type, - competition_data_type=self.competition_data_type, - competition_brief_description=self.competition_brief_description, - 
target_description=self.competition_target_description, - submission_specifications=self.submission_specifications, - evaluate_bool=self.competition_metric_direction, + task_type=self.task_type, + data_type=self.data_type, + brief_description=self.brief_description, + data_description=self.data_description, + target_description=self.target_description, ) return background_prompt @@ -100,5 +101,13 @@ def rich_style_description(self) -> str: To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development. """ + @property + def to_dict(self): + return { + "background": self.background, + "submission_specifications": self.submission_specifications, + "metric_direction": self.metric_direction, + } + def get_scenario_all_desc(self) -> str: - return T(".prompts:scen_desc").r(scen=self) + return T(".prompts:scenario_description").r(scen=self.to_dict) From e1abb6f224cd80951a85005619ffb2690330a27b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 17 Dec 2024 02:11:03 +0000 Subject: [PATCH 068/304] exp_gen change --- .../data_science/proposal/exp_gen.py | 203 +++++------------- 1 file changed, 58 insertions(+), 145 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index aced71998..fd5f8a899 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -53,6 +53,32 @@ def __str__(self) -> str: class DSExpGen(ExpGen): """Data Science Task Generator.""" + def llm_task_gen(self, + targets: str, + scenario_desc: str, + task_output_format: str, + hypothesis: Hypothesis | None = None, + hypothesis_and_feedback: str | None = None + ) -> dict: + system_prompt = T(".prompts:task_gen.system").r( + targets=targets, + scenario=scenario_desc, + hypothesis=hypothesis, + task_output_format=task_output_format, + ) + user_prompt = T(".prompts:task_gen.user").r( + targets=targets, + hypothesis=hypothesis, + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + + return resp_dict def gen(self, trace: Trace) -> Experiment: successful_components = set() @@ -70,7 +96,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: return exp return None - scenario = trace.scen.get_scenario_all_desc() + scenario_desc = trace.scen.get_scenario_all_desc() if is_complete(): # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) @@ -80,7 +106,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: sota_solution = "" system_prompt = T(".prompts:hypothesis_gen.system").r( targets="data science project", - scenario=scenario, + scenario=scenario_desc, hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), ) @@ -104,24 +130,14 @@ def last_successful_component(com: COMPONENT) -> Experiment: # 2. 
gen experiment if hypothesis.component == "DataLoadSpec": - data_loader_task_output_format = T(".prompts:output_format.data_loader").r() - system_prompt = T(".prompts:task_gen.system").r( - targets="Data loader and specification generation", - scenario=scenario, - hypothesis=hypothesis, - task_output_format=data_loader_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( + resp_dict = self.llm_task_gen( targets="Data loader and specification generation", + scenario_desc=scenario_desc, hypothesis=hypothesis, + task_output_format=T(".prompts:output_format.data_loader").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - ) dt = DataLoaderTask( name="Data loader and specification generation", description=resp_dict.get( @@ -132,24 +148,14 @@ def last_successful_component(com: COMPONENT) -> Experiment: return DataLoaderExperiment(sub_tasks=[dt], hypothesis=hypothesis) elif hypothesis.component == "FeatureEng": # TODO: RAG - feature_task_output_format = T(".prompts:output_format.feature").r() - system_prompt = T(".prompts:task_gen.system").r( - targets="Feature Engineering", - scenario=scenario, - hypothesis=hypothesis, - task_output_format=feature_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( + resp_dict = self.llm_task_gen( targets="Feature Engineering", + scenario_desc=scenario_desc, hypothesis=hypothesis, + task_output_format=T(".prompts:output_format.feature").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - ) dependency_exp = last_successful_component("DataLoadSpec") spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] tasks = [] @@ -166,25 +172,14 @@ def last_successful_component(com: COMPONENT) -> Experiment: exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Model": - model_task_output_format = T(".prompts:output_format.model").r() - - system_prompt = T(".prompts:task_gen.system").r( - targets="Models", - scenario=scenario, - hypothesis=hypothesis, - task_output_format=model_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( + resp_dict = self.llm_task_gen( targets="Models", + scenario_desc=scenario_desc, hypothesis=hypothesis, + task_output_format=T(".prompts:output_format.model").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - ) dependency_exp = last_successful_component("FeatureEng") spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] mt = ModelTask( @@ -200,25 +195,14 @@ def last_successful_component(com: COMPONENT) -> Experiment: exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Ensemble": - ensemble_task_output_format = T(".prompts:output_format.ensemble").r() - - system_prompt = T(".prompts:task_gen.system").r( - targets="Ensemble", - scenario=scenario, - hypothesis=hypothesis, - task_output_format=ensemble_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( + resp_dict = self.llm_task_gen( 
targets="Ensemble", + scenario_desc=scenario_desc, hypothesis=hypothesis, + task_output_format=T(".prompts:output_format.ensemble").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - ) dependency_exp = last_successful_component("Model") spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] et = EnsembleTask( @@ -231,25 +215,14 @@ def last_successful_component(com: COMPONENT) -> Experiment: exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Workflow": - workflow_task_output_format = T(".prompts:output_format.workflow").r() - - system_prompt = T(".prompts:task_gen.system").r( - targets="Workflow", - scenario=scenario, - hypothesis=hypothesis, - task_output_format=workflow_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( + resp_dict = self.llm_task_gen( targets="Workflow", + scenario_desc=scenario_desc, hypothesis=hypothesis, + task_output_format=T(".prompts:output_format.workflow").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - ) dependency_exp = last_successful_component("Ensemble") spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] wt = WorkflowTask( @@ -267,22 +240,10 @@ def last_successful_component(com: COMPONENT) -> Experiment: # we already have the component, then skip continue elif o == "DataLoadSpec": - data_loader_task_output_format = T(".prompts:output_format.data_loader").r() - system_prompt = T(".prompts:task_gen.system").r( + resp_dict = self.llm_task_gen( targets="Data loader and specification generation", - scenario=scenario, - hypothesis=None, - task_output_format=data_loader_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( - targets="Data loader and specification generation", - hypothesis=None, - ) - - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.data_loader").r(), ) dt = DataLoaderTask( name="Data loader and specification generation", @@ -294,22 +255,10 @@ def last_successful_component(com: COMPONENT) -> Experiment: exp = DataLoaderExperiment(sub_tasks=[dt]) return exp elif o == "FeatureEng": - feature_task_output_format = T(".prompts:output_format.feature").r() - system_prompt = T(".prompts:task_gen.system").r( - targets="Feature Engineering", - scenario=scenario, - hypothesis=None, - task_output_format=feature_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( + resp_dict = self.llm_task_gen( targets="Feature Engineering", - hypothesis=None, - ) - - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.feature").r(), ) dependency_exp = last_successful_component("DataLoadSpec") spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] @@ -327,22 +276,10 @@ def last_successful_component(com: COMPONENT) -> Experiment: 
exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Model": - model_task_output_format = T(".prompts:output_format.model").r() - system_prompt = T(".prompts:task_gen.system").r( + resp_dict = self.llm_task_gen( targets="Models", - scenario=scenario, - hypothesis=None, - task_output_format=model_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( - targets="Models", - hypothesis=None, - ) - - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.model").r(), ) dependency_exp = last_successful_component("FeatureEng") spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] @@ -363,22 +300,10 @@ def last_successful_component(com: COMPONENT) -> Experiment: exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Ensemble": - ensemble_task_output_format = T(".prompts:output_format.ensemble").r() - system_prompt = T(".prompts:task_gen.system").r( + resp_dict = self.llm_task_gen( targets="Ensemble", - scenario=scenario, - hypothesis=None, - task_output_format=ensemble_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( - targets="Ensemble", - hypothesis=None, - ) - - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.ensemble").r(), ) dependency_exp = last_successful_component("Model") spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] @@ -391,22 +316,10 @@ def last_successful_component(com: COMPONENT) -> Experiment: exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Workflow": - workflow_task_output_format = T(".prompts:output_format.workflow").r() - system_prompt = T(".prompts:task_gen.system").r( - targets="Workflow", - scenario=scenario, - hypothesis=None, - task_output_format=workflow_task_output_format, - ) - user_prompt = T(".prompts:task_gen.user").r( + resp_dict = self.llm_task_gen( targets="Workflow", - hypothesis=None, - ) - - resp_dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.workflow").r(), ) dependency_exp = last_successful_component("Ensemble") spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] From 137965494ce002c5a15e8d4b29aaf11d1562d8f1 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 17 Dec 2024 02:58:48 +0000 Subject: [PATCH 069/304] exp_gen change --- .../data_science/proposal/exp_gen.py | 49 +++++++++++-------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index fd5f8a899..ea964f913 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -1,5 +1,4 @@ import json -from argparse import ONE_OR_MORE from typing import Literal from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask @@ -8,8 +7,8 @@ from 
rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import Experiment -from rdagent.core.proposal import ExpGen, Hypothesis, Trace -from rdagent.core.scenario import Scenario +from rdagent.core.proposal import ExpGen, Hypothesis, Trace, HypothesisFeedback +from rdagent.core.knowledge_base import KnowledgeBase from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.data_science.experiment.experiment import ( DataLoaderExperiment, @@ -18,12 +17,12 @@ ModelExperiment, WorkflowExperiment, ) +from rdagent.scenarios.data_science.scen import DataScienceScen from rdagent.utils.agent.tpl import T COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] ORDER = COMPONENT.__args__ - class DSHypothesis(Hypothesis): def __init__( self, @@ -51,6 +50,22 @@ def __str__(self) -> str: """ +class DSTrace(Trace[DataScienceScen, KnowledgeBase]): + def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = None) -> None: + self.scen: DataScienceScen = scen + self.hist: list[tuple[DSHypothesis, Experiment, HypothesisFeedback]] = [] + self.knowledge_base = knowledge_base + + def get_sota_hypothesis_and_experiment(self, component: COMPONENT | None = None) -> tuple[DSHypothesis | None, Experiment | None]: + """Access the last experiment result, sub-task, and the corresponding hypothesis.""" + for h, exp, hf in self.hist[::-1]: + if hf.decision: + if component and h.component != component: + continue + return h, exp + return None, None + + class DSExpGen(ExpGen): """Data Science Task Generator.""" def llm_task_gen(self, @@ -80,7 +95,7 @@ def llm_task_gen(self, return resp_dict - def gen(self, trace: Trace) -> Experiment: + def gen(self, trace: DSTrace) -> Experiment: successful_components = set() for h, _, hf in trace.hist: if hf.decision: @@ -90,12 +105,6 @@ def is_complete(): """is all components complete""" return set(ORDER) == successful_components - def last_successful_component(com: COMPONENT) -> Experiment: - for h, exp, hf in reversed(trace.hist): - if hf.decision and h.component == com: - return exp - return None - scenario_desc = trace.scen.get_scenario_all_desc() if is_complete(): # base info @@ -156,7 +165,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = last_successful_component("DataLoadSpec") + dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] tasks = [] for fn in resp_dict: @@ -180,7 +189,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = last_successful_component("FeatureEng") + dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng") spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), @@ -203,7 +212,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = last_successful_component("Model") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Model") spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] et = EnsembleTask( name="Ensemble", @@ -223,7 +232,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: 
hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = last_successful_component("Ensemble") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble") spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] wt = WorkflowTask( name="Workflow", @@ -260,7 +269,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.feature").r(), ) - dependency_exp = last_successful_component("DataLoadSpec") + dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] tasks = [] for fn in resp_dict: @@ -281,9 +290,9 @@ def last_successful_component(com: COMPONENT) -> Experiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.model").r(), ) - dependency_exp = last_successful_component("FeatureEng") + dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng") spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] - if last_model_exp:=last_successful_component("Model"): + if last_model_exp:=trace.get_sota_hypothesis_and_experiment("Model"): # TODO: model only have one (named "model.py")? base_code = last_model_exp.experiment_workspace.code_dict["model.py"] else: @@ -305,7 +314,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.ensemble").r(), ) - dependency_exp = last_successful_component("Model") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Model") spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] et = EnsembleTask( name="Ensemble", @@ -321,7 +330,7 @@ def last_successful_component(com: COMPONENT) -> Experiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.workflow").r(), ) - dependency_exp = last_successful_component("Ensemble") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble") spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] wt = WorkflowTask( name="Workflow", From 8d1eca9b0e23f1ec2c4000ac01ad69a6620c05cb Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 17 Dec 2024 06:58:16 +0000 Subject: [PATCH 070/304] spec & workspace changes --- rdagent/components/coder/CoSTEER/evaluators.py | 1 - .../components/coder/CoSTEER/evolving_strategy.py | 6 ++++-- .../components/coder/data_science/ensemble/exp.py | 1 - .../coder/data_science/feature/__init__.py | 6 ++++-- .../components/coder/data_science/feature/exp.py | 2 -- rdagent/components/coder/data_science/model/es.py | 4 +++- rdagent/components/coder/data_science/model/exp.py | 2 -- .../coder/data_science/raw_data_loader/__init__.py | 2 ++ .../components/coder/data_science/workflow/exp.py | 1 - .../coder/factor_coder/evolving_strategy.py | 2 ++ .../coder/model_coder/evolving_strategy.py | 2 ++ rdagent/scenarios/data_science/scen/prompts.yaml | 6 +++--- rdagent/scenarios/data_science/scen/scen.py | 14 +++++--------- 13 files changed, 25 insertions(+), 24 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index bf6b82197..4729a0f24 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -40,7 +40,6 @@ class CoSTEERSingleFeedback(Feedback): # value_feedback, shape_feedback, value_generated_flag code: str final_decision: bool - final_decision_based_on_gt: bool | None = None 
def __str__(self) -> str: return f"""------------------Execution------------------ diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py index ba17851c4..e22575363 100644 --- a/rdagent/components/coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/CoSTEER/evolving_strategy.py @@ -12,7 +12,8 @@ from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.evaluation import Scenario from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge -from rdagent.core.experiment import Workspace +from rdagent.core.experiment import FBWorkspace + from rdagent.core.prompts import Prompts from rdagent.core.scenario import Task from rdagent.core.utils import multiprocessing_wrapper @@ -30,6 +31,7 @@ def implement_one_task( self, target_task: Task, queried_knowledge: QueriedKnowledge = None, + workspace: FBWorkspace | None = None, ) -> dict[str, str]: # FIXME: fix interface of previous implement """ This method will input the task & current workspace, @@ -95,7 +97,7 @@ def evolve( result = multiprocessing_wrapper( [ - (self.implement_one_task, (evo.sub_tasks[target_index], queried_knowledge)) + (self.implement_one_task, (evo.sub_tasks[target_index], queried_knowledge, evo.sub_workspace_list[target_index])) for target_index in to_be_finished_task_index ], n=RD_AGENT_SETTINGS.multi_proc_n, diff --git a/rdagent/components/coder/data_science/ensemble/exp.py b/rdagent/components/coder/data_science/ensemble/exp.py index 720e85aa3..75f8d2055 100644 --- a/rdagent/components/coder/data_science/ensemble/exp.py +++ b/rdagent/components/coder/data_science/ensemble/exp.py @@ -13,7 +13,6 @@ def __init__( self, name: str, description: str, - spec: str, **kwargs, ) -> None: pass diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index ee5644fe3..770f872f4 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -15,20 +15,22 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T +from rdagent.core.experiment import FBWorkspace class FeatureMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( self, target_task: FeatureTask, queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, ) -> dict[str, str]: # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. - competition_info = self.scen.competition_descriptions + competition_info = self.scen.get_scenario_all_desc() # 2. 
code system_prompt = T(".prompts:feature.system").r() - user_prompt = T(".prompts:feature.user").r(competition_info=competition_info, feature_spec=target_task.spec) + user_prompt = T(".prompts:feature.user").r(competition_info=competition_info, feature_spec=workspace.code_dict["spec/feature.md"]) feature_code = json.loads( APIBackend().build_messages_and_create_chat_completion( diff --git a/rdagent/components/coder/data_science/feature/exp.py b/rdagent/components/coder/data_science/feature/exp.py index b092b8585..0e3d6e80e 100644 --- a/rdagent/components/coder/data_science/feature/exp.py +++ b/rdagent/components/coder/data_science/feature/exp.py @@ -13,13 +13,11 @@ def __init__( self, name: str, description: str, - spec: str, variables: dict = {}, implementation: bool = False, **kwargs, ) -> None: self.variables: dict = variables - self.spec: str = spec self.implementation: bool = implementation super().__init__(name=name, description=description, **kwargs) diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py index 5240bfc86..a19d949b2 100644 --- a/rdagent/components/coder/data_science/model/es.py +++ b/rdagent/components/coder/data_science/model/es.py @@ -14,6 +14,7 @@ from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend +from rdagent.core.experiment import FBWorkspace coder_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") @@ -23,6 +24,7 @@ def implement_one_task( self, target_task: ModelTask, queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, ) -> dict[str, str]: model_information_str = target_task.get_task_information() @@ -52,7 +54,7 @@ def implement_one_task( # scenario=self.scen.get_scenario_all_desc(filtered_tag=target_task.model_type), # TODO: fit new scenario information scenario=("No scenario description."), - spec=target_task.spec, + spec=workspace.code_dict["spec/model.md"], queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, current_code=target_task.base_code, ) diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index 8e88dd30d..6d246cc8b 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -22,7 +22,6 @@ def __init__( formulation: str = None, variables: Dict[str, str] = None, model_type: Optional[str] = None, - spec: str, **kwargs, ) -> None: self.formulation: str = formulation @@ -33,7 +32,6 @@ def __init__( model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model # TODO: More Models Supported ) - self.spec: str = spec super().__init__(name=name, description=description, *args, **kwargs) def get_task_information(self): diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 5964e6b23..88f4eabc2 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -40,6 +40,7 @@ from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T +from rdagent.core.experiment import FBWorkspace class DataLoaderMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): @@ -47,6 +48,7 @@ def implement_one_task( self, 
target_task: DataLoaderTask, queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, ) -> dict[str, str]: # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py index 23db871b2..794282cfe 100644 --- a/rdagent/components/coder/data_science/workflow/exp.py +++ b/rdagent/components/coder/data_science/workflow/exp.py @@ -13,7 +13,6 @@ def __init__( self, name: str, description: str, - spec: str, **kwargs, ) -> None: pass diff --git a/rdagent/components/coder/factor_coder/evolving_strategy.py b/rdagent/components/coder/factor_coder/evolving_strategy.py index 5fbc81450..16b34ef21 100644 --- a/rdagent/components/coder/factor_coder/evolving_strategy.py +++ b/rdagent/components/coder/factor_coder/evolving_strategy.py @@ -17,6 +17,7 @@ from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend +from rdagent.core.experiment import FBWorkspace implement_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") @@ -72,6 +73,7 @@ def implement_one_task( self, target_task: FactorTask, queried_knowledge: CoSTEERQueriedKnowledge, + workspace: FBWorkspace | None = None, ) -> str: target_factor_task_information = target_task.get_task_information() diff --git a/rdagent/components/coder/model_coder/evolving_strategy.py b/rdagent/components/coder/model_coder/evolving_strategy.py index b980508f0..d9b7f56af 100644 --- a/rdagent/components/coder/model_coder/evolving_strategy.py +++ b/rdagent/components/coder/model_coder/evolving_strategy.py @@ -19,6 +19,7 @@ from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend +from rdagent.core.experiment import FBWorkspace coder_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") @@ -28,6 +29,7 @@ def implement_one_task( self, target_task: ModelTask, queried_knowledge: CoSTEERQueriedKnowledge = None, + workspace: FBWorkspace | None = None, ) -> str: model_information_str = target_task.get_task_information() diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index b6bde18ce..432f5c4f4 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -1,10 +1,10 @@ scenario_description: |- ------Background of the scenario------ - {{scen.background}} + {{background}} ------The expected output & submission format specifications------ - {{scen.submission_specifications}} - The evaluation metric used is directed as: {{scen.metric_direction}}. + {{submission_specifications}} + The evaluation metric used is directed as: {{metric_direction}}. competition_description_template: system: |- diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 930bb56be..3fbdc8fda 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -101,13 +101,9 @@ def rich_style_description(self) -> str: To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development. 
""" - @property - def to_dict(self): - return { - "background": self.background, - "submission_specifications": self.submission_specifications, - "metric_direction": self.metric_direction, - } - def get_scenario_all_desc(self) -> str: - return T(".prompts:scenario_description").r(scen=self.to_dict) + return T(".prompts:scenario_description").r( + background=self.background, + submission_specifications=self.submission_specifications, + metric_direction=self.metric_direction, + ) From 8ccd32f5cd1f5bdf7e932df425e874bcdf00f5d4 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Tue, 17 Dec 2024 07:00:25 +0000 Subject: [PATCH 071/304] init for workflow --- .../model/eval_tests/model_execute.py | 2 +- .../coder/data_science/model/test.py | 2 +- .../coder/data_science/workflow/__init__.py | 39 ++++++++------ .../coder/data_science/workflow/es.py | 26 +++++++++ .../coder/data_science/workflow/eval.py | 27 ++++++++++ .../coder/data_science/workflow/prompts.yaml | 34 ++++++++++++ .../coder/data_science/workflow/test.py | 54 +++++++++++++++++++ 7 files changed, 165 insertions(+), 19 deletions(-) create mode 100644 rdagent/components/coder/data_science/workflow/es.py create mode 100644 rdagent/components/coder/data_science/workflow/eval.py create mode 100644 rdagent/components/coder/data_science/workflow/prompts.yaml create mode 100644 rdagent/components/coder/data_science/workflow/test.py diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index 57f8221d0..f3009e27c 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -25,7 +25,7 @@ # Call model_workflow val_pred, test_pred, hypers = model_workflow(X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=test_X, hyper_params={}) # val_pred = np.random.rand(8, 1) -test_pred = np.random.rand(8, 1) +# test_pred = np.random.rand(8, 1) execution_feedback_str = "Execution successful.\n" if val_pred is not None: diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 186215564..4d0505c81 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -31,7 +31,7 @@ def develop_one_competition(competition: str): variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", hyperparameters="...", base_code="", - spec="", + spec="", ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 90961567e..996985b1a 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -1,19 +1,24 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.core.scenario import Scenario +from 
rdagent.components.coder.data_science.workflow.es import ( + WorkflowMultiProcessEvolvingStrategy, +) +from rdagent.components.coder.data_science.workflow.eval import ( + WorkflowGeneralCaseSpecEvaluator, +) -# class WorkflowCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# WorkflowCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) +class WorkflowCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + WorkflowGeneralCaseSpecEvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/workflow/es.py b/rdagent/components/coder/data_science/workflow/es.py new file mode 100644 index 000000000..8c454b625 --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/es.py @@ -0,0 +1,26 @@ +import json +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask +from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledge +from rdagent.oai.llm_utils import APIBackend + +class WorkflowMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: WorkflowTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + ) -> dict[str, str]: + competition_info = self.scen.competition_descriptions + + system_prompt = T(".prompts:workflow_coder.system").r() + user_prompt = T(".prompts:workflow_coder.user").r( + competition_info=competition_info, + ) + + data_loader_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + + return + diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py new file mode 100644 index 000000000..9d4c16b4b --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -0,0 +1,27 @@ +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERMultiFeedback, + CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, +) + +class WorkflowGeneralCaseSpecEvaluator(CoSTEEREvaluator): + """ + Motivation case: + - Simplest case, we already split the data into train_data, valid_data, and test_data. We require the model to learn (optionally validate on valid data), and infer on test data. + + Test workflow: + - Build train, valid, and test data to run it, and test the output (e.g., shape, etc.) 
+ """ + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> CoSTEERSingleFeedbackDeprecated: + + return \ No newline at end of file diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml new file mode 100644 index 000000000..522a45252 --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -0,0 +1,34 @@ +workflow_coder: + system: |- + You are a Python data scientist working on a new kaggle competition project. + + The user has write different Python function that can load and preprocess data, execute feature engineering, train models and ensemble them. + These Python codes with different functionalities are written separately in different Python files. + Your task is to Integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow. + This workflow code is also a Python file, and it functions similarly to a main process that calls the sub-files for each step and ultimately outputs a prediction file. + + The user will also provide some specifications about how to organize the whole code and give instructions. The code you implement should align the framework given in specifications. + + Please response the code in the following json format. Here is an example structure for the JSON output: + { + "code": "The Python code as a string." + } + + user: |- + ---------Competition Information--------- + {{ competition_info }} + + ---------Workflow Specification--------- + {{ workflow_spec }} + + ---------load data code--------- + {{ load_data_code }} + + ---------feature engineering code--------- + {{ feature_code }} + + ---------model training code--------- + {{ model_code }} + + ---------ensemble code--------- + {{ ensembloe_code }} diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py new file mode 100644 index 000000000..d5d08388f --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -0,0 +1,54 @@ +""" +Generate dataset to test the workflow output +""" + +from pathlib import Path + +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER +from rdagent.components.coder.data_science.workflow.es import ( + WorkflowMultiProcessEvolvingStrategy, +) +from rdagent.components.coder.data_science.workflow.eval import ( + WorkflowGeneralCaseSpecEvaluator, +) +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask +from rdagent.core.experiment import FBWorkspace +from rdagent.scenarios.data_science.experiment.experiment import WorkflowExperiment +from rdagent.scenarios.data_science.scen import DataScienceScen + +def develop_one_competition(competition: str): + scen = DataScienceScen(competition=competition) + workflow_coder = WorkflowCoSTEER(scen) + + wt = WorkflowTask( + name="WorkflowTask", + description="Integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow.", + spec="", + base_code={ + + } + ) + + tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition + injected_file_names = ["spec/workflow.md", "load_data.py", "feat01.py", "model01.py", "ens.py", "main.py"] + + workflowexp = FBWorkspace() + for file_name in injected_file_names: + file_path = 
tpl_ex_path / file_name + workflowexp.inject_code(**{file_name: file_path.read_text()}) + + wt.spec += workflowexp.code_dict["spec/model.md"] + wt.base_code += workflowexp.code_dict["model01.py"] + exp = WorkflowExperiment( + sub_tasks=[wt], + ) + + es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + new_code = es.implement_one_task(target_task=wt, queried_knowledge=None) + print(new_code) + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") + # dotenv run -- python rdagent/components/coder/data_science/workflow/test.py From 2e2d1531a3dbf3fb8c298b0519984fb9987bd492 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Tue, 17 Dec 2024 07:02:03 +0000 Subject: [PATCH 072/304] fix --- rdagent/components/coder/data_science/workflow/es.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rdagent/components/coder/data_science/workflow/es.py b/rdagent/components/coder/data_science/workflow/es.py index 8c454b625..58ce94ece 100644 --- a/rdagent/components/coder/data_science/workflow/es.py +++ b/rdagent/components/coder/data_science/workflow/es.py @@ -2,6 +2,9 @@ from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledge from rdagent.oai.llm_utils import APIBackend +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) class WorkflowMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( From 6926e18379024480a3c1c6df4ff805eb155c3958 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Tue, 17 Dec 2024 07:41:14 +0000 Subject: [PATCH 073/304] ds model fit for spec & workspace change --- rdagent/components/coder/CoSTEER/evolving_strategy.py | 2 +- rdagent/components/coder/data_science/model/eval.py | 2 +- rdagent/components/coder/data_science/model/test.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py index e22575363..8ac1c4085 100644 --- a/rdagent/components/coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/CoSTEER/evolving_strategy.py @@ -97,7 +97,7 @@ def evolve( result = multiprocessing_wrapper( [ - (self.implement_one_task, (evo.sub_tasks[target_index], queried_knowledge, evo.sub_workspace_list[target_index])) + (self.implement_one_task, (evo.sub_tasks[target_index], queried_knowledge, evo.experiment_workspace)) for target_index in to_be_finished_task_index ], n=RD_AGENT_SETTINGS.multi_proc_n, diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 738cc08d8..68304cec2 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -78,7 +78,7 @@ def evaluate( system_prompt = T(".prompts:model_eval.system").r( test_code=test_code, scenario="No scenario information yet.", - spec=target_task.spec, + spec=implementation.code_dict["spec/model.md"] ) user_prompt = T(".prompts:model_eval.user").r( stdout=stdout, diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 4d0505c81..2cdbace0f 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -31,7 +31,6 @@ def develop_one_competition(competition: str): variables="variables: {'\\hat{y}_u': 'The predicted output 
for node u', 'X_u': 'The input features for node u'}", hyperparameters="...", base_code="", - spec="", ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition @@ -42,7 +41,6 @@ def develop_one_competition(competition: str): file_path = tpl_ex_path / file_name modelexp.inject_code(**{file_name: file_path.read_text()}) - mt.spec += modelexp.code_dict["spec/model.md"] mt.base_code += modelexp.code_dict["model01.py"] exp = ModelExperiment( sub_tasks=[mt], @@ -55,7 +53,7 @@ def develop_one_competition(competition: str): # Test the evolving strategy: """es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - new_code = es.implement_one_task(target_task=mt, queried_knowledge=None) + new_code = es.implement_one_task(target_task=mt, queried_knowledge=None, workspace=modelexp) print(new_code)""" # Run the experiment From 3118c6701752c6fffbb6caf31805e7dd3f6a9037 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Tue, 17 Dec 2024 09:34:29 +0000 Subject: [PATCH 074/304] improve data_loader_spec --- .../data_science/raw_data_loader/prompts.yaml | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 7b224a539..04cd55366 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -1,26 +1,50 @@ spec: - system: |- - You are a Python data scientist working on a new kaggle competition project. This project will be used to analyze data and build models to predict future outcomes, and this project codes will be written by GPT. - Your task is to write five specification texts (markdown format) for follow-up tasks. The five tasks are: data loader(and preprocess), feature engineering, model building, ensemble, and workflow. - The competition information is provided as a html format string. + system: + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + + Currently, you are working on a Kaggle competition project. + This project involves analyzing data and building models to beat other competitors, with the code being generated by large language models. + Your task is to write five specification texts (in markdown format) for the following tasks + - Data loading (and preprocessing) + - Feature Engineering + - Model Building + - Ensemble + - The overall workflow + -----------Competition Information----------- {{ competition_info }} user: data_loader: |- - Data loader specification text should include two parts: - 1. function interface: - - all raw data files are in /kaggle/input/ directory, so the function should take no input. - - function name must be "load_data". - - have annotations for the output. - - have a docstring that describes the function. - 2. Precautions: - some precautions for data loading and preprocessing. + Data loader specification text should follow these detailed requirements: + 1. Function Interface: + - The function must be named `load_data` + - All raw data files are located in the /kaggle/input/ directory; therefore, the function should not take any input arguments. 
+ - The function must include proper and specific annotations for the output, specifying the expected data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). + - A clear docstring should be provided that: + - Describes the purpose of the function. + - Mentions the source of the data (e.g., data location or structure). + - Explains the expected output format. + 2. Precautions for Data Loading and Preprocessing: + - Handle potential issues such as: + - File encoding (e.g., UTF-8) and data delimiters (e.g., CSV comma-separated). + - Missing values in datasets: describe how they should be handled (e.g., fill with a specific value, drop rows, etc.). + - Data types: ensure proper type conversion (e.g., numeric columns, date parsing). + - Memory efficiency for large datasets: consider techniques such as downcasting types or reading data in chunks. + - Multiple files: if the dataset includes multiple files, specify how they should be combined or processed. + - Add any domain-specific handling (e.g., date formatting, specific transformations) relevant to the competition dataset. + 3. Output: + - The function should return four objects: `X`, `y`, `X_test`, and `test_ids`. + - `X`: The feature matrix for the training data. + - `y`: The target vector for the training data. + - `X_test`: The feature matrix for the test data. + - `test_ids`: The identifiers for the test data. - Please response the specification in the following json format. Here is an example structure for the JSON output: + Please respond with a JSON structure as follows: { - "spec": "The specification as a string." + "spec": "The detailed and corresponding specification string as described above." } feature: |- From b630f7995253d16cf680d223dd8776c56e5ab8a1 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Tue, 17 Dec 2024 09:44:43 +0000 Subject: [PATCH 075/304] ds model eval for more cases --- .../model/eval_tests/model_execute.py | 28 ++++++++++++++++--- .../coder/data_science/model/prompts.yaml | 9 ++++-- .../spec/model.md | 2 ++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index f3009e27c..4ebb5806a 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -21,13 +21,13 @@ val_y = np.random.rand(8, 1) test_X = np.random.rand(8, 64, 64, 3)""" - +print("The first execution begins.\n") # Call model_workflow -val_pred, test_pred, hypers = model_workflow(X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=test_X, hyper_params={}) +val_pred, test_pred, hypers = model_workflow(X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=None, hyper_params=None) # val_pred = np.random.rand(8, 1) # test_pred = np.random.rand(8, 1) -execution_feedback_str = "Execution successful.\n" +execution_feedback_str = "The first Execution successful.\n" if val_pred is not None: execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" else: @@ -35,6 +35,26 @@ if test_pred is not None: execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" else: - execution_feedback_str += "Test predictions are None.\n" "" + execution_feedback_str += "Test predictions are None.\n" +if hypers is not None: + execution_feedback_str += f"Hyperparameters:{hypers}\n" +else: + execution_feedback_str += "Hyperparameters are None.\n" +print(execution_feedback_str) +print("The 
second execution begins.\n") +val_pred, test_pred, finalhypers = model_workflow(X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, hyper_params=hypers) +execution_feedback_str = "The second Execution successful.\n" +if val_pred is not None: + execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" +else: + execution_feedback_str += "Validation predictions are None.\n" +if test_pred is not None: + execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" +else: + execution_feedback_str += "Test predictions are None.\n" +if hypers is not None: + execution_feedback_str += f"Hyperparameters:{finalhypers}\n" +else: + execution_feedback_str += "Hyperparameters are None.\n" print(execution_feedback_str) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 7df80fc9d..02c4dc413 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -73,15 +73,20 @@ model_eval: ```python {{test_code}} ``` + The first time you execute it, you will not provide test inputs, only train, valid inputs, and empty hyperparameters. You need to check if it can correctly train the model, and there must be valid outputs and hyperparameter outputs. + The hyperparameters returned must not be none. It should has parameters that will be useful for retrain later, for example, the number of iteration when the first round training early stopped. + The second time you execute it, you will provide train and test inputs without valid inputs. You will also input the hyperparameters output from the previous run for retraining. You need to check if these hyperparameters are used in the model code below and if it can correctly output the test predictions. + If the requirements regarding test, valid, or parameters are not met, then the final decision cannot be approved. + You should evaluate the code given by user. You should concern about whether the user implement it correctly, including whether the shape of model's output is aligned with request, the equality of code, and any other thing you think necessary. You will be given the code generated by user and the stdout of the testing process. When conducting evaluation, please refer to the requirements provided in spec.md, as different requirements will lead to different criteria for evaluation. - For example, in some cases, the model's output may be required to have predictions for both the valid and test sets, while in other cases, only one of them may be required. Some cases may also require the model's hyperparameters to be preserved and outputted. + Please respond with your feedback in the following JSON format and order ```json { "execution": "Describe whether the model execute successfully, including any errors or issues encountered.", - "return_checking": "Checks about the generated value, including whether the value generated. Especially compare the shape of model output and the requirement in spec.md.", + "return_checking": "Checks about the generated value, including whether the value generated. Especially compare the shape of model output and the requirement in spec.md.", you also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model. 
"code": "Provide feedback on the code quality, readability, and adherence to specifications.", "final_decision": } diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md index c333497ab..3cff0cf6e 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md @@ -11,8 +11,10 @@ def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y - If test/valid exist, output inference on them - Follow the hyperparameter if exists. - the returned hyperparameter should align with the input(except the newly generated early stop) + - Return hyperparameters for retrain if not exists. - If valid exist, add to update the hyperparameter + Parameters ---------- X : np.ndarray From 84cf4d444d67b81cc035b8284353cfc13662f3c4 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Tue, 17 Dec 2024 10:05:01 +0000 Subject: [PATCH 076/304] refine prompts --- .../data_science/raw_data_loader/prompts.yaml | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 04cd55366..1cad289cc 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -6,7 +6,7 @@ spec: Currently, you are working on a Kaggle competition project. This project involves analyzing data and building models to beat other competitors, with the code being generated by large language models. - Your task is to write five specification texts (in markdown format) for the following tasks + Your task is to write five specification texts (in markdown format) for the following tasks, based on the competition information provided - Data loading (and preprocessing) - Feature Engineering - Model Building @@ -20,7 +20,7 @@ spec: data_loader: |- Data loader specification text should follow these detailed requirements: 1. Function Interface: - - The function must be named `load_data` + - The function must be named `load_data`. - All raw data files are located in the /kaggle/input/ directory; therefore, the function should not take any input arguments. - The function must include proper and specific annotations for the output, specifying the expected data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). - A clear docstring should be provided that: @@ -44,21 +44,33 @@ spec: Please respond with a JSON structure as follows: { - "spec": "The detailed and corresponding specification string as described above." + "spec": "The corresponding specification string as described above. You should create the rules based on the competition information instead of copying the requirements." } feature: |- - Feature engineering specification text should include two parts: - 1. function interface: - - function name must be "feat_eng". - - have annotations for the input and output. - - have a docstring that describes the function. - 2. Precautions: - some precautions for feature engineering. + Feature engineering specification text should adhere to the following requirements: + 1. Function Interface: + - The function must be named `feat_eng`. 
+ - Must include proper and specific annotations for both input and output based on the Competition Information: + - Input: Specify the expected input data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). + - Output: Specify the transformed output data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). + - A comprehensive docstring must be provided that: + - Describes the purpose of the function. + - Clarifies the input parameters and their types. + - Defines the structure and format of the output. + 2. Precautions for Feature Engineering: + - If feature engineering is strictly part of the model pipeline and should not be done here, explicitly state that feature engineering will be handled at the model stage. + - If the competition requirements or modeling strategy dictate that feature engineering must be integrated into the model pipeline, this function will remain as a placeholder and return the input data unchanged. + - When feature engineering is applied, consider the following precautions: + - Ensure scalability for large datasets. + - Handle missing values and outliers appropriately during feature transformation. + - Feature types: Ensure consistency between feature data types and transformations. + - Custom features: Provide logic for domain-specific features, if applicable. + - Avoid data leakage: Only use features derived from training data, excluding information from test or validation sets. - Please response the specification in the following json format. Here is an example structure for the JSON output: + Please respond with a JSON structure as follows: { - "spec": "The specification as a string." + "spec": "The corresponding specification string as described above. You should create the rules based on the competition information instead of copying the requirements." 
} model: |- From 81a427aaa29a4fa8c63c6c2f08b2c1a805f9713a Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 17 Dec 2024 13:11:39 +0000 Subject: [PATCH 077/304] spec change --- .../coder/CoSTEER/evolving_strategy.py | 6 ++- .../coder/data_science/feature/__init__.py | 6 ++- .../coder/data_science/feature/test.py | 8 +--- .../components/coder/data_science/model/es.py | 8 ++-- .../model/eval_tests/model_execute.py | 4 +- .../data_science/raw_data_loader/__init__.py | 2 +- .../coder/factor_coder/evolving_strategy.py | 2 +- .../coder/model_coder/evolving_strategy.py | 2 +- .../data_science/proposal/exp_gen.py | 43 +++++++------------ 9 files changed, 36 insertions(+), 45 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py index e22575363..99432ac5c 100644 --- a/rdagent/components/coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/CoSTEER/evolving_strategy.py @@ -13,7 +13,6 @@ from rdagent.core.evaluation import Scenario from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge from rdagent.core.experiment import FBWorkspace - from rdagent.core.prompts import Prompts from rdagent.core.scenario import Task from rdagent.core.utils import multiprocessing_wrapper @@ -97,7 +96,10 @@ def evolve( result = multiprocessing_wrapper( [ - (self.implement_one_task, (evo.sub_tasks[target_index], queried_knowledge, evo.sub_workspace_list[target_index])) + ( + self.implement_one_task, + (evo.sub_tasks[target_index], queried_knowledge, evo.sub_workspace_list[target_index]), + ) for target_index in to_be_finished_task_index ], n=RD_AGENT_SETTINGS.multi_proc_n, diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 770f872f4..5eafdd304 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -11,11 +11,11 @@ ) from rdagent.components.coder.data_science.feature.eval import FeatureCoSTEEREvaluator from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -from rdagent.core.experiment import FBWorkspace class FeatureMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( @@ -30,7 +30,9 @@ def implement_one_task( # 2. 
code system_prompt = T(".prompts:feature.system").r() - user_prompt = T(".prompts:feature.user").r(competition_info=competition_info, feature_spec=workspace.code_dict["spec/feature.md"]) + user_prompt = T(".prompts:feature.user").r( + competition_info=competition_info, feature_spec=workspace.code_dict["spec/feature.md"] + ) feature_code = json.loads( APIBackend().build_messages_and_create_chat_completion( diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 6306d0fb4..41b308468 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -16,9 +16,7 @@ def develop_one_competition(competition: str): # -> experiment scen = DataScienceScen(competition=competition) feature_coder = FeatureCoSTEER(scen) - with open( - "./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r" - ) as file: + with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r") as file: feat_spec = file.read() # Create the experiment @@ -27,9 +25,7 @@ def develop_one_competition(competition: str): # -> experiment sub_tasks=[ft], ) - with open( - "./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r" - ) as file: + with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r") as file: load_data_code = file.read() exp.experiment_workspace.inject_code(**{"load_data.py": load_data_code}) diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py index a19d949b2..a12421b0e 100644 --- a/rdagent/components/coder/data_science/model/es.py +++ b/rdagent/components/coder/data_science/model/es.py @@ -11,10 +11,10 @@ CoSTEERQueriedKnowledgeV2, ) from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend -from rdagent.core.experiment import FBWorkspace coder_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") @@ -95,8 +95,8 @@ def implement_one_task( json_mode=True, ), )["code"] - return{ - "model01.py":model_code, + return { + "model01.py": model_code, } """ import pandas as pd @@ -118,4 +118,4 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace evo.sub_workspace_list[index].inject_code(**code_list[index]) - return evo \ No newline at end of file + return evo diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index 57f8221d0..1fa6b37b7 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -23,7 +23,9 @@ # Call model_workflow -val_pred, test_pred, hypers = model_workflow(X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=test_X, hyper_params={}) +val_pred, test_pred, hypers = model_workflow( + X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=test_X, hyper_params={} +) # val_pred = np.random.rand(8, 1) test_pred = np.random.rand(8, 1) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py 
b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 88f4eabc2..31beb63c2 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -37,10 +37,10 @@ DataLoaderCoSTEEREvaluator, ) from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -from rdagent.core.experiment import FBWorkspace class DataLoaderMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): diff --git a/rdagent/components/coder/factor_coder/evolving_strategy.py b/rdagent/components/coder/factor_coder/evolving_strategy.py index 16b34ef21..0496d83d9 100644 --- a/rdagent/components/coder/factor_coder/evolving_strategy.py +++ b/rdagent/components/coder/factor_coder/evolving_strategy.py @@ -14,10 +14,10 @@ ) from rdagent.components.coder.factor_coder.config import FACTOR_COSTEER_SETTINGS from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace, FactorTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend -from rdagent.core.experiment import FBWorkspace implement_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") diff --git a/rdagent/components/coder/model_coder/evolving_strategy.py b/rdagent/components/coder/model_coder/evolving_strategy.py index d9b7f56af..8aa5bca1c 100644 --- a/rdagent/components/coder/model_coder/evolving_strategy.py +++ b/rdagent/components/coder/model_coder/evolving_strategy.py @@ -16,10 +16,10 @@ ModelFBWorkspace, ModelTask, ) +from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend -from rdagent.core.experiment import FBWorkspace coder_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index ea964f913..488ea10d2 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -7,8 +7,8 @@ from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import Experiment -from rdagent.core.proposal import ExpGen, Hypothesis, Trace, HypothesisFeedback from rdagent.core.knowledge_base import KnowledgeBase +from rdagent.core.proposal import ExpGen, Hypothesis, HypothesisFeedback, Trace from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.data_science.experiment.experiment import ( DataLoaderExperiment, @@ -23,6 +23,7 @@ COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] ORDER = COMPONENT.__args__ + class DSHypothesis(Hypothesis): def __init__( self, @@ -56,7 +57,9 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = self.hist: list[tuple[DSHypothesis, Experiment, HypothesisFeedback]] = [] self.knowledge_base = knowledge_base - def get_sota_hypothesis_and_experiment(self, component: COMPONENT | None = None) -> tuple[DSHypothesis | None, Experiment | None]: + def get_sota_hypothesis_and_experiment( + self, component: 
COMPONENT | None = None + ) -> tuple[DSHypothesis | None, Experiment | None]: """Access the last experiment result, sub-task, and the corresponding hypothesis.""" for h, exp, hf in self.hist[::-1]: if hf.decision: @@ -68,13 +71,15 @@ def get_sota_hypothesis_and_experiment(self, component: COMPONENT | None = None) class DSExpGen(ExpGen): """Data Science Task Generator.""" - def llm_task_gen(self, - targets: str, - scenario_desc: str, - task_output_format: str, - hypothesis: Hypothesis | None = None, - hypothesis_and_feedback: str | None = None - ) -> dict: + + def llm_task_gen( + self, + targets: str, + scenario_desc: str, + task_output_format: str, + hypothesis: Hypothesis | None = None, + hypothesis_and_feedback: str | None = None, + ) -> dict: system_prompt = T(".prompts:task_gen.system").r( targets=targets, scenario=scenario_desc, @@ -92,7 +97,7 @@ def llm_task_gen(self, user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) ) - + return resp_dict def gen(self, trace: DSTrace) -> Experiment: @@ -166,7 +171,6 @@ def is_complete(): ) dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") - spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] tasks = [] for fn in resp_dict: ft = FeatureTask( @@ -174,7 +178,6 @@ def is_complete(): description=resp_dict[fn].get("description", "Factor description not provided"), formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), variables=resp_dict[fn].get("variables", "Variables not provided"), - spec=spec, ) exp = FeatureExperiment(sub_tasks=tasks, hypothesis=hypothesis) @@ -190,13 +193,11 @@ def is_complete(): ) dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng") - spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), - spec=spec, base_code="", ) @@ -213,11 +214,9 @@ def is_complete(): ) dependency_exp = trace.get_sota_hypothesis_and_experiment("Model") - spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), - spec=spec, ) exp = EnsembleExperiment(sub_tasks=[et], hypothesis=hypothesis) @@ -233,11 +232,9 @@ def is_complete(): ) dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble") - spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), - spec=spec, ) exp = WorkflowExperiment(sub_tasks=[wt], hypothesis=hypothesis) @@ -270,7 +267,6 @@ def is_complete(): task_output_format=T(".prompts:output_format.feature").r(), ) dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") - spec = dependency_exp.experiment_workspace.code_dict["spec/feature.md"] tasks = [] for fn in resp_dict: ft = FeatureTask( @@ -278,7 +274,6 @@ def is_complete(): description=resp_dict[fn].get("description", "Factor description not provided"), formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), variables=resp_dict[fn].get("variables", "Variables not provided"), - spec=spec, ) tasks.append(ft) exp = FeatureExperiment(sub_tasks=tasks) @@ 
-291,8 +286,7 @@ def is_complete(): task_output_format=T(".prompts:output_format.model").r(), ) dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng") - spec = dependency_exp.experiment_workspace.code_dict["spec/model.md"] - if last_model_exp:=trace.get_sota_hypothesis_and_experiment("Model"): + if last_model_exp := trace.get_sota_hypothesis_and_experiment("Model"): # TODO: model only have one (named "model.py")? base_code = last_model_exp.experiment_workspace.code_dict["model.py"] else: @@ -302,7 +296,6 @@ def is_complete(): description=resp_dict.get("description", "Model description not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), - spec=spec, base_code=base_code, ) exp = ModelExperiment(sub_tasks=[mt]) @@ -315,11 +308,9 @@ def is_complete(): task_output_format=T(".prompts:output_format.ensemble").r(), ) dependency_exp = trace.get_sota_hypothesis_and_experiment("Model") - spec = dependency_exp.experiment_workspace.code_dict["spec/ensemble.md"] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), - spec=spec, ) exp = EnsembleExperiment(sub_tasks=[et]) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) @@ -331,11 +322,9 @@ def is_complete(): task_output_format=T(".prompts:output_format.workflow").r(), ) dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble") - spec = dependency_exp.experiment_workspace.code_dict["spec/workflow.md"] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), - spec=spec, ) exp = WorkflowExperiment(sub_tasks=[wt]) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) From b7aad318e8b364b9a01e28c82b8d2af601cfa57b Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 18 Dec 2024 02:53:28 +0000 Subject: [PATCH 078/304] spell check --- rdagent/app/data_science/loop.py | 2 +- rdagent/components/coder/data_science/feature/eval.py | 2 +- rdagent/components/coder/data_science/workflow/prompts.yaml | 2 +- rdagent/utils/agent/tpl.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 8bc17667c..81d44464c 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -97,7 +97,7 @@ def coding(self, prev_out: dict[str, Any]): @measure_time def running(self, prev_out: dict[str, Any]): if not self.exp_gen.is_complete(): - raise NextLoopExcpetion() + raise NextLoopException() if prev_out["direct_exp_gen"]["propose"].action in [ KG_ACTION_FEATURE_ENGINEERING, diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index a083fbed4..e9d922367 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -52,7 +52,7 @@ def evaluate( ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) - # TODO: do we need to clean the generated tempory content? + # TODO: do we need to clean the generated temporary content? 
fname = "feature_test.py" with (DIRNAME / "eval_tests" / "feature_test.py").open("r") as f: test_code = f.read() diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 522a45252..4a38c5dd3 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -31,4 +31,4 @@ workflow_coder: {{ model_code }} ---------ensemble code--------- - {{ ensembloe_code }} + {{ ensemble_code }} diff --git a/rdagent/utils/agent/tpl.py b/rdagent/utils/agent/tpl.py index 6b4e24c2a..2323b1106 100644 --- a/rdagent/utils/agent/tpl.py +++ b/rdagent/utils/agent/tpl.py @@ -1,7 +1,7 @@ """ -Here are some infrastruture to build a agent +Here are some infrastructure to build a agent -The motivation of tempalte and AgentOutput Design +The motivation of template and AgentOutput Design """ import inspect From 12f12177d906c0751f8b58d336a9ba4d45c6a1fb Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Wed, 18 Dec 2024 06:34:05 +0000 Subject: [PATCH 079/304] refine ds modal for more cases: eval and es --- .../coder/CoSTEER/evolving_strategy.py | 2 +- .../components/coder/data_science/model/es.py | 7 +- .../model/eval_tests/model_execute.py | 4 +- .../coder/data_science/model/prompts.yaml | 75 +++++++++++++++++-- .../spec/model.md | 15 ++-- 5 files changed, 84 insertions(+), 19 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py index 99432ac5c..7275e2a9d 100644 --- a/rdagent/components/coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/CoSTEER/evolving_strategy.py @@ -98,7 +98,7 @@ def evolve( [ ( self.implement_one_task, - (evo.sub_tasks[target_index], queried_knowledge, evo.sub_workspace_list[target_index]), + (evo.sub_tasks[target_index], queried_knowledge, evo.experiment_workspace), ) for target_index in to_be_finished_task_index ], diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py index a12421b0e..4408828df 100644 --- a/rdagent/components/coder/data_science/model/es.py +++ b/rdagent/components/coder/data_science/model/es.py @@ -48,15 +48,13 @@ def implement_one_task( system_prompt = ( Environment(undefined=StrictUndefined) .from_string( - coder_prompts["evolving_strategy_model_coder"]["system"], + coder_prompts["model_coder"]["system"], ) .render( # scenario=self.scen.get_scenario_all_desc(filtered_tag=target_task.model_type), # TODO: fit new scenario information - scenario=("No scenario description."), spec=workspace.code_dict["spec/model.md"], queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, - current_code=target_task.base_code, ) ) @@ -65,12 +63,13 @@ def implement_one_task( user_prompt = ( Environment(undefined=StrictUndefined) .from_string( - coder_prompts["evolving_strategy_model_coder"]["user"], + coder_prompts["model_coder"]["user"], ) .render( model_information_str=model_information_str, queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render, queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, + current_code=target_task.base_code, ) .strip("\n") ) diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index d0aef55ac..47654e961 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ 
b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -24,7 +24,7 @@ print("The first execution begins.\n") # Call model_workflow val_pred, test_pred, hypers = model_workflow( - X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=test_X, hyper_params={} + X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=None, ) # val_pred = np.random.rand(8, 1) # test_pred = np.random.rand(8, 1) @@ -45,7 +45,7 @@ print(execution_feedback_str) print("The second execution begins.\n") -val_pred, test_pred, finalhypers = model_workflow(X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, hyper_params=hypers) +val_pred, test_pred, finalhypers = model_workflow(X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, **hypers) execution_feedback_str = "The second Execution successful.\n" if val_pred is not None: execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 02c4dc413..988a9bb21 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -1,4 +1,4 @@ -evolving_strategy_model_coder: +evolving_strategy_model_coder_previous_version: system: |- User is trying to implement some pytorch models in the following scenario: {{ scenario }} @@ -19,7 +19,7 @@ evolving_strategy_model_coder: ```python {{ current_code }} ``` - Your code should be very similar to the former code which means your code should be ninety more percent same as the former code! You should not modify the right part of the code. + You should not modify the right part of the code. {% else %} User has not write any code before. You should write the new code from scratch. {% endif %} @@ -61,6 +61,67 @@ evolving_strategy_model_coder: {% endfor %} {% endif %} +model_coder: + system: |- + You are tasked with implementing PyTorch models based on specific requirements provided by the user. The user’s ultimate goal is to obtain accurate predictions from the model on input data. Follow the instructions below to ensure your response is correct and aligned with the user’s expectations. + + Instructions for Code Generation: + Specification Compliance: + The user has provided a detailed framework or set of specifications under {{ spec }}. Your code must strictly adhere to this specification, including any required classes, methods, and organizational structure. Do not implement or add anything outside the scope of the provided specification. + + Leveraging User Inputs: + The user may provide various forms of additional information to guide you: + + Successful Examples: Correct implementations of similar models. + Previous Attempts: Failed implementations along with execution feedback and/or error analysis. + Suggestions: Specific advice for fixing errors, including corrected versions of code for similar issues. + Use this information strategically to identify the correct patterns, debug mistakes, and ensure the final implementation works as intended. + + Preserving Correct Code: + If the user has shared their latest code, carefully analyze it and only modify parts that require changes. Do not alter correct sections of the code. + + Error Learning: + If previous failed attempts and their feedback are available, learn from them. Understand what went wrong and avoid repeating similar mistakes in your new implementation. 
+ + Formatting Your Response: + Return only the code in a JSON format as shown below. Do not include any explanations or extra text. Example: + { + "code": "Your corrected or newly implemented Python code as a single string" + } + user: |- + Here is all the relevant information for this task: + + Target Model Details: + {{ model_information_str }} + + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.code }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + + + {% if current_code is not none %} + --------------Latest Code:-------------- + {{ current_code }} + {% else %} + No prior code has been implemented. + {% endif %} + + model_eval: system: |- You are data scientist. @@ -74,8 +135,10 @@ model_eval: {{test_code}} ``` The first time you execute it, you will not provide test inputs, only train, valid inputs, and empty hyperparameters. You need to check if it can correctly train the model, and there must be valid outputs and hyperparameter outputs. - The hyperparameters returned must not be none. It should has parameters that will be useful for retrain later, for example, the number of iteration when the first round training early stopped. - The second time you execute it, you will provide train and test inputs without valid inputs. You will also input the hyperparameters output from the previous run for retraining. You need to check if these hyperparameters are used in the model code below and if it can correctly output the test predictions. + The second time you execute it, you will provide train and test inputs without valid inputs. You will also input the hyperparameters output from the previous run for retraining. + Therefore, during the evaluate you must check: + - The hyperparameters returned must not be none. It should has parameters that will be useful for retrain later. It must include the early stop round. + - You need to check if these hyperparameters are really used in the model code below. The early stop round must be used if given. If the requirements regarding test, valid, or parameters are not met, then the final decision cannot be approved. You should evaluate the code given by user. You should concern about whether the user implement it correctly, including whether the shape of model's output is aligned with request, the equality of code, and any other thing you think necessary. @@ -86,8 +149,8 @@ model_eval: ```json { "execution": "Describe whether the model execute successfully, including any errors or issues encountered.", - "return_checking": "Checks about the generated value, including whether the value generated. Especially compare the shape of model output and the requirement in spec.md.", you also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model. 
- "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "return_checking": "Checks about the generated value, including whether the value generated and comparing the shape of model output and the requirement in spec.md.". You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model. + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code", compare the parameters name in stdout and if it is used in retraining part of code. "final_decision": } ``` diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md index 3cff0cf6e..04c24dad5 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md @@ -3,16 +3,17 @@ - Implement a function to manage the model workflow with the following signature: ```python -def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params; dict = {}) -> tuple[np.ndarray | None, np.ndarray | None, dict]: +def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params) -> tuple[np.ndarray | None, np.ndarray | None, dict]: """ - Manages the workflow of a machine learning model, including training, validation. + Manages the workflow of a machine learning model, including training, validation The testing&validation's inference is included, as well - If test/valid exist, output inference on them - - Follow the hyperparameter if exists. + - Follow the hyperparameter if exists + - Hyperparameters at least has . The code must check if it is given and use it. - the returned hyperparameter should align with the input(except the newly generated early stop) - - Return hyperparameters for retrain if not exists. - - If valid exist, add to update the hyperparameter + - Return hyperparameters for retrain if not exists. Hyperparameters should have + - If valid exist, add to update the hyperparameter Parameters @@ -36,6 +37,8 @@ def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y Predictions on the validation data, predictions on the test data """ ``` -- In this task, the shape of output should be (batch_size, num_class), as num_class = 1 here. +- In this task, the shape of input(X of train, valid and test) should be (num_samples, height, width, channels). + +- In this task, the shape of output should be (num_samples, num_class), as num_class = 1 here. - The function should handle data augmentation, model creation, training, and prediction. 
From e86aa733427cc59bafe60741c19c6cc776141f8f Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Wed, 18 Dec 2024 07:10:34 +0000 Subject: [PATCH 080/304] update model template --- .../tpl_ex/aerial-cactus-identification/model01.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py index da2af05a0..0a22c8d51 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py @@ -112,8 +112,11 @@ def model_workflow( metrics=["accuracy"], ) + # Extract early_stop_round from hyper_params, default is 25 + early_stop_round = hyper_params.get("early_stop_round", 25) + callbacks = [ - EarlyStopping(monitor="val_loss", patience=hyper_params.get("patience", 25)), + EarlyStopping(monitor="val_loss", patience=early_stop_round), ModelCheckpoint(filepath="best_model.keras", monitor="val_loss", save_best_only=True), ] @@ -130,6 +133,15 @@ def model_workflow( shuffle=True, callbacks=callbacks, ) + # Dynamic adjustment of early_stop_round + if "early_stop_round" not in hyper_params: + val_loss = history.history["val_loss"] + best_epoch = np.argmin(val_loss) + dynamic_early_stop = max(5, int((len(val_loss) - best_epoch) * 0.5)) # 50% of remaining epochs + + print(f"Dynamic early_stop_round: {dynamic_early_stop}") + hyper_params["early_stop_round"] = dynamic_early_stop + # Predict on validation data val_pred = model.predict(validation_datagen.flow(validation_images, batch_size=1, shuffle=False), verbose=1) else: From cf5e18cba219aabc9c5593ede7aaff7fc04fbb0f Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 18 Dec 2024 08:13:15 +0000 Subject: [PATCH 081/304] prompts for model and ensemble --- .../data_science/raw_data_loader/prompts.yaml | 68 ++++++++++++++----- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 1cad289cc..6541b8602 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -28,7 +28,7 @@ spec: - Mentions the source of the data (e.g., data location or structure). - Explains the expected output format. 2. Precautions for Data Loading and Preprocessing: - - Handle potential issues such as: + - Handle potential issues such as (You should depend on the competition information to make a concise specification): - File encoding (e.g., UTF-8) and data delimiters (e.g., CSV comma-separated). - Missing values in datasets: describe how they should be handled (e.g., fill with a specific value, drop rows, etc.). - Data types: ensure proper type conversion (e.g., numeric columns, date parsing). @@ -54,6 +54,7 @@ spec: - Must include proper and specific annotations for both input and output based on the Competition Information: - Input: Specify the expected input data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). - Output: Specify the transformed output data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). + - You should depend on the competition information to make a concise specification. - A comprehensive docstring must be provided that: - Describes the purpose of the function. - Clarifies the input parameters and their types. 
@@ -74,31 +75,66 @@ spec: } model: |- - Model building specification text should include two parts: - 1. function interface: - - function name must be "model_workflow". - - have annotations for the input and output. - - have a docstring that describes the function. + Model building specification text should adhere to the following requirements: + 1. Function Interface: + - The function name must be `model_workflow`. + - The function should include: + - Type annotations for all inputs and outputs. + - Input and output shapes: + - Input: + - `X`: A 4D NumPy array of shape `(num_samples, height, width, channels)`, where: + - `num_samples`: Number of training samples. + - `height` and `width`: Dimensions of the image (e.g., `224 x 224`). + - `channels`: Number of color channels (e.g., `3` for RGB). + - `y`: A 2D NumPy array of shape `(num_samples, 1)`, where `1` represents binary classification labels. + - Optional: + - `val_X`: Validation features of shape `(num_val_samples, height, width, channels)`. + - `val_y`: Validation labels of shape `(num_val_samples, 1)`. + - `test_X`: Test features of shape `(num_test_samples, height, width, channels)`. + - `**hyper_params`: A dictionary of important hyperparameters for model configuration. + - Output: + - A tuple consisting of: + - `pred_val`: Predictions on validation data (`np.ndarray` of shape `(num_val_samples, 1)` or `None`). + - `pred_test`: Predictions on test data (`np.ndarray` of shape `(num_test_samples, 1)` or `None`). + - `metrics`: A dictionary containing evaluation metrics, such as accuracy or loss. + + - Include a clear and concise docstring to explain the function’s purpose, its input parameters, and its expected return values. + 2. Precautions: - some precautions for model building. + - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have the correct shapes and consistent dimensions. + - Use default values for hyperparameters if none are provided in `**hyper_params`. + - Perform model training on `X` and `y`, and evaluate using `val_X` and `val_y`. + - If `test_X` is provided, generate predictions for it. + - Return a dictionary containing important metrics like validation loss, accuracy, or other evaluation results. - Please response the specification in the following json format. Here is an example structure for the JSON output: + Please respond in the following JSON format: { - "spec": "The specification as a string." + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." } + ensemble: |- Ensemble specification text should include two parts: - 1. function interface: - - function name must be "ensemble". - - have annotations for the input and output. - - have a docstring that describes the function. + 1. Function Interface: + - The function name must be `ens_and_decision`. + - The function should include: + - Type annotations for both inputs and outputs. + - Input (for example): + - `test_pred_l`: A list of NumPy arrays (as an example, if you think predictions should be represented as Pandas DataFrames, use `pd.DataFrame`) containing predictions for the test data. + - `val_pred_l`: A list of NumPy arrays containing predictions for the validation data. + - `val_label`: A 1D NumPy array of true labels for the validation data. + - Output: + - A 1D NumPy array containing the final binary predictions for the test data. + - Include a docstring that describes the purpose of the function, the parameters, and the expected return value. 
+ 2. Precautions: - some precautions for ensemble. + - Ensure all predictions in `test_pred_l` and `val_pred_l` have the same shape and dimensions. + - Validate that `val_label` is provided and has the same length as `val_pred_l` predictions. + - Perform checks to handle empty or invalid inputs gracefully. - Please response the specification in the following json format. Here is an example structure for the JSON output: + Please respond in the following JSON format: { - "spec": "The specification as a string." + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." } workflow: |- From 81b27b4e0c370c9adfd7b49889124c4c18ffefd3 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 18 Dec 2024 08:39:57 +0000 Subject: [PATCH 082/304] fix a bug --- .../components/coder/data_science/raw_data_loader/prompts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 6541b8602..7534a381a 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -96,7 +96,7 @@ spec: - A tuple consisting of: - `pred_val`: Predictions on validation data (`np.ndarray` of shape `(num_val_samples, 1)` or `None`). - `pred_test`: Predictions on test data (`np.ndarray` of shape `(num_test_samples, 1)` or `None`). - - `metrics`: A dictionary containing evaluation metrics, such as accuracy or loss. + - `hyper_params`: A dictionary of important hyperparameters for model configuration. - Include a clear and concise docstring to explain the function’s purpose, its input parameters, and its expected return values. From dc8f71c29f5cd3594e50f1046a93aede3e09b9c8 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 18 Dec 2024 08:44:57 +0000 Subject: [PATCH 083/304] fix a bug --- .../coder/data_science/raw_data_loader/prompts.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 7534a381a..fbba49ffb 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -91,7 +91,7 @@ spec: - `val_X`: Validation features of shape `(num_val_samples, height, width, channels)`. - `val_y`: Validation labels of shape `(num_val_samples, 1)`. - `test_X`: Test features of shape `(num_test_samples, height, width, channels)`. - - `**hyper_params`: A dictionary of important hyperparameters for model configuration. + - `hyper_params`: A dictionary of important hyperparameters for model configuration. - Output: - A tuple consisting of: - `pred_val`: Predictions on validation data (`np.ndarray` of shape `(num_val_samples, 1)` or `None`). @@ -102,10 +102,10 @@ spec: 2. Precautions: - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have the correct shapes and consistent dimensions. - - Use default values for hyperparameters if none are provided in `**hyper_params`. + - Use default values for hyperparameters if none are provided in `hyper_params`. + - Return hyperparameters for retrain if not exists. - Perform model training on `X` and `y`, and evaluate using `val_X` and `val_y`. 
- If `test_X` is provided, generate predictions for it. - - Return a dictionary containing important metrics like validation loss, accuracy, or other evaluation results. Please respond in the following JSON format: { From b6acea3eb2aea1875c7cbcad03cdc8f13cdacdf9 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Wed, 18 Dec 2024 09:27:17 +0000 Subject: [PATCH 084/304] init: ds workflow evovingstrategy --- .../coder/data_science/model/prompts.yaml | 63 ------------------- .../coder/data_science/workflow/es.py | 36 +++++++++-- .../coder/data_science/workflow/exp.py | 3 +- .../coder/data_science/workflow/prompts.yaml | 11 ++-- .../coder/data_science/workflow/test.py | 10 +-- 5 files changed, 40 insertions(+), 83 deletions(-) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 988a9bb21..681e4c129 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -1,66 +1,3 @@ -evolving_strategy_model_coder_previous_version: - system: |- - User is trying to implement some pytorch models in the following scenario: - {{ scenario }} - Your code is expected to align the scenario in any form which means The user needs to get the prediction of the model based on the input data. - - To help you write the correct code, the user might provide multiple information that helps you write the correct code: - 1. The user might provide you the correct code to similar models. Your should learn from these code to write the correct code. - 2. The user might provide you the failed former code and the corresponding feedback to the code. The feedback contains to the execution, the code and the model output value. You should analyze the feedback and try to correct the latest code. - 3. The user might provide you the suggestion to the latest fail code and some similar fail to correct pairs. Each pair contains the fail code with similar error and the corresponding corrected version code. You should learn from these suggestion to write the correct code. - - The user will also provide some information about how to organize the whole code and give instructions. These information are as below, and the code you implement should align the framework given below: - {{ spec }} - - Your must write your code based on your former latest attempt below which consists of your former code and code feedback, you should read the former attempt carefully and must not modify the right part of your former code. - - {% if current_code is not none %} - User has write some code before. You should write the new code based on this code. Here is the latest code: - ```python - {{ current_code }} - ``` - You should not modify the right part of the code. - {% else %} - User has not write any code before. You should write the new code from scratch. - {% endif %} - - {% if queried_former_failed_knowledge|length != 0 %} - --------------Your former latest attempt:--------------- - =====Code to the former implementation===== - {{ queried_former_failed_knowledge[-1].implementation.code }} - =====Feedback to the former implementation===== - {{ queried_former_failed_knowledge[-1].feedback }} - {% endif %} - - Please response the code in the following json format. Here is an example structure for the JSON output: - { - "code": "The Python code as a string." 
- } - - user: |- - --------------Target model information:--------------- - {{ model_information_str }} - - {% if queried_similar_successful_knowledge|length != 0 %} - --------------Correct code to similar models:--------------- - {% for similar_successful_knowledge in queried_similar_successful_knowledge %} - =====Model {{loop.index}}:===== - {{ similar_successful_knowledge.target_task.get_task_information() }} - =====Code:===== - {{ similar_successful_knowledge.implementation.code }} - {% endfor %} - {% endif %} - - {% if queried_former_failed_knowledge|length != 0 %} - --------------Former failed code:--------------- - {% for former_failed_knowledge in queried_former_failed_knowledge %} - =====Code to implementation {{ loop.index }}===== - {{ former_failed_knowledge.implementation.code }} - =====Feedback to implementation {{ loop.index }}===== - {{ former_failed_knowledge.feedback }} - {% endfor %} - {% endif %} - model_coder: system: |- You are tasked with implementing PyTorch models based on specific requirements provided by the user. The user’s ultimate goal is to obtain accurate predictions from the model on input data. Follow the instructions below to ensure your response is correct and aligned with the user’s expectations. diff --git a/rdagent/components/coder/data_science/workflow/es.py b/rdagent/components/coder/data_science/workflow/es.py index 58ce94ece..6c62b288f 100644 --- a/rdagent/components/coder/data_science/workflow/es.py +++ b/rdagent/components/coder/data_science/workflow/es.py @@ -5,25 +5,51 @@ from rdagent.components.coder.CoSTEER.evolving_strategy import ( MultiProcessEvolvingStrategy, ) +from rdagent.core.experiment import FBWorkspace +from rdagent.utils.agent.tpl import T class WorkflowMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( self, target_task: WorkflowTask, queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, ) -> dict[str, str]: - competition_info = self.scen.competition_descriptions + # competition_info = self.scen.competition_descriptions - system_prompt = T(".prompts:workflow_coder.system").r() + system_prompt = T(".prompts:workflow_coder.system").r( + workflow_spec=workspace.code_dict["spec/workflow.md"] + ) user_prompt = T(".prompts:workflow_coder.user").r( - competition_info=competition_info, + load_data_code=workspace.code_dict["load_data.py"], + feature_code=workspace.code_dict["feat01.py"], + model_code=workspace.code_dict["model01.py"], + ensemble_code=workspace.code_dict["ens.py"], ) - data_loader_code = json.loads( APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) )["code"] - return + return{ + "main.py": data_loader_code + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. 
+ """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_code(**code_list[index]) + return evo + diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py index 794282cfe..f2934ee6f 100644 --- a/rdagent/components/coder/data_science/workflow/exp.py +++ b/rdagent/components/coder/data_science/workflow/exp.py @@ -13,9 +13,10 @@ def __init__( self, name: str, description: str, + *args, **kwargs, ) -> None: - pass + super().__init__(name=name, description=description, *args, **kwargs) @staticmethod def from_dict(dict): diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 4a38c5dd3..9e651c82d 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -7,7 +7,10 @@ workflow_coder: Your task is to Integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow. This workflow code is also a Python file, and it functions similarly to a main process that calls the sub-files for each step and ultimately outputs a prediction file. - The user will also provide some specifications about how to organize the whole code and give instructions. The code you implement should align the framework given in specifications. + The user will also provide some specifications about how to organize the whole code and give instructions. + These specifications are as below: + {{ workflow_spec }} + The code you implement should align the framework given in specifications. Please response the code in the following json format. 
Here is an example structure for the JSON output: { @@ -15,12 +18,6 @@ workflow_coder: } user: |- - ---------Competition Information--------- - {{ competition_info }} - - ---------Workflow Specification--------- - {{ workflow_spec }} - ---------load data code--------- {{ load_data_code }} diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index d5d08388f..382d0ae98 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -24,10 +24,7 @@ def develop_one_competition(competition: str): wt = WorkflowTask( name="WorkflowTask", description="Integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow.", - spec="", - base_code={ - - } + base_code="", ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition @@ -38,14 +35,13 @@ def develop_one_competition(competition: str): file_path = tpl_ex_path / file_name workflowexp.inject_code(**{file_name: file_path.read_text()}) - wt.spec += workflowexp.code_dict["spec/model.md"] - wt.base_code += workflowexp.code_dict["model01.py"] + wt.base_code += workflowexp.code_dict["main.py"] exp = WorkflowExperiment( sub_tasks=[wt], ) es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - new_code = es.implement_one_task(target_task=wt, queried_knowledge=None) + new_code = es.implement_one_task(target_task=wt, queried_knowledge=None, workspace = workflowexp) print(new_code) From 7f70ce2824265b9e871620a503e7f9048cdd8764 Mon Sep 17 00:00:00 2001 From: Xisen Wang <118058822+xisen-w@users.noreply.github.com> Date: Wed, 18 Dec 2024 19:53:05 +0800 Subject: [PATCH 085/304] Adding ensemble (#505) * Initial Draft * Updating logic for init * Revising * Successful Testing * Updating to use the latest & right class * bug: bug-fixing for testing --- .../coder/data_science/ensemble/__init__.py | 119 +++++++++++++++--- .../coder/data_science/ensemble/conf.py | 2 + .../coder/data_science/ensemble/eval.py | 57 +++++++++ .../ensemble/eval_tests/ensemble_test.py | 40 ++++++ .../coder/data_science/ensemble/exp.py | 9 +- .../coder/data_science/ensemble/prompts.yaml | 32 +++++ .../coder/data_science/ensemble/test.py | 52 ++++++++ 7 files changed, 291 insertions(+), 20 deletions(-) create mode 100644 rdagent/components/coder/data_science/ensemble/conf.py create mode 100644 rdagent/components/coder/data_science/ensemble/eval.py create mode 100644 rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py create mode 100644 rdagent/components/coder/data_science/ensemble/prompts.yaml create mode 100644 rdagent/components/coder/data_science/ensemble/test.py diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index 40d21d3e5..6727477e3 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -1,19 +1,100 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class ModelEnsembleCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# ModelEnsembleCoSTEEREvaluator(scen=scen), 
scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = ModelEnsembleMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) +""" +File structure +- ___init__.py: the entrance/agent of coder +- evaluator.py +- conf.py +- exp.py: everything under the experiment, e.g. + - Task + - Experiment + - Workspace +- test.py + - Each coder could be tested. +""" + +import json + +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) +from rdagent.components.coder.data_science.ensemble.eval import ( + EnsembleCoSTEEREvaluator, +) +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + +from rdagent.core.experiment import FBWorkspace + + +class EnsembleMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: EnsembleTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + # return a workspace with "ensemble.py" inside + competition_info = self.scen.get_scenario_all_desc() + # Generate code + system_prompt = T(".prompts:ensemble_coder.system").r(competition_info=competition_info) + user_prompt = T(".prompts:ensemble_coder.user").r(ensemble_spec=workspace.code_dict["spec/ensemble.md"]) + + ensemble_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True + ) + )["code"] + + return { + "ensemble.py": ensemble_code, + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. 
+ """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_code(**code_list[index]) + return evo + + +class EnsembleCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + EnsembleCoSTEEREvaluator(scen=scen), scen=scen + ) + es = EnsembleMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__( + *args, + settings=CoSTEER_SETTINGS, + eva=eva, + es=es, + evolving_version=2, + scen=scen, + **kwargs + ) + + diff --git a/rdagent/components/coder/data_science/ensemble/conf.py b/rdagent/components/coder/data_science/ensemble/conf.py new file mode 100644 index 000000000..5ba4cd60f --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/conf.py @@ -0,0 +1,2 @@ +# Configuration file for ensemble component +# Currently empty as no specific configuration is needed diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py new file mode 100644 index 000000000..fc2476c53 --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -0,0 +1,57 @@ +import json +from dataclasses import dataclass +from pathlib import Path +import numpy as np + +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERSingleFeedback, +) +from rdagent.core.evaluation import Feedback +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DSDockerConf, DockerEnv + +DIRNAME = Path(__file__).absolute().resolve().parent + +EnsembleEvalFeedback = CoSTEERSingleFeedback + +class EnsembleCoSTEEREvaluator(CoSTEEREvaluator): + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> EnsembleEvalFeedback: + + target_task_information = target_task.get_task_information() + if (queried_knowledge is not None and + target_task_information in queried_knowledge.success_task_to_knowledge_dict): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return EnsembleEvalFeedback( + execution="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + final_decision=False, + ) + + de = DockerEnv(conf=DSDockerConf()) + + fname = "ensemble_test.py" + with (DIRNAME / "eval_tests" / "ensemble_test.py").open("r") as f: + test_code = f.read() + implementation.inject_code(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") + + system_prompt = T(".prompts:ensemble_eval.system").r(test_code=test_code) + user_prompt = T(".prompts:ensemble_eval.user").r(stdout=stdout) + + resp = APIBackend().build_messages_and_create_chat_completion( + user_prompt, system_prompt, json_mode=True + ) + return EnsembleEvalFeedback(**json.loads(resp)) diff --git 
a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py new file mode 100644 index 000000000..cc40fc7e0 --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -0,0 +1,40 @@ +""" +A qualified ensemble implementation should: +- Successfully run +- Return binary predictions +- Have correct shapes for inputs and outputs +- Use validation data appropriately +""" +import numpy as np +import logging +from ensemble import ens_and_decision + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Create test data +n_models = 3 +n_samples = 100 + +# Create synthetic predictions +test_pred_l = [np.random.rand(n_samples, 1) for _ in range(n_models)] +val_pred_l = [np.random.rand(n_samples, 1) for _ in range(n_models)] +val_label = np.random.randint(0, 2, (n_samples, 1)) + +# Run ensemble +try: + final_predictions = ens_and_decision(test_pred_l, val_pred_l, val_label) + + # Check shape + assert final_predictions.shape == (n_samples, 1), "Wrong output shape" + + # Check binary values + assert np.all(np.isin(final_predictions, [0, 1])), "Predictions must be binary (0 or 1)" + + logging.info("Ensemble test passed successfully.") + logging.info(f"Output shape: {final_predictions.shape}") + logging.info(f"Unique values in predictions: {np.unique(final_predictions)}") + +except Exception as e: + logging.error(f"Test failed: {str(e)}") + raise \ No newline at end of file diff --git a/rdagent/components/coder/data_science/ensemble/exp.py b/rdagent/components/coder/data_science/ensemble/exp.py index 75f8d2055..f6ef9b69a 100644 --- a/rdagent/components/coder/data_science/ensemble/exp.py +++ b/rdagent/components/coder/data_science/ensemble/exp.py @@ -13,9 +13,16 @@ def __init__( self, name: str, description: str, + *args, **kwargs, ) -> None: - pass + super().__init__(name=name, description=description, *args, **kwargs) + + def get_task_information(self): + task_desc = f"""name: {self.name} + description: {self.description} + """ + return task_desc @staticmethod def from_dict(dict): diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml new file mode 100644 index 000000000..05493a918 --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -0,0 +1,32 @@ +ensemble_coder: + system: |- + You are a Python data scientist working on model ensemble implementation. Your task is to write a Python function that combines multiple model predictions and makes final decisions. + You should follow the provided specifications to complete this task. + + -----------Competition Information----------- + {{ competition_info }} + + Please respond with the code in the following json format: + { + "code": "The Python code as a string." + } + user: |- + Please implement an ensemble function with the following specification: + -----------Ensemble Specification----------- + {{ ensemble_spec }} + +ensemble_eval: + system: |- + You are a data scientist evaluating an ensemble implementation. + You are testing the ensemble with the following code: ```python + {{test_code}} ``` + You'll be given the stdout of your testing scripts. 
+ Please respond with your feedback in the following JSON format: + { + "execution": "Describe how well the ensemble executed, including any errors or issues encountered.", + "return_checking": "Detail the checks performed on the ensemble results, including shape and value validation.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "final_decision": + } + user: |- + {{stdout}} diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py new file mode 100644 index 000000000..dc3c79e2f --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -0,0 +1,52 @@ +""" +Helper functions for testing the ensemble coder(CoSTEER-based) component. +""" +import sys +from pathlib import Path + +from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.experiment.experiment import EnsembleExperiment + +# Add the competition folder to path +COMPETITION_PATH = Path(__file__).parent.parent.parent.parent.parent / "scenarios" / "kaggle" / "tpl_ex" / "aerial-cactus-identification" +sys.path.append(str(COMPETITION_PATH)) + + +def load_ensemble_spec(): + spec_path = COMPETITION_PATH / "spec" / "ensemble.md" + with open(spec_path, 'r') as f: + return f.read() + + +def develop_ensemble(): + # Initialize scenario and coder + scen = DataScienceScen(competition="aerial-cactus-identification") + ensemble_coder = EnsembleCoSTEER(scen) + # Load ensemble specification + ensemble_spec = load_ensemble_spec() + + # Create the ensemble task with actual data context and specification + task = EnsembleTask( + name="EnsembleTask", + description= + """ + Implement ensemble and decision making for model predictions. 
+ """ + ) + + exp = EnsembleExperiment( + sub_tasks=[task] + ) + + # Injecting the corresponding specification + exp.experiment_workspace.inject_code(**{"spec/ensemble.md": ensemble_spec}) + + # Develop the experiment + exp = ensemble_coder.develop(exp) + return exp + + +if __name__ == "__main__": + develop_ensemble() From 62dbcf5bb3d68ed5e88b15c81c9c917d5e08123e Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 18 Dec 2024 11:54:30 +0000 Subject: [PATCH 086/304] data science loop changes --- rdagent/app/data_science/loop.py | 78 ++++++++++--------- .../coder/data_science/feature/eval.py | 16 ++-- .../coder/data_science/feature/test.py | 4 +- .../coder/data_science/model/eval.py | 13 +--- .../coder/data_science/model/test.py | 4 +- .../data_science/raw_data_loader/eval.py | 20 ++--- .../data_science/raw_data_loader/test.py | 4 +- .../coder/data_science/workflow/test.py | 4 +- rdagent/core/experiment.py | 2 +- rdagent/scenarios/data_science/dev/runner.py | 30 +++++++ .../data_science/experiment/experiment.py | 53 +------------ .../data_science/proposal/exp_gen.py | 30 +++---- 12 files changed, 109 insertions(+), 149 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 81d44464c..2b93ad1e7 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -2,9 +2,14 @@ from typing import Any, Literal import fire +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER +from rdagent.components.coder.data_science.feature import FeatureCoSTEER +from rdagent.components.coder.data_science.model import ModelCoSTEER +from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER +from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import NextLoopException, RDLoop from rdagent.core.exception import FactorEmptyError, ModelEmptyError @@ -18,15 +23,16 @@ from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time -from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook -from rdagent.scenarios.kaggle.kaggle_crawler import download_data - +from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace, DSExpGen +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) - @measure_time def __init__(self, PROP_SETTING: BasePropSetting): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) logger.log_object(scen, tag="scenario") @@ -43,22 +49,13 @@ def __init__(self, PROP_SETTING: BasePropSetting): # self.scratch_gen: tuple[HypothesisGen, Hypothesis2Experiment] = DummyHypothesisGen(scen), # 2) task generation from a complete solution - self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) - self.data_loader_coder: DataLoaderCoSTEER = 
import_class(PROP_SETTING.data_loader_coder)(scen) - # self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) - # logger.log_object(self.hypothesis_gen, tag="hypothesis generator") - # self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() - # logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment") - - # TODO: we need more coder - # self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen) - # logger.log_object(self.feature_coder, tag="feature coder") - # self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)( - # scen - # ) - # logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder") - # self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) - # logger.log_object(self.model_coder, tag="model coder") + # self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) + self.exp_gen = DSExpGen(scen) + self.data_loader_coder = DataLoaderCoSTEER(scen) + self.feature_coder = FeatureCoSTEER(scen) + self.model_coder = ModelCoSTEER(scen) + self.ensemble_coder = EnsembleCoSTEER(scen) + self.workflow_coder = WorkflowCoSTEER(scen) # TODO: now we only need on runner # self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) @@ -70,31 +67,31 @@ def __init__(self, PROP_SETTING: BasePropSetting): # logger.log_object(self.summarizer, tag="summarizer") # self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) - self.trace = Trace(scen=scen) + self.trace = DSTrace(scen=scen) super(RDLoop, self).__init__() - @measure_time def direct_exp_gen(self, prev_out: dict[str, Any]): exp = self.exp_gen.gen(self.trace) - hypo = exp.hypothesis - return {"propose": hypo, "exp_gen": exp} + return exp - @measure_time def coding(self, prev_out: dict[str, Any]): - exp = self.data_loader_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # if prev_out["direct_exp_gen"]["propose"].action in [ - # KG_ACTION_FEATURE_ENGINEERING, - # KG_ACTION_FEATURE_PROCESSING, - # ]: - # exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: - # exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # else: - # exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) - # logger.log_object(exp.sub_workspace_list, tag="coder result") + exp: DSExperiment = prev_out["direct_exp_gen"] + exp_task = exp.sub_tasks[0] + if isinstance(exp_task, DataLoaderTask): + exp = self.data_loader_coder.develop(exp) + elif isinstance(exp_task, FeatureTask): + exp = self.feature_coder.develop(exp) + elif isinstance(exp_task, ModelTask): + exp = self.model_coder.develop(exp) + elif isinstance(exp_task, EnsembleTask): + exp = self.ensemble_coder.develop(exp) + elif isinstance(exp_task, WorkflowTask): + exp = self.workflow_coder.develop(exp) + else: + raise NotImplementedError(f"Unsupported task type in DataScienceRDLoop: {exp_task}") + return exp - @measure_time def running(self, prev_out: dict[str, Any]): if not self.exp_gen.is_complete(): raise NextLoopException() @@ -163,3 +160,8 @@ def main(path=None, step_n=None, competition=None): if __name__ == "__main__": fire.Fire(main) + + + + + diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index e9d922367..bff592a31 100644 --- 
a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -1,14 +1,10 @@ import json -from dataclasses import dataclass -from os import system from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, - CoSTEERMultiFeedback, CoSTEERSingleFeedback, - CoSTEERSingleFeedbackDeprecated, ) from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import FBWorkspace, Task @@ -30,7 +26,7 @@ def evaluate( gt_implementation: FBWorkspace, queried_knowledge: QueriedKnowledge = None, **kwargs, - ) -> CoSTEERSingleFeedbackDeprecated: + ) -> FeatureEvalFeedback: target_task_information = target_task.get_task_information() if ( @@ -39,12 +35,10 @@ def evaluate( ): return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: - return CoSTEERSingleFeedbackDeprecated( - execution_feedback="This task has failed too many times, skip implementation.", - shape_feedback="This task has failed too many times, skip implementation.", - value_feedback="This task has failed too many times, skip implementation.", - code_feedback="This task has failed too many times, skip implementation.", - final_feedback="This task has failed too many times, skip implementation.", + return FeatureEvalFeedback( + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", final_decision=False, ) diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 41b308468..801addc46 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -8,7 +8,7 @@ from rdagent.components.coder.data_science.feature import FeatureCoSTEER from rdagent.components.coder.data_science.feature.exp import FeatureTask -from rdagent.scenarios.data_science.experiment.experiment import FeatureExperiment +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen @@ -21,7 +21,7 @@ def develop_one_competition(competition: str): # -> experiment # Create the experiment ft = FeatureTask(name="FeatureTask", description=scen.competition_descriptions, spec=feat_spec) - exp = FeatureExperiment( + exp = DSExperiment( sub_tasks=[ft], ) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 68304cec2..d05c5b505 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -9,9 +9,7 @@ from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, - CoSTEERMultiFeedback, CoSTEERSingleFeedback, - CoSTEERSingleFeedbackDeprecated, ) from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import FBWorkspace, Task @@ -23,7 +21,6 @@ ModelSingleFeedback = CoSTEERSingleFeedback -ModelMultiFeedback = CoSTEERMultiFeedback # Below are unit tests for testing the specification of the implemented model ------------------ @@ -44,7 +41,7 @@ def evaluate( 
gt_implementation: FBWorkspace, queried_knowledge: QueriedKnowledge = None, **kwargs, - ) -> CoSTEERSingleFeedbackDeprecated: + ) -> ModelSingleFeedback: target_task_information = target_task.get_task_information() if ( queried_knowledge is not None @@ -53,11 +50,9 @@ def evaluate( return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: return ModelSingleFeedback( - execution_feedback="This task has failed too many times, skip implementation.", - shape_feedback="This task has failed too many times, skip implementation.", - value_feedback="This task has failed too many times, skip implementation.", - code_feedback="This task has failed too many times, skip implementation.", - final_feedback="This task has failed too many times, skip implementation.", + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", final_decision=False, ) # assert isinstance(target_task, ModelTask) diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 2cdbace0f..f3ced2bf4 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -14,7 +14,7 @@ ) from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.core.experiment import FBWorkspace -from rdagent.scenarios.data_science.experiment.experiment import ModelExperiment +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen @@ -42,7 +42,7 @@ def develop_one_competition(competition: str): modelexp.inject_code(**{file_name: file_path.read_text()}) mt.base_code += modelexp.code_dict["model01.py"] - exp = ModelExperiment( + exp = DSExperiment( sub_tasks=[mt], ) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index f352b4774..aedf4ab8b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -1,19 +1,15 @@ # tess successfully running. # (GPT) if it aligns with the spec & rationality of the spec. 
import json -from dataclasses import dataclass -from os import system from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, - CoSTEERMultiFeedback, CoSTEERSingleFeedback, - CoSTEERSingleFeedbackDeprecated, ) +from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledgeV2 from rdagent.core.evaluation import Feedback -from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import FBWorkspace, Task, Workspace from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T @@ -31,9 +27,9 @@ def evaluate( target_task: Task, implementation: FBWorkspace, gt_implementation: FBWorkspace, - queried_knowledge: QueriedKnowledge = None, + queried_knowledge: CoSTEERQueriedKnowledgeV2 = None, **kwargs, - ) -> CoSTEERSingleFeedbackDeprecated: + ) -> DataLoaderEvalFeedback: target_task_information = target_task.get_task_information() if ( @@ -42,12 +38,10 @@ def evaluate( ): return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: - return CoSTEERSingleFeedbackDeprecated( - execution_feedback="This task has failed too many times, skip implementation.", - shape_feedback="This task has failed too many times, skip implementation.", - value_feedback="This task has failed too many times, skip implementation.", - code_feedback="This task has failed too many times, skip implementation.", - final_feedback="This task has failed too many times, skip implementation.", + return DataLoaderEvalFeedback( + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", final_decision=False, ) diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py index cf705ce78..5aacc8b8c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -8,7 +8,7 @@ from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask -from rdagent.scenarios.data_science.experiment.experiment import DataLoaderExperiment +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen @@ -18,7 +18,7 @@ def develop_one_competition(competition: str): # -> experiment # Create the experiment dlt = DataLoaderTask(name="DataLoaderTask", description="") - exp = DataLoaderExperiment( + exp = DSExperiment( sub_tasks=[dlt], ) diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index d5d08388f..d915ac111 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -14,7 +14,7 @@ ) from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import FBWorkspace -from rdagent.scenarios.data_science.experiment.experiment import WorkflowExperiment +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.scen 
import DataScienceScen def develop_one_competition(competition: str): @@ -40,7 +40,7 @@ def develop_one_competition(competition: str): wt.spec += workflowexp.code_dict["spec/model.md"] wt.base_code += workflowexp.code_dict["model01.py"] - exp = WorkflowExperiment( + exp = DSExperiment( sub_tasks=[wt], ) diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 64ef4ded9..bd40c2407 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -231,7 +231,7 @@ def __init__( self.based_experiments: Sequence[ASpecificWSForExperiment] = based_experiments self.result: object = None # The result of the experiment, can be different types in different scenarios. - self.sub_results: dict[str, float] = {} + self.sub_results: dict[str, float] = {} # TODO: in Kaggle, now sub results are all saved in self.result, remove this in the future. self.experiment_workspace: ASpecificWSForExperiment | None = None diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py index e69de29bb..e55ea0c31 100644 --- a/rdagent/scenarios/data_science/dev/runner.py +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -0,0 +1,30 @@ +from rdagent.core.developer import Developer +import pandas as pd +from rdagent.core.exception import CoderError +from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.log import rdagent_logger as logger +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment + +class DSRunner(Developer[DSExperiment]): + def develop(self, exp: DSExperiment) -> DSExperiment: + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + + de = DockerEnv(conf=ds_docker_conf) + + # execute workflow + exp.experiment_workspace.execute(env=de, entry="python main.py") + submission_fp = exp.experiment_workspace.workspace_path / "submission.csv" + score_fp = exp.experiment_workspace.workspace_path / "scores.csv" + + if not submission_fp.exists(): + logger.error("Submission file (submission.csv) is not generated.") + raise CoderError("Submission file (submission.csv) is not generated.") + + if not score_fp.exists(): + logger.error("Metrics file (scores.csv) is not generated.") + raise CoderError("Metrics file (scores.csv) is not generated.") + + exp.result = pd.read_csv(score_fp, index_col=0) + return exp \ No newline at end of file diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 95bbd038c..9a38ee49a 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,55 +1,6 @@ -from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask -from rdagent.components.coder.data_science.feature.exp import FeatureTask -from rdagent.components.coder.data_science.model.exp import ModelTask -from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask -from rdagent.components.coder.data_science.workflow.exp import WorkflowTask -from rdagent.core.experiment import Experiment, FBWorkspace +from rdagent.core.experiment import Experiment, FBWorkspace, Task -# KG_MODEL_TYPE_XGBOOST = "XGBoost" -# KG_MODEL_TYPE_RANDOMFOREST = "RandomForest" -# KG_MODEL_TYPE_LIGHTGBM = "LightGBM" -# KG_MODEL_TYPE_NN = "NN" - -# KG_MODEL_MAPPING = { -# KG_MODEL_TYPE_XGBOOST: "model/model_xgboost.py", -# 
KG_MODEL_TYPE_RANDOMFOREST: "model/model_randomforest.py", -# KG_MODEL_TYPE_LIGHTGBM: "model/model_lightgbm.py", -# KG_MODEL_TYPE_NN: "model/model_nn.py", -# } - -# KG_SELECT_MAPPING = { -# KG_MODEL_TYPE_XGBOOST: "model/select_xgboost.py", -# KG_MODEL_TYPE_RANDOMFOREST: "model/select_randomforest.py", -# KG_MODEL_TYPE_LIGHTGBM: "model/select_lightgbm.py", -# KG_MODEL_TYPE_NN: "model/select_nn.py", -# } - - -class DataLoaderExperiment(Experiment[DataLoaderTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.experiment_workspace = FBWorkspace() - - -class ModelExperiment(Experiment[ModelTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, **kwargs) -> None: # TODO: use previeous step workspace - super().__init__(*args, **kwargs) - self.experiment_workspace = FBWorkspace() - - -class FeatureExperiment(Experiment[FeatureTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.experiment_workspace = FBWorkspace() - - -class EnsembleExperiment(Experiment[EnsembleTask, FBWorkspace, FBWorkspace]): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.experiment_workspace = FBWorkspace() - - -class WorkflowExperiment(Experiment[WorkflowTask, FBWorkspace, FBWorkspace]): +class DSExperiment(Experiment[Task, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 488ea10d2..d8c90ac07 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -10,13 +10,7 @@ from rdagent.core.knowledge_base import KnowledgeBase from rdagent.core.proposal import ExpGen, Hypothesis, HypothesisFeedback, Trace from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.data_science.experiment.experiment import ( - DataLoaderExperiment, - EnsembleExperiment, - FeatureExperiment, - ModelExperiment, - WorkflowExperiment, -) +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen from rdagent.utils.agent.tpl import T @@ -100,7 +94,7 @@ def llm_task_gen( return resp_dict - def gen(self, trace: DSTrace) -> Experiment: + def gen(self, trace: DSTrace) -> DSExperiment: successful_components = set() for h, _, hf in trace.hist: if hf.decision: @@ -159,7 +153,7 @@ def is_complete(): ), ) - return DataLoaderExperiment(sub_tasks=[dt], hypothesis=hypothesis) + return DSExperiment(sub_tasks=[dt], hypothesis=hypothesis) elif hypothesis.component == "FeatureEng": # TODO: RAG resp_dict = self.llm_task_gen( @@ -180,7 +174,7 @@ def is_complete(): variables=resp_dict[fn].get("variables", "Variables not provided"), ) - exp = FeatureExperiment(sub_tasks=tasks, hypothesis=hypothesis) + exp = DSExperiment(sub_tasks=tasks, hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Model": @@ -201,7 +195,7 @@ def is_complete(): base_code="", ) - exp = ModelExperiment(sub_tasks=[mt], hypothesis=hypothesis) + exp = DSExperiment(sub_tasks=[mt], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif 
hypothesis.component == "Ensemble": @@ -219,7 +213,7 @@ def is_complete(): description=resp_dict.get("description", "Ensemble description not provided"), ) - exp = EnsembleExperiment(sub_tasks=[et], hypothesis=hypothesis) + exp = DSExperiment(sub_tasks=[et], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Workflow": @@ -237,7 +231,7 @@ def is_complete(): description=resp_dict.get("description", "Workflow description not provided"), ) - exp = WorkflowExperiment(sub_tasks=[wt], hypothesis=hypothesis) + exp = DSExperiment(sub_tasks=[wt], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp else: @@ -258,7 +252,7 @@ def is_complete(): ), ) - exp = DataLoaderExperiment(sub_tasks=[dt]) + exp = DSExperiment(sub_tasks=[dt]) return exp elif o == "FeatureEng": resp_dict = self.llm_task_gen( @@ -276,7 +270,7 @@ def is_complete(): variables=resp_dict[fn].get("variables", "Variables not provided"), ) tasks.append(ft) - exp = FeatureExperiment(sub_tasks=tasks) + exp = DSExperiment(sub_tasks=tasks) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Model": @@ -298,7 +292,7 @@ def is_complete(): hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), base_code=base_code, ) - exp = ModelExperiment(sub_tasks=[mt]) + exp = DSExperiment(sub_tasks=[mt]) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Ensemble": @@ -312,7 +306,7 @@ def is_complete(): name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), ) - exp = EnsembleExperiment(sub_tasks=[et]) + exp = DSExperiment(sub_tasks=[et]) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Workflow": @@ -326,7 +320,7 @@ def is_complete(): name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), ) - exp = WorkflowExperiment(sub_tasks=[wt]) + exp = DSExperiment(sub_tasks=[wt]) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp From 13fae9a380e3e7b8816e43af93f5de3a9669fb46 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 18 Dec 2024 14:04:07 +0000 Subject: [PATCH 087/304] data science loop base --- rdagent/app/data_science/loop.py | 66 +++++-------------- .../scenarios/data_science/dev/feedback.py | 26 ++++++++ .../data_science/proposal/exp_gen.py | 31 +++++---- 3 files changed, 63 insertions(+), 60 deletions(-) create mode 100644 rdagent/scenarios/data_science/dev/feedback.py diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 2b93ad1e7..1d472c568 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -24,12 +24,14 @@ from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace, DSExpGen +from rdagent.scenarios.kaggle.kaggle_crawler import download_data from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask from 
rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask - +from rdagent.scenarios.data_science.dev.runner import DSRunner +from rdagent.scenarios.data_science.dev.feedback import DSExperiment2Feedback class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) @@ -57,18 +59,14 @@ def __init__(self, PROP_SETTING: BasePropSetting): self.ensemble_coder = EnsembleCoSTEER(scen) self.workflow_coder = WorkflowCoSTEER(scen) - # TODO: now we only need on runner - # self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) - # logger.log_object(self.feature_runner, tag="feature runner") - # self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) - # logger.log_object(self.model_runner, tag="model runner") - + self.runner = DSRunner(scen) # self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) # logger.log_object(self.summarizer, tag="summarizer") # self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) self.trace = DSTrace(scen=scen) - super(RDLoop, self).__init__() + self.summarizer = DSExperiment2Feedback(scen) + # super(RDLoop, self).__init__() def direct_exp_gen(self, prev_out: dict[str, Any]): exp = self.exp_gen.gen(self.trace) @@ -93,48 +91,20 @@ def coding(self, prev_out: dict[str, Any]): return exp def running(self, prev_out: dict[str, Any]): - if not self.exp_gen.is_complete(): - raise NextLoopException() - - if prev_out["direct_exp_gen"]["propose"].action in [ - KG_ACTION_FEATURE_ENGINEERING, - KG_ACTION_FEATURE_PROCESSING, - ]: - exp = self.feature_runner.develop(prev_out["coding"]) - else: - exp = self.model_runner.develop(prev_out["coding"]) - logger.log_object(exp, tag="runner result") - if DS_RD_SETTING.competition in [ - "optiver-realized-volatility-prediction", - "covid19-global-forecasting-week-1", - ]: - try: - python_files_to_notebook(DS_RD_SETTING.competition, exp.experiment_workspace.workspace_path) - except Exception as e: - logger.error(f"Merge python files to one file failed: {e}") - if DS_RD_SETTING.auto_submit: - csv_path = exp.experiment_workspace.workspace_path / "submission.csv" - try: - subprocess.run( - [ - "kaggle", - "competitions", - "submit", - "-f", - str(csv_path.absolute()), - "-m", - str(csv_path.parent.absolute()), - DS_RD_SETTING.competition, - ], - check=True, - ) - except subprocess.CalledProcessError as e: - logger.error(f"Auto submission failed: \n{e}") - except Exception as e: - logger.error(f"Other exception when use kaggle api:\n{e}") - + if not self.trace.all_components_completed(): + raise NextLoopException("Not all 5 components are completed, skip running of DataScienceRDLoop.") + exp = self.runner.develop(prev_out["coding"]) return exp + def feedback(self, prev_out: dict[str, Any]): + if not self.trace.all_components_completed(): + raise NextLoopException("Not all 5 components are completed, skip feedback of DataScienceRDLoop.") + + feedback = self.summarizer.generate_feedback( + prev_out["running"], prev_out["direct_exp_gen"].hypothesis, self.trace + ) + self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) + def main(path=None, step_n=None, competition=None): """ diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py new file mode 100644 index 000000000..fb3910ce7 --- /dev/null +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -0,0 +1,26 @@ +import json 
+from pathlib import Path + +import pandas as pd + +from rdagent.components.knowledge_management.graph import UndirectedNode +from rdagent.core.experiment import Experiment +from rdagent.core.prompts import Prompts +from rdagent.core.proposal import ( + Experiment2Feedback, + Hypothesis, + HypothesisFeedback, + Trace, +) +from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KG_SELECT_MAPPING +from rdagent.utils import convert2bool +from rdagent.utils.agent.tpl import T +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment + +from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace + +class DSExperiment2Feedback(Experiment2Feedback): + def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeedback: + return super().generate_feedback(exp, trace) \ No newline at end of file diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index d8c90ac07..bd468bcd2 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -61,7 +61,23 @@ def get_sota_hypothesis_and_experiment( continue return h, exp return None, None - + + @property + def successful_components(self) -> set[COMPONENT]: + """ + Get successful components. + """ + successful_components = set() + for h, _, hf in self.hist: + if hf.decision: + successful_components.add(h.component) + return successful_components + + def all_components_completed(self) -> bool: + """ + Check if 5 successful components are completed. + """ + return set(ORDER) == self.successful_components class DSExpGen(ExpGen): """Data Science Task Generator.""" @@ -95,17 +111,8 @@ def llm_task_gen( return resp_dict def gen(self, trace: DSTrace) -> DSExperiment: - successful_components = set() - for h, _, hf in trace.hist: - if hf.decision: - successful_components.add(h.component) - - def is_complete(): - """is all components complete""" - return set(ORDER) == successful_components - scenario_desc = trace.scen.get_scenario_all_desc() - if is_complete(): + if trace.all_components_completed(): # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) @@ -236,7 +243,7 @@ def is_complete(): return exp else: for o in ORDER: - if o in successful_components: + if o in trace.successful_components: # we already have the component, then skip continue elif o == "DataLoadSpec": From 999d13351c1e19387963004828993b5f24f473dd Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 19 Dec 2024 02:56:37 +0000 Subject: [PATCH 088/304] ds loop feedback --- .../scenarios/data_science/dev/feedback.py | 67 ++++++++++-- .../scenarios/data_science/dev/prompts.yaml | 55 ++++++++++ .../data_science/proposal/exp_gen.py | 2 +- .../scenarios/kaggle/developer/feedback.py | 102 +++++++++--------- 4 files changed, 168 insertions(+), 58 deletions(-) create mode 100644 rdagent/scenarios/data_science/dev/prompts.yaml diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index fb3910ce7..ab9ca03e1 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -1,20 +1,15 @@ import json from pathlib import Path -import pandas as pd - from rdagent.components.knowledge_management.graph import UndirectedNode from rdagent.core.experiment import Experiment from rdagent.core.prompts import 
Prompts from rdagent.core.proposal import ( Experiment2Feedback, - Hypothesis, HypothesisFeedback, - Trace, ) from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KG_SELECT_MAPPING from rdagent.utils import convert2bool from rdagent.utils.agent.tpl import T from rdagent.scenarios.data_science.experiment.experiment import DSExperiment @@ -23,4 +18,64 @@ class DSExperiment2Feedback(Experiment2Feedback): def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeedback: - return super().generate_feedback(exp, trace) \ No newline at end of file + hypothesis = exp.hypothesis + current_results = exp.result + if hypothesis.component == "DataLoadSpec": + modified_file_name = "load_data.py" + elif hypothesis.component == "FeatureEng": + modified_file_name = "feature.py" + elif hypothesis.component == "Model": + modified_file_name = "model.py" + elif hypothesis.component == "Ensemble": + modified_file_name = "ensemble.py" + elif hypothesis.component == "Workflow": + modified_file_name = "main.py" + modified_code = exp.experiment_workspace.code_dict[modified_file_name] + + sota_hypothesis, sota_exp = trace.get_sota_hypothesis_and_experiment() + + if sota_exp: + sota_codes = { + "load_data.py": (sota_exp.experiment_workspace.workspace_path / "load_data.py").read_text(), + "feature.py": (sota_exp.experiment_workspace.workspace_path / "feature.py").read_text(), + "model.py": (sota_exp.experiment_workspace.workspace_path / "model.py").read_text(), + "ensemble.py": (sota_exp.experiment_workspace.workspace_path / "ensemble.py").read_text(), + "main.py": (sota_exp.experiment_workspace.workspace_path / "main.py").read_text(), + } + sota_results = sota_exp.result + else: + sota_codes = None + sota_results = None + + + last_hypothesis_and_feedback = None + if trace.hist and len(trace.hist) > 0: + last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) + + system_prompt = T(".prompts:exp_feedback.system").r( + scenario=self.scen.get_scenario_all_desc() + ) + user_prompt = T(".prompts:exp_feedback.user").r( + sota_codes=sota_codes, + sota_results=sota_results, + hypothesis=str(hypothesis), + modified_code=modified_code, + current_results=current_results, + last_hypothesis_and_feedback=last_hypothesis_and_feedback, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True, + ) + ) + + return HypothesisFeedback( + observations=resp_dict.get("Observations", "No observations provided"), + hypothesis_evaluation=resp_dict.get("Feedback for Hypothesis", "No feedback provided"), + new_hypothesis=resp_dict.get("New Hypothesis", "No new hypothesis provided"), + reason=resp_dict.get("Reasoning", "No reasoning provided"), + decision=convert2bool(resp_dict.get("Replace Best Result", "no")), + ) \ No newline at end of file diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml new file mode 100644 index 000000000..dffaf5a76 --- /dev/null +++ b/rdagent/scenarios/data_science/dev/prompts.yaml @@ -0,0 +1,55 @@ +exp_feedback: + system: |- + You are an advanced assistant for analyzing results in data-driven R&D. + The task is described in the following scenario: + {{ scenario }} + + You will analyze the current experiment's hypothesis, code, results, and compare them with previous experiments and the best past result. 
+ Your feedback should: + 1. Confirm if the current result supports or refutes the hypothesis. + 2. Compare with previous best results. + 3. Suggest improvements or new directions. Stay innovative and adapative. + + Please provide detailed and constructive feedback. + Example JSON Structure for Result Analysis: + { + "Observations": "Your overall observations here", + "Feedback for Hypothesis": "Observations related to the hypothesis", + "New Hypothesis": "Your new hypothesis here", + "Reasoning": "Reasoning for the new hypothesis", + "Replace Best Result": "yes or no" + } + + user: |- + We are in a process of finding and validating hypotheses to build powerful codes. Each round aims to confirm or reject hypotheses based on results. + + {% if sota_codes %} + {% for name, code in sota_codes.items() %} + The SOTA code for {{ name }} is: + ```python + {{ code }} + ``` + {% endfor %} + The SOTA results is: + {{ sota_results }} + {% endif %} + + Current solution to be evaluated: + Hypothesis: {{ hypothesis }} + Modified code according to hypothesis: + ```python + {{ modified_code }} + ``` + Final results of the current solution: {{ current_results }} + + {% if last_hypothesis_and_feedback %} + The user has made some hypothesis and conducted experiments to validate them, and the results are as follows: + hypothesis: {{ last_hypothesis_and_feedback[0].hypothesis }} + feedback decision: {{ last_hypothesis_and_feedback[1].decision }} + reason: {{ last_hypothesis_and_feedback[1].reason }} + {% endif %} + Please refer to these hypothesis and feedback to help you recommend new hypothesis + + Consider Changing Direction for Significant Gaps with the Best Result and the last round: + - If the new results significantly differ from SOTA, consider a new direction. + - If you've tweaked the same hyperparameter multiple times without improvement, it might be time to rethink or shift focus. 
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index bd468bcd2..ab388ee09 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -48,7 +48,7 @@ def __str__(self) -> str: class DSTrace(Trace[DataScienceScen, KnowledgeBase]): def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = None) -> None: self.scen: DataScienceScen = scen - self.hist: list[tuple[DSHypothesis, Experiment, HypothesisFeedback]] = [] + self.hist: list[tuple[DSHypothesis, DSExperiment, HypothesisFeedback]] = [] self.knowledge_base = knowledge_base def get_sota_hypothesis_and_experiment( diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index d3dcee949..e5cc06bf6 100644 --- a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -103,23 +103,23 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback current_hypothesis_reason = hypothesis.reason current_target_action = hypothesis.action current_sub_exps_to_code = {} - if hypothesis.action == "Model tuning": - current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code - elif hypothesis.action == "Model feature selection": - current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[ - KG_SELECT_MAPPING[exp.sub_tasks[0].model_type] - ] - else: - current_sub_exps_to_code = { - sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list - } + # if hypothesis.action == "Model tuning": + # current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code + # elif hypothesis.action == "Model feature selection": + # current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[ + # KG_SELECT_MAPPING[exp.sub_tasks[0].model_type] + # ] + # else: + # current_sub_exps_to_code = { + # sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list + # } current_sub_exps_to_code_str = json.dumps(current_sub_exps_to_code, indent=2) current_result = exp.result current_sub_results = exp.sub_results - last_hypothesis_and_feedback = None - if trace.hist and len(trace.hist) > 0: - last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) + # last_hypothesis_and_feedback = None + # if trace.hist and len(trace.hist) > 0: + # last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) # Prepare render dictionary render_dict = { @@ -157,44 +157,44 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided") reason = response_json.get("Reasoning", "No reasoning provided") decision = convert2bool(response_json.get("Replace Best Result", "no")) - leaderboard = self.scen.leaderboard - current_score = current_result.iloc[0] - sorted_scores = sorted(leaderboard, reverse=True) - import bisect - - if self.scen.evaluation_metric_direction: - insert_position = bisect.bisect_right([-score for score in sorted_scores], -current_score) - else: - insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores)) - percentile_ranking = (insert_position) / (len(sorted_scores)) * 100 - - experiment_feedback = { - "hypothesis_text": current_hypothesis, - 
"tasks_factors": current_sub_exps_to_code, - "current_result": current_result, - } - - if self.scen.if_using_vector_rag: - raise NotImplementedError("Vector RAG is not implemented yet since there are plenty bugs!") - self.scen.vector_base.add_experience_to_vector_base(experiment_feedback) - self.scen.vector_base.dump() - elif self.scen.if_using_graph_rag: - competition_node = UndirectedNode(content=self.scen.get_competition_full_desc(), label="competition") - hypothesis_node = UndirectedNode(content=hypothesis.hypothesis, label=hypothesis.action) - exp_code_nodes = [] - for exp, code in current_sub_exps_to_code.items(): - exp_code_nodes.append(UndirectedNode(content=exp, label="experiments")) - if code != "": - exp_code_nodes.append(UndirectedNode(content=code, label="code")) - conclusion_node = UndirectedNode(content=response, label="conclusion") - all_nodes = [competition_node, hypothesis_node, *exp_code_nodes, conclusion_node] - all_nodes = trace.knowledge_base.batch_embedding(all_nodes) - for node in all_nodes: - if node is not competition_node: - trace.knowledge_base.add_node(node, competition_node) - - if self.scen.if_action_choosing_based_on_UCB: - self.scen.action_counts[hypothesis.action] += 1 + # leaderboard = self.scen.leaderboard + # current_score = current_result.iloc[0] + # sorted_scores = sorted(leaderboard, reverse=True) + # import bisect + + # if self.scen.evaluation_metric_direction: + # insert_position = bisect.bisect_right([-score for score in sorted_scores], -current_score) + # else: + # insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores)) + # percentile_ranking = (insert_position) / (len(sorted_scores)) * 100 + + # experiment_feedback = { + # "hypothesis_text": current_hypothesis, + # "tasks_factors": current_sub_exps_to_code, + # "current_result": current_result, + # } + + # if self.scen.if_using_vector_rag: + # raise NotImplementedError("Vector RAG is not implemented yet since there are plenty bugs!") + # self.scen.vector_base.add_experience_to_vector_base(experiment_feedback) + # self.scen.vector_base.dump() + # elif self.scen.if_using_graph_rag: + # competition_node = UndirectedNode(content=self.scen.get_competition_full_desc(), label="competition") + # hypothesis_node = UndirectedNode(content=hypothesis.hypothesis, label=hypothesis.action) + # exp_code_nodes = [] + # for exp, code in current_sub_exps_to_code.items(): + # exp_code_nodes.append(UndirectedNode(content=exp, label="experiments")) + # if code != "": + # exp_code_nodes.append(UndirectedNode(content=code, label="code")) + # conclusion_node = UndirectedNode(content=response, label="conclusion") + # all_nodes = [competition_node, hypothesis_node, *exp_code_nodes, conclusion_node] + # all_nodes = trace.knowledge_base.batch_embedding(all_nodes) + # for node in all_nodes: + # if node is not competition_node: + # trace.knowledge_base.add_node(node, competition_node) + + # if self.scen.if_action_choosing_based_on_UCB: + # self.scen.action_counts[hypothesis.action] += 1 return HypothesisFeedback( observations=observations, From b6241cde95033d060e37419568df9b755f25776d Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 19 Dec 2024 02:58:43 +0000 Subject: [PATCH 089/304] fix --- .../scenarios/kaggle/developer/feedback.py | 80 +++++++++---------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index e5cc06bf6..7346cff8e 100644 --- 
a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -103,23 +103,23 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback current_hypothesis_reason = hypothesis.reason current_target_action = hypothesis.action current_sub_exps_to_code = {} - # if hypothesis.action == "Model tuning": - # current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code - # elif hypothesis.action == "Model feature selection": - # current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[ - # KG_SELECT_MAPPING[exp.sub_tasks[0].model_type] - # ] - # else: - # current_sub_exps_to_code = { - # sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list - # } + if hypothesis.action == "Model tuning": + current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code + elif hypothesis.action == "Model feature selection": + current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[ + KG_SELECT_MAPPING[exp.sub_tasks[0].model_type] + ] + else: + current_sub_exps_to_code = { + sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list + } current_sub_exps_to_code_str = json.dumps(current_sub_exps_to_code, indent=2) current_result = exp.result current_sub_results = exp.sub_results - # last_hypothesis_and_feedback = None - # if trace.hist and len(trace.hist) > 0: - # last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) + last_hypothesis_and_feedback = None + if trace.hist and len(trace.hist) > 0: + last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) # Prepare render dictionary render_dict = { @@ -168,33 +168,33 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback # insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores)) # percentile_ranking = (insert_position) / (len(sorted_scores)) * 100 - # experiment_feedback = { - # "hypothesis_text": current_hypothesis, - # "tasks_factors": current_sub_exps_to_code, - # "current_result": current_result, - # } - - # if self.scen.if_using_vector_rag: - # raise NotImplementedError("Vector RAG is not implemented yet since there are plenty bugs!") - # self.scen.vector_base.add_experience_to_vector_base(experiment_feedback) - # self.scen.vector_base.dump() - # elif self.scen.if_using_graph_rag: - # competition_node = UndirectedNode(content=self.scen.get_competition_full_desc(), label="competition") - # hypothesis_node = UndirectedNode(content=hypothesis.hypothesis, label=hypothesis.action) - # exp_code_nodes = [] - # for exp, code in current_sub_exps_to_code.items(): - # exp_code_nodes.append(UndirectedNode(content=exp, label="experiments")) - # if code != "": - # exp_code_nodes.append(UndirectedNode(content=code, label="code")) - # conclusion_node = UndirectedNode(content=response, label="conclusion") - # all_nodes = [competition_node, hypothesis_node, *exp_code_nodes, conclusion_node] - # all_nodes = trace.knowledge_base.batch_embedding(all_nodes) - # for node in all_nodes: - # if node is not competition_node: - # trace.knowledge_base.add_node(node, competition_node) - - # if self.scen.if_action_choosing_based_on_UCB: - # self.scen.action_counts[hypothesis.action] += 1 + experiment_feedback = { + "hypothesis_text": current_hypothesis, + "tasks_factors": current_sub_exps_to_code, + 
"current_result": current_result, + } + + if self.scen.if_using_vector_rag: + raise NotImplementedError("Vector RAG is not implemented yet since there are plenty bugs!") + self.scen.vector_base.add_experience_to_vector_base(experiment_feedback) + self.scen.vector_base.dump() + elif self.scen.if_using_graph_rag: + competition_node = UndirectedNode(content=self.scen.get_competition_full_desc(), label="competition") + hypothesis_node = UndirectedNode(content=hypothesis.hypothesis, label=hypothesis.action) + exp_code_nodes = [] + for exp, code in current_sub_exps_to_code.items(): + exp_code_nodes.append(UndirectedNode(content=exp, label="experiments")) + if code != "": + exp_code_nodes.append(UndirectedNode(content=code, label="code")) + conclusion_node = UndirectedNode(content=response, label="conclusion") + all_nodes = [competition_node, hypothesis_node, *exp_code_nodes, conclusion_node] + all_nodes = trace.knowledge_base.batch_embedding(all_nodes) + for node in all_nodes: + if node is not competition_node: + trace.knowledge_base.add_node(node, competition_node) + + if self.scen.if_action_choosing_based_on_UCB: + self.scen.action_counts[hypothesis.action] += 1 return HypothesisFeedback( observations=observations, From 7e2874f1bfb2c7ec7d7a4cea25ef6a5d94635ac6 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 19 Dec 2024 03:10:30 +0000 Subject: [PATCH 090/304] remove measure_time because it's duplicated (in LoopBase) --- rdagent/app/data_science/loop.py | 4 +--- rdagent/app/kaggle/loop.py | 4 ---- rdagent/app/qlib_rd_loop/factor.py | 2 -- .../app/qlib_rd_loop/factor_from_report.py | 5 ----- rdagent/components/workflow/rd_loop.py | 8 -------- rdagent/log/time.py | 19 ------------------- 6 files changed, 1 insertion(+), 41 deletions(-) delete mode 100644 rdagent/log/time.py diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 1d472c568..8d30ff38f 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -37,7 +37,6 @@ class DataScienceRDLoop(RDLoop): def __init__(self, PROP_SETTING: BasePropSetting): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) - logger.log_object(scen, tag="scenario") ### shared components in the workflow # TODO: check if knowledge_base = ( @@ -45,7 +44,6 @@ def __init__(self, PROP_SETTING: BasePropSetting): if PROP_SETTING.knowledge_base != "" else None ) - logger.log_object(knowledge_base, tag="knowledge_base") # 1) task generation from scratch # self.scratch_gen: tuple[HypothesisGen, Hypothesis2Experiment] = DummyHypothesisGen(scen), @@ -66,7 +64,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): # self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) self.trace = DSTrace(scen=scen) self.summarizer = DSExperiment2Feedback(scen) - # super(RDLoop, self).__init__() + super(RDLoop, self).__init__() def direct_exp_gen(self, prev_out: dict[str, Any]): exp = self.exp_gen.gen(self.trace) diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index c727425fa..8325938b0 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -16,7 +16,6 @@ from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time from rdagent.scenarios.kaggle.experiment.scenario import ( KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING, @@ -28,7 +27,6 @@ class KaggleRDLoop(RDLoop): - @measure_time def __init__(self, PROP_SETTING: 
BasePropSetting): with logger.tag("init"): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) @@ -60,7 +58,6 @@ def __init__(self, PROP_SETTING: BasePropSetting): self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) super(RDLoop, self).__init__() - @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop if prev_out["direct_exp_gen"]["propose"].action in [ @@ -75,7 +72,6 @@ def coding(self, prev_out: dict[str, Any]): logger.log_object(exp.sub_workspace_list, tag="coder result") return exp - @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback if prev_out["direct_exp_gen"]["propose"].action in [ diff --git a/rdagent/app/qlib_rd_loop/factor.py b/rdagent/app/qlib_rd_loop/factor.py index a27bf59ec..8379b0f08 100755 --- a/rdagent/app/qlib_rd_loop/factor.py +++ b/rdagent/app/qlib_rd_loop/factor.py @@ -10,13 +10,11 @@ from rdagent.components.workflow.rd_loop import RDLoop from rdagent.core.exception import FactorEmptyError from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time class FactorRDLoop(RDLoop): skip_loop_error = (FactorEmptyError,) - @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback exp = self.runner.develop(prev_out["coding"]) diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py index 2acab1012..d98e0b13a 100644 --- a/rdagent/app/qlib_rd_loop/factor_from_report.py +++ b/rdagent/app/qlib_rd_loop/factor_from_report.py @@ -14,7 +14,6 @@ from rdagent.core.prompts import Prompts from rdagent.core.proposal import Hypothesis from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorExperiment from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import ( @@ -102,7 +101,6 @@ def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[Qlib class FactorReportLoop(FactorRDLoop, metaclass=LoopMeta): - @measure_time def __init__(self, report_folder: str = None): super().__init__(PROP_SETTING=FACTOR_FROM_REPORT_PROP_SETTING) if report_folder is None: @@ -118,7 +116,6 @@ def __init__(self, report_folder: str = None): self.current_loop_exp = None self.steps = ["propose_hypo_exp", "propose", "exp_gen", "coding", "running", "feedback"] - @measure_time def propose_hypo_exp(self, prev_out: dict[str, Any]): with logger.tag("r"): while True: @@ -140,11 +137,9 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]): self.current_loop_exp = exp return None - @measure_time def propose(self, prev_out: dict[str, Any]): return self.current_loop_hypothesis - @measure_time def exp_gen(self, prev_out: dict[str, Any]): return self.current_loop_exp diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py index deb79093f..bde21e349 100644 --- a/rdagent/components/workflow/rd_loop.py +++ b/rdagent/components/workflow/rd_loop.py @@ -17,7 +17,6 @@ from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time from rdagent.utils.workflow import LoopBase, LoopMeta @@ -29,7 +28,6 @@ class NextLoopException(Exception): class RDLoop(LoopBase, metaclass=LoopMeta): - @measure_time def __init__(self, PROP_SETTING: BasePropSetting): with 
logger.tag("init"): scen: Scenario = import_class(PROP_SETTING.scen)() @@ -52,41 +50,35 @@ def __init__(self, PROP_SETTING: BasePropSetting): super().__init__() # excluded steps - @measure_time def _propose(self): hypothesis = self.hypothesis_gen.gen(self.trace) logger.log_object(hypothesis, tag="hypothesis generation") return hypothesis - @measure_time def _exp_gen(self, hypothesis: Hypothesis): exp = self.hypothesis2experiment.convert(hypothesis, self.trace) logger.log_object(exp.sub_tasks, tag="experiment generation") return exp # included steps - @measure_time def direct_exp_gen(self, prev_out: dict[str, Any]): with logger.tag("r"): # research hypo = self._propose() exp = self._exp_gen(hypo) return {"propose": hypo, "exp_gen": exp} - @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop exp = self.coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) logger.log_object(exp.sub_workspace_list, tag="coder result") return exp - @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback exp = self.runner.develop(prev_out["coding"]) logger.log_object(exp, tag="runner result") return exp - @measure_time def feedback(self, prev_out: dict[str, Any]): feedback = self.summarizer.generate_feedback( prev_out["running"], prev_out["direct_exp_gen"]["propose"], self.trace diff --git a/rdagent/log/time.py b/rdagent/log/time.py deleted file mode 100644 index 27b1b0db1..000000000 --- a/rdagent/log/time.py +++ /dev/null @@ -1,19 +0,0 @@ -import time -from functools import wraps - -from rdagent.log import rdagent_logger as logger - - -def measure_time(method): - @wraps(method) - def timed(*args, **kwargs): - start_time = time.time() - result = method(*args, **kwargs) - end_time = time.time() - duration = end_time - start_time - method_name = method.__name__ - # logger.log_object(f"{method_name} took {duration:.2f} sec") - logger.info(f"{method_name} took {duration:.2f} sec") - return result - - return timed From 3335406eb483872463cd097e8c4bd281ade3eb2c Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 19 Dec 2024 04:31:46 +0000 Subject: [PATCH 091/304] add the knowledge query for data_loader & feature --- .../coder/data_science/feature/__init__.py | 20 ++++- .../coder/data_science/feature/prompts.yaml | 36 ++++++++- .../data_science/raw_data_loader/__init__.py | 35 ++++++-- .../data_science/raw_data_loader/prompts.yaml | 81 ++++++++++++++++++- 4 files changed, 155 insertions(+), 17 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 5eafdd304..30628c646 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -26,12 +26,26 @@ def implement_one_task( ) -> dict[str, str]: # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. - competition_info = self.scen.get_scenario_all_desc() + feature_information_str = target_task.get_task_information() + + # 1. query + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[feature_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[feature_information_str] + if queried_knowledge is not None + else [] + ) # 2. 
code - system_prompt = T(".prompts:feature.system").r() + system_prompt = T(".prompts:feature.system").r(queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0]) user_prompt = T(".prompts:feature.user").r( - competition_info=competition_info, feature_spec=workspace.code_dict["spec/feature.md"] + feature_spec=workspace.code_dict["spec/feature.md"], + latest_code=workspace.code_dict.get("feat01.py"), ) feature_code = json.loads( diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 9d3287b94..e003686cc 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -1,6 +1,9 @@ feature: system: |- - You are a Python data scientist working on a new project. This project involves implementing feature engineering techniques to prepare data for machine learning models, and this project code will be written by GPT. + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + + This project involves implementing feature engineering techniques to prepare data for machine learning models, and this project code will be written by GPT. Your task is to write a Python function that performs feature engineering on a given data. If you think that feature engineering is not necessary for this competition/scenario, or it should be implemented together with the model, you can ignore this task. You should follow the provided specifications to complete this task. @@ -9,15 +12,40 @@ feature: { "code": "The Python code as a string." } + + -----------Here is the relevant information for this task----------- + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.code }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + ``` user: |- - ---------Competition Information--------- - {{ competition_info }} - ---------Feature Processing Specification--------- {{ feature_spec }} + {% if latest_code %} + ---------Former Specification--------- + Former Code: {{ latest_code }} + You should follow the former code to improve it. + {% endif %} + + feature_eval: system: |- You are data scientist. 
diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 31beb63c2..aad7b0dac 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -32,6 +32,7 @@ ) from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, + CoSTEERQueriedKnowledgeV2, ) from rdagent.components.coder.data_science.raw_data_loader.eval import ( DataLoaderCoSTEEREvaluator, @@ -53,14 +54,29 @@ def implement_one_task( # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. competition_info = self.scen.get_scenario_all_desc() + data_loader_task_info = target_task.get_task_information() + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[data_loader_task_info] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[data_loader_task_info] + if queried_knowledge is not None + else [] + ) + # 1. specifications - system_prompt = T(".prompts:spec.system").r(competition_info=competition_info) - data_loader_prompt = T(".prompts:spec.user.data_loader").r() - feature_prompt = T(".prompts:spec.user.feature").r() - model_prompt = T(".prompts:spec.user.model").r() - ensemble_prompt = T(".prompts:spec.user.ensemble").r() - workflow_prompt = T(".prompts:spec.user.workflow").r() + # TODO: Why is queried_former_failed_knowledge[0] used here? + system_prompt = T(".prompts:spec.system").r(competition_info=competition_info, + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0]) + data_loader_prompt = T(".prompts:spec.user.data_loader").r(latest_spec=workspace.code_dict.get("spec/data_loader.md")) + feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.code_dict.get("spec/feature.md")) + model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.code_dict.get("spec/model.md")) + ensemble_prompt = T(".prompts:spec.user.ensemble").r(latest_spec=workspace.code_dict.get("spec/ensemble.md")) + workflow_prompt = T(".prompts:spec.user.workflow").r(latest_spec=workspace.code_dict.get("spec/workflow.md")) spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) @@ -79,9 +95,12 @@ def implement_one_task( ] # 2. 
code - system_prompt = T(".prompts:data_loader_coder.system").r() + system_prompt = T(".prompts:data_loader_coder.system").r( + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0]) user_prompt = T(".prompts:data_loader_coder.user").r( - competition_info=competition_info, data_loader_spec=data_loader_spec + competition_info=competition_info, data_loader_spec=data_loader_spec, + latest_code=workspace.code_dict.get("load_data.py") ) data_loader_code = json.loads( diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index fbba49ffb..fbc0b58d6 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -1,5 +1,5 @@ spec: - system: + system: |- You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. @@ -16,6 +16,26 @@ spec: -----------Competition Information----------- {{ competition_info }} + -----------Here is the relevant information for this task----------- + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.code }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + user: data_loader: |- Data loader specification text should follow these detailed requirements: @@ -41,6 +61,12 @@ spec: - `y`: The target vector for the training data. - `X_test`: The feature matrix for the test data. - `test_ids`: The identifiers for the test data. + + {% if latest_spec %} + 4. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} Please respond with a JSON structure as follows: { @@ -59,7 +85,7 @@ spec: - Describes the purpose of the function. - Clarifies the input parameters and their types. - Defines the structure and format of the output. - 2. Precautions for Feature Engineering: + 2. Precautions for Feature Engineering (You should depend on the competition information to make a concise specification): - If feature engineering is strictly part of the model pipeline and should not be done here, explicitly state that feature engineering will be handled at the model stage. - If the competition requirements or modeling strategy dictate that feature engineering must be integrated into the model pipeline, this function will remain as a placeholder and return the input data unchanged. 
- When feature engineering is applied, consider the following precautions: @@ -68,6 +94,12 @@ spec: - Feature types: Ensure consistency between feature data types and transformations. - Custom features: Provide logic for domain-specific features, if applicable. - Avoid data leakage: Only use features derived from training data, excluding information from test or validation sets. + + {% if latest_spec %} + 3. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} Please respond with a JSON structure as follows: { @@ -107,6 +139,12 @@ spec: - Perform model training on `X` and `y`, and evaluate using `val_X` and `val_y`. - If `test_X` is provided, generate predictions for it. + {% if latest_spec %} + 3. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + Please respond in the following JSON format: { "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." @@ -132,6 +170,12 @@ spec: - Validate that `val_label` is provided and has the same length as `val_pred_l` predictions. - Perform checks to handle empty or invalid inputs gracefully. + {% if latest_spec %} + 3. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + Please respond in the following JSON format: { "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." @@ -142,6 +186,12 @@ spec: 1. Precautions: some precautions for workflow. + {% if latest_spec %} + 2. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + Please response the specification in the following json format. Here is an example structure for the JSON output: { "spec": "The specification as a string." @@ -157,6 +207,27 @@ data_loader_coder: { "code": "The Python code as a string." } + + -----------Here is the relevant information for this task----------- + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.code }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + user: |- ---------Competition Information--------- {{ competition_info }} @@ -164,6 +235,12 @@ data_loader_coder: ---------Data Loader Specification--------- {{ data_loader_spec }} + {% if latest_code %} + ---------Former Specification--------- + Former Code: {{ latest_code }} + You should follow the former code to improve it. 
+ {% endif %} + data_loader_eval: system: |- From 26da5c4593f382681e82b5522acb4aa955ebc12a Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Thu, 19 Dec 2024 09:59:45 +0000 Subject: [PATCH 092/304] edit ds workflow evaluator --- .../coder/data_science/model/prompts.yaml | 2 +- .../coder/data_science/workflow/eval.py | 44 ++++++++++++++++++- .../coder/data_science/workflow/prompts.yaml | 31 +++++++++++++ .../coder/data_science/workflow/test.py | 8 +++- .../aerial-cactus-identification/main.py | 2 +- 5 files changed, 81 insertions(+), 6 deletions(-) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 681e4c129..d67633a6c 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -61,7 +61,7 @@ model_coder: model_eval: system: |- - You are data scientist. + You are a data scientist. User is trying to implement some models in the following scenario: {{ scenario }} User will provide you the information of the model. diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 9d4c16b4b..76be7624b 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -1,4 +1,6 @@ +import json from rdagent.core.experiment import FBWorkspace, Task +from pathlib import Path from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, @@ -6,6 +8,15 @@ CoSTEERSingleFeedback, CoSTEERSingleFeedbackDeprecated, ) +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.oai.llm_utils import APIBackend + +DIRNAME = Path(__file__).absolute().resolve().parent + +WorkflowSingleFeedback = CoSTEERSingleFeedback +WorkflowMultiFeedback = CoSTEERMultiFeedback class WorkflowGeneralCaseSpecEvaluator(CoSTEEREvaluator): """ @@ -23,5 +34,34 @@ def evaluate( queried_knowledge: QueriedKnowledge = None, **kwargs, ) -> CoSTEERSingleFeedbackDeprecated: - - return \ No newline at end of file + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return WorkflowSingleFeedback( + execution_feedback="This task has failed too many times, skip implementation.", + shape_feedback="This task has failed too many times, skip implementation.", + value_feedback="This task has failed too many times, skip implementation.", + code_feedback="This task has failed too many times, skip implementation.", + final_feedback="This task has failed too many times, skip implementation.", + final_decision=False, + ) + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + de = DockerEnv(conf=ds_docker_conf) + fname = "main.py" + stdout = implementation.execute(env=de, entry=f"python {fname}") + system_prompt = T(".prompts:workflow_eval.system").r( + scenario="No scenario information yet.", + spec=implementation.code_dict["spec/workflow.md"] + ) + user_prompt = 
T(".prompts:workflow_eval.user").r( + stdout=stdout, + code=implementation.code_dict["main.py"], + ) + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return WorkflowSingleFeedback(**json.loads(resp)) + \ No newline at end of file diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 9e651c82d..ee5dc9772 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -29,3 +29,34 @@ workflow_coder: ---------ensemble code--------- {{ ensemble_code }} + +workflow_eval: + system: |- + You are a data scientist. + User is trying to build a workflow in the following scenario: + {{ scenario }} + User will provide you the information of the workflow and the components of it. + The information about how to build the workflow is given in specification file as below: + {{ spec }} + This workflow will import all the codes including data loading, feature engineering, model tuning and ensembling. + You are testing it by running the workflow code. The results will be collected as the stdout and it will help you evaluate the code. + + Your job is to evaluate the workflow code given by user. You should concern about whether the code executes successfully, generate prediction correctly and satisfy other requirements in specification. + + Please respond with your feedback in the following JSON format and order + ```json + { + "execution": "Describe whether the model execute successfully, including any errors or issues encountered.", + "return_checking": "Checks about the generated value, including whether the value generated and comparing the shape of model output and the requirement in spec.md.". You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model. + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code", compare the parameters name in stdout and if it is used in retraining part of code. 
+ "final_decision": + } + ``` + + user: |- + --------------Code generated by user:--------------- + {{ code }} + --------------stdoutput:--------------- + ''' + {{ stdout }} + ''' diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index c05691fa8..9ba15cc2b 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -40,9 +40,13 @@ def develop_one_competition(competition: str): sub_tasks=[wt], ) - es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + """es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) new_code = es.implement_one_task(target_task=wt, queried_knowledge=None, workspace = workflowexp) - print(new_code) + print(new_code)""" + + eva = WorkflowGeneralCaseSpecEvaluator(scen=scen) + exp.feedback = eva.evaluate(target_task=wt, queried_knowledge=None, implementation=workflowexp, gt_implementation=None) + print(exp.feedback) if __name__ == "__main__": diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py index f3a4bb78c..f6e2227bc 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py @@ -21,7 +21,7 @@ # Model workflow from model01 import model_workflow -val_pred, test_pred = model_workflow(train_images, train_labels, validation_images, validation_labels, test_images) +val_pred, test_pred, _ = model_workflow(train_images, train_labels, validation_images, validation_labels, test_images) # Ensemble From 00ad54e0ef4014b58a2423c27f59449a1b9aa598 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 19 Dec 2024 14:06:40 +0000 Subject: [PATCH 093/304] data_loader bug fix --- .../components/coder/data_science/raw_data_loader/eval.py | 2 +- .../raw_data_loader/eval_tests/data_loader_test.py | 4 +--- .../coder/data_science/raw_data_loader/prompts.yaml | 7 +++++++ rdagent/log/ui/llm_st.py | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index aedf4ab8b..2ba76b59c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -56,7 +56,7 @@ def evaluate( implementation.inject_code(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - system_prompt = T(".prompts:data_loader_eval.system").r(test_code=test_code) + system_prompt = T(".prompts:data_loader_eval.system").r(test_code=test_code, code=implementation.code_dict["load_data.py"]) user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py index d20f5c375..b1330362e 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py @@ -7,13 +7,11 @@ Please make sure the stdout is rich enough to support informative feedback """ -import logging import pickle from load_data 
import load_data # Setup logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") X, y, X_test, test_ids = load_data() @@ -21,7 +19,7 @@ assert len(X_test) == len(test_ids), "Mismatch in length of test images and test IDs" assert len(X) == len(y), "Mismatch in length of training images and labels" -logging.info("Data loader test passed successfully. Length of test images matches length of test IDs.") +print("Data loader test passed successfully. Length of test images matches length of test IDs.") with open("data.pkl", "wb") as f: pickle.dump((X, y, X_test, test_ids), f) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index fbc0b58d6..638d1a8ad 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -245,10 +245,17 @@ data_loader_coder: data_loader_eval: system: |- You are data scientist. + + The data loader code is: + ```python + {{code}} + ``` + You are testing the data_loader with the following code ```python {{test_code}} ``` + You'll be given the stdout of your testing scripts. Please respond with your feedback in the following JSON format and order ```json diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 052ace2eb..9191c09ec 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -121,10 +121,12 @@ def highlight_prompts_uri(uri): rdict = json.loads(resp) if "code" in rdict: code = rdict["code"] + st.markdown(":red[**Code in response dict:**]") st.code(code, language="python", wrap_lines=True) rdict.pop("code") elif "spec" in rdict: spec = rdict["spec"] + st.markdown(":red[**Spec in response dict:**]") st.markdown(spec) rdict.pop("spec") st.write(":red[**Other parts (except for the code or spec) in response dict:**]") From 35a1db9f59fe69b068f42bfd8808e1864623618a Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 19 Dec 2024 14:53:46 +0000 Subject: [PATCH 094/304] stop evolving when all tasks completed --- rdagent/core/evolving_agent.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index 6c5d923e5..344942c0c 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -93,6 +93,11 @@ def multistep_evolve( else eva.evaluate(evo, queried_knowledge=queried_knowledge) # type: ignore[arg-type, call-arg] ) logger.log_object(es.feedback, tag="evolving feedback") + + all_completed = all(f.final_decision for f in es.feedback) + if all_completed: + logger.info("All tasks in evolving subject have been completed.") + break # 6. 
update trace self.evolving_trace.append(es) From f96f9a2743e765869dc015669930d2b7def6a385 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 20 Dec 2024 02:23:40 +0000 Subject: [PATCH 095/304] llm app change --- rdagent/log/ui/llm_st.py | 55 +++++++++++++++++++++++++++------------ rdagent/utils/workflow.py | 2 +- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 9191c09ec..7b6918dcf 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -1,3 +1,4 @@ +import re import argparse import json from pathlib import Path @@ -24,8 +25,7 @@ if "log_path" not in session_state: session_state.log_path = None -eset = set() - +tlist = [] def load_data(): try: @@ -74,20 +74,36 @@ def highlight_prompts_uri(uri): parts = uri.split(":") return f"**{parts[0]}:**:green[**{parts[1]}**]" +def extract_loopid_func_name(tag): + match = re.search(r'Loop_(\d+)\.(\w+)\.', tag) + if match: + return match.group(1), match.group(2) + return None, None + +def extract_evoid(tag): + match = re.search(r'\.evo_loop_(\d+)\.', tag) + if match: + return match.group(1) + return None # Display the data for d in session_state.data: tag = d["tag"] obj = d["obj"] - if "evo_loop_" in tag: - tags = tag.split(".") - for t in tags: - if "evo_loop_" in t: - etag = t - break - if etag not in eset: - eset.add(etag) - st.subheader(f"**{etag}**", anchor=etag, divider="rainbow") + + loop_id, func_name = extract_loopid_func_name(tag) + evo_id = extract_evoid(tag) + if loop_id: + if loop_id not in tlist: + tlist.append(loop_id) + st.subheader(f"**Loop_{loop_id}**", anchor=f"Loop_{loop_id}", divider="blue") + if f"loop_{loop_id}.{func_name}" not in tlist: + tlist.append(f"loop_{loop_id}.{func_name}") + st.subheader(f"**{func_name}**", anchor=f"loop_{loop_id}.{func_name}", divider="green") + if f"loop_{loop_id}.{evo_id}" not in tlist: + tlist.append(f"loop_{loop_id}.evo_step_{evo_id}") + st.subheader(f"**evo_step_{evo_id}**", anchor=f"loop_{loop_id}.evo_step_{evo_id}", divider="orange") + if "debug_tpl" in tag: uri = obj["uri"] tpl = obj["template"] @@ -95,13 +111,13 @@ def highlight_prompts_uri(uri): rd = obj["rendered"] with st.expander(highlight_prompts_uri(uri), expanded=expand_all, icon="⚙️"): - t1, t2, t3 = st.tabs([":blue[**Template**]", ":orange[**Context**]", ":green[**Rendered**]"]) + t1, t2, t3 = st.tabs([":green[**Rendered**]", ":blue[**Template**]", ":orange[**Context**]"]) with t1: - show_text(tpl, lang="django") + show_text(rd) with t2: - st.json(cxt) + show_text(tpl, lang="django") with t3: - show_text(rd) + st.json(cxt) elif "debug_llm" in tag: system = obj.get("system", None) user = obj["user"] @@ -135,5 +151,12 @@ def highlight_prompts_uri(uri): st.json(resp) with st.sidebar: - et_toc = "\n".join(f"- [**{etag}**](#{etag})" for etag in sorted(eset)) + et_toc = "" + for t in tlist: + if t.startswith("L"): + et_toc += f"- [{t}](#{t})\n" + elif 'evo_step_' in t: + et_toc += f" - [{t}](#{t})\n" + else: + et_toc += f" - [{t}](#{t})\n" st.markdown(et_toc, unsafe_allow_html=True) diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 8ee89a962..07c4c39c4 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -106,7 +106,7 @@ def run(self, step_n: int | None = None): start = datetime.datetime.now(datetime.timezone.utc) name = self.steps[si] - with logger.tag(name): + with logger.tag(f"Loop_{li}.{name}"): func = getattr(self, name) try: self.loop_prev_out[name] = func(self.loop_prev_out) From 
a0a3db53f543b2348281ecb52f6044ff5e522f81 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 20 Dec 2024 03:17:52 +0000 Subject: [PATCH 096/304] fix break all complete strategy --- rdagent/components/coder/CoSTEER/evaluators.py | 3 +++ rdagent/core/evaluation.py | 3 +++ rdagent/core/evolving_agent.py | 11 +++++++---- rdagent/scenarios/data_science/proposal/prompts.yaml | 6 +++--- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index 4729a0f24..85b6764ae 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -52,6 +52,9 @@ def __str__(self) -> str: This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}. """ + def __bool__(self): + return self.final_decision + class CoSTEERSingleFeedbackDeprecated(CoSTEERSingleFeedback): """This class is a base class for all code generator feedback to single implementation""" diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py index 65d2c14d1..ce34975b7 100644 --- a/rdagent/core/evaluation.py +++ b/rdagent/core/evaluation.py @@ -16,6 +16,9 @@ class Feedback: pass + def __bool__(self) -> bool: + super().__bool__() + class Evaluator(ABC): """ diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index 344942c0c..05a7aec5d 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -93,14 +93,17 @@ def multistep_evolve( else eva.evaluate(evo, queried_knowledge=queried_knowledge) # type: ignore[arg-type, call-arg] ) logger.log_object(es.feedback, tag="evolving feedback") - - all_completed = all(f.final_decision for f in es.feedback) + + # 6. update trace + self.evolving_trace.append(es) + + # 7. check if all tasks are completed + if self.with_feedback: + all_completed = all(es.feedback) if isinstance(es.feedback, list) else es.feedback if all_completed: logger.info("All tasks in evolving subject have been completed.") break - # 6. update trace - self.evolving_trace.append(es) if self.with_feedback and filter_final_evo: evo = self.filter_evolvable_subjects_by_feedback(evo, self.evolving_trace[-1].feedback) return evo diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 034e17a09..4df8205e6 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -28,7 +28,7 @@ hypothesis_gen: task_gen: system: |- - {% if hypothesis is not None %} + {% if hypothesis is not none %} The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. {% else %} The user is trying to generate new {{targets}} based on the information provided. @@ -36,7 +36,7 @@ task_gen: The {{targets}} are used in certain scenario, the scenario is as follows: {{ scenario }} - {% if hypothesis is not None %} + {% if hypothesis is not none %} The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: 1. The target hypothesis you are targeting to generate {{targets}} for. 2. The hypothesis generated in the previous steps and their corresponding feedbacks. @@ -47,7 +47,7 @@ task_gen: {{ task_output_format }} user: |- - {% if hypothesis is not None %} + {% if hypothesis is not none %} The user has made several hypothesis on this scenario and did several evaluation on them. 
The target hypothesis you are targeting to generate {{targets}} for is as follows: {{ hypothesis }} From 74a2829790e7c0cc6a827bb23f644432cdb20b76 Mon Sep 17 00:00:00 2001 From: Xisen Wang <118058822+xisen-w@users.noreply.github.com> Date: Fri, 20 Dec 2024 13:32:15 +0800 Subject: [PATCH 097/304] Adding queried knowledge (#508) Co-authored-by: XianBW <36835909+XianBW@users.noreply.github.com> --- .../coder/data_science/ensemble/__init__.py | 28 ++++++++++++++++--- .../coder/data_science/ensemble/prompts.yaml | 21 ++++++++++++++ .../coder/data_science/ensemble/test.py | 3 +- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index 6727477e3..c9ff760e9 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -40,11 +40,31 @@ def implement_one_task( queried_knowledge: CoSTEERQueriedKnowledge | None = None, workspace: FBWorkspace | None = None, ) -> dict[str, str]: - # return a workspace with "ensemble.py" inside + # Get task information for knowledge querying + ensemble_information_str = target_task.get_task_information() + + # Query knowledge + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[ensemble_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[ensemble_information_str] + if queried_knowledge is not None + else [] + ) + + # Generate code with knowledge integration competition_info = self.scen.get_scenario_all_desc() - # Generate code - system_prompt = T(".prompts:ensemble_coder.system").r(competition_info=competition_info) - user_prompt = T(".prompts:ensemble_coder.user").r(ensemble_spec=workspace.code_dict["spec/ensemble.md"]) + system_prompt = T(".prompts:ensemble_coder.system").r( + competition_info=competition_info, + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0] if queried_former_failed_knowledge else None + ) + user_prompt = T(".prompts:ensemble_coder.user").r( + ensemble_spec=workspace.code_dict["spec/ensemble.md"] + ) ensemble_code = json.loads( APIBackend().build_messages_and_create_chat_completion( diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index 05493a918..8a043ca4e 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -10,6 +10,27 @@ ensemble_coder: { "code": "The Python code as a string." 
} + + -----------Here is the relevant information for this task----------- + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.code }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + user: |- Please implement an ensemble function with the following specification: -----------Ensemble Specification----------- diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py index dc3c79e2f..66cbd5f29 100644 --- a/rdagent/components/coder/data_science/ensemble/test.py +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -7,12 +7,13 @@ from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.scenarios.data_science.scen import DataScienceScen -from rdagent.scenarios.data_science.experiment.experiment import EnsembleExperiment +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment # Add the competition folder to path COMPETITION_PATH = Path(__file__).parent.parent.parent.parent.parent / "scenarios" / "kaggle" / "tpl_ex" / "aerial-cactus-identification" sys.path.append(str(COMPETITION_PATH)) +EnsembleExperiment = DSExperiment def load_ensemble_spec(): spec_path = COMPETITION_PATH / "spec" / "ensemble.md" From 737bdb9e3ba021bd3678e194a40297fcec7333b2 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 20 Dec 2024 05:36:07 +0000 Subject: [PATCH 098/304] fix loop bug --- rdagent/app/data_science/loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 8d30ff38f..d249760f2 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -90,6 +90,7 @@ def coding(self, prev_out: dict[str, Any]): def running(self, prev_out: dict[str, Any]): if not self.trace.all_components_completed(): + self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["coding"], None)) raise NextLoopException("Not all 5 components are completed, skip running of DataScienceRDLoop.") exp = self.runner.develop(prev_out["coding"]) return exp From ab41352fce69edba14323cf68e354e041d4bece5 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Fri, 20 Dec 2024 06:26:41 +0000 Subject: [PATCH 099/304] ds workflow evaluator; test; refine prompts --- .../coder/data_science/model/prompts.yaml | 35 ++++++++------- .../coder/data_science/workflow/__init__.py | 2 +- .../coder/data_science/workflow/eval.py | 8 ++-- .../coder/data_science/workflow/prompts.yaml | 45 ++++++++++++------- .../coder/data_science/workflow/test.py | 11 ++++- 5 files changed, 59 insertions(+), 42 deletions(-) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index d67633a6c..2185e6e05 100644 --- 
a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -19,6 +19,7 @@ model_coder: Error Learning: If previous failed attempts and their feedback are available, learn from them. Understand what went wrong and avoid repeating similar mistakes in your new implementation. + The failure knowledge may include the code unrelated to the model, such as data loading, preprocessing, or feature engineering. Focus only on the model implementation part. Formatting Your Response: Return only the code in a JSON format as shown below. Do not include any explanations or extra text. Example: @@ -33,24 +34,25 @@ model_coder: {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- - ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {% for similar_successful_knowledge in queried_similar_successful_knowledge %} + ===== Model {{loop.index}}: ===== {{ similar_successful_knowledge.target_task.get_task_information() }} - =====Code:===== + ===== Code: ===== {{ similar_successful_knowledge.implementation.code }} {% endfor %} {% endif %} {% if queried_former_failed_knowledge|length != 0 %} --------------Previous Failed Attempts:-------------- - {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: - =====Code:===== + {% for former_failed_knowledge in queried_former_failed_knowledge %} + Attempt {{ loop.index }}: + ===== Code: ===== {{ former_failed_knowledge.implementation.code }} - =====Feedback:===== + ===== Feedback: ===== {{ former_failed_knowledge.feedback }} {% endfor %} {% endif %} - {% if current_code is not none %} --------------Latest Code:-------------- {{ current_code }} @@ -58,13 +60,12 @@ model_coder: No prior code has been implemented. {% endif %} - model_eval: system: |- You are a data scientist. - User is trying to implement some models in the following scenario: + The user is trying to implement some models in the following scenario: {{ scenario }} - User will provide you the information of the model. + The user will provide you with the information of the model. The information about how to implement the model is given in spec.md as below: {{ spec }} You are testing the model with the following code: @@ -73,21 +74,21 @@ model_eval: ``` The first time you execute it, you will not provide test inputs, only train, valid inputs, and empty hyperparameters. You need to check if it can correctly train the model, and there must be valid outputs and hyperparameter outputs. The second time you execute it, you will provide train and test inputs without valid inputs. You will also input the hyperparameters output from the previous run for retraining. - Therefore, during the evaluate you must check: - - The hyperparameters returned must not be none. It should has parameters that will be useful for retrain later. It must include the early stop round. + Therefore, during the evaluation you must check: + - The hyperparameters returned must not be none. It should have parameters that will be useful for retraining later. It must include the early stop round. - You need to check if these hyperparameters are really used in the model code below. The early stop round must be used if given. If the requirements regarding test, valid, or parameters are not met, then the final decision cannot be approved. - You should evaluate the code given by user. 
You should concern about whether the user implement it correctly, including whether the shape of model's output is aligned with request, the equality of code, and any other thing you think necessary. - You will be given the code generated by user and the stdout of the testing process. + You should evaluate the code given by the user. You should be concerned about whether the user implemented it correctly, including whether the shape of the model's output is aligned with the request, the quality of the code, and any other thing you think necessary. + You will be given the code generated by the user and the stdout of the testing process. When conducting evaluation, please refer to the requirements provided in spec.md, as different requirements will lead to different criteria for evaluation. - Please respond with your feedback in the following JSON format and order + Please respond with your feedback in the following JSON format and order: ```json { - "execution": "Describe whether the model execute successfully, including any errors or issues encountered.", - "return_checking": "Checks about the generated value, including whether the value generated and comparing the shape of model output and the requirement in spec.md.". You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model. - "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code", compare the parameters name in stdout and if it is used in retraining part of code. + "execution": "Describe whether the model executed successfully, including any errors or issues encountered.", + "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in spec.md. You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. 
Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code.", "final_decision": } ``` diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 996985b1a..e879cca58 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -21,4 +21,4 @@ def __init__( WorkflowGeneralCaseSpecEvaluator(scen=scen), scen=scen ) # Please specify whether you agree running your eva in parallel or not es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 76be7624b..becef9f78 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -42,11 +42,9 @@ def evaluate( return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: return WorkflowSingleFeedback( - execution_feedback="This task has failed too many times, skip implementation.", - shape_feedback="This task has failed too many times, skip implementation.", - value_feedback="This task has failed too many times, skip implementation.", - code_feedback="This task has failed too many times, skip implementation.", - final_feedback="This task has failed too many times, skip implementation.", + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", final_decision=False, ) ds_docker_conf = DSDockerConf() diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index ee5dc9772..4937a4bcb 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -1,58 +1,69 @@ workflow_coder: system: |- - You are a Python data scientist working on a new kaggle competition project. + You are a Python data scientist working on a new Kaggle competition project. + + The user has written different Python functions that can load and preprocess data, execute feature engineering, train models, and ensemble them. - The user has write different Python function that can load and preprocess data, execute feature engineering, train models and ensemble them. These Python codes with different functionalities are written separately in different Python files. - Your task is to Integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow. + You don't need to edit the existing code. Your task is to integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow. This workflow code is also a Python file, and it functions similarly to a main process that calls the sub-files for each step and ultimately outputs a prediction file. 
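
The sentence above is the crux of the workflow coder: main.py is the integration script that chains the four components and writes the prediction file. A minimal sketch of such a script, assuming the interfaces quoted elsewhere in this series (load_data returning X, y, X_test, test_ids; model_workflow returning validation predictions, test predictions, and hyperparameters; ens_and_decision taking validation predictions and labels); the feature-engineering entry point and the submission column names are illustrative placeholders, not taken from the patch:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from load_data import load_data
from feat01 import feat_eng          # assumed entry point; only the file name feat01.py appears in the patch
from model01 import model_workflow
from ens import ens_and_decision

X, y, X_test, test_ids = load_data()
X, y = feat_eng(X, y)                # may be a pass-through when feature engineering is deferred to the model

# Default 80/20 split, matching the default this prompt goes on to state
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42)

val_pred, test_pred, _ = model_workflow(train_X, train_y, val_X, val_y, X_test)
final_pred = ens_and_decision([test_pred], [val_pred], val_y)

# Column names are placeholders; the generated script must follow the competition's sample submission format
pd.DataFrame({"id": test_ids, "prediction": np.ravel(final_pred)}).to_csv("submission.csv", index=False)
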
- The user will also provide some specifications about how to organize the whole code and give instructions. - These specifications are as below: + The user will also provide specifications on how to organize the code and give instructions. + These specifications are as follows: {{ workflow_spec }} - The code you implement should align the framework given in specifications. - Please response the code in the following json format. Here is an example structure for the JSON output: + The dataset provided by load_data is not split into training and testing sets. In the workflow, you should perform this splitting. + By default, use 80% of the data for training and 20% for testing. If the specification requires a different split ratio, cross-validation, or other splitting methods, follow the specification. + + The code you implement should align with the framework given in the specifications. + After predicting the output, print the shape and other information of the output to stdout to help the evaluator assess the code. + + Please respond with the code in the following JSON format. Here is an example structure for the JSON output: { "code": "The Python code as a string." } user: |- ---------load data code--------- + file: load_data.py {{ load_data_code }} ---------feature engineering code--------- + file: feat01.py {{ feature_code }} ---------model training code--------- + Attention: The input and output of the model function is flexible. Training dataset is necessary, but validation and test dateset might be optional. The hyperparameters can either be passed as arguments or be set as default values in the function. You need to use the function correctly. + file: model01.py {{ model_code }} ---------ensemble code--------- + file: ens.py {{ ensemble_code }} workflow_eval: system: |- You are a data scientist. - User is trying to build a workflow in the following scenario: + The user is trying to build a workflow in the following scenario: {{ scenario }} - User will provide you the information of the workflow and the components of it. - The information about how to build the workflow is given in specification file as below: + The user will provide you with the information of the workflow and its components. + The information about how to build the workflow is given in the specification file as below: {{ spec }} - This workflow will import all the codes including data loading, feature engineering, model tuning and ensembling. + This workflow will import all the codes including data loading, feature engineering, model tuning, and ensembling. You are testing it by running the workflow code. The results will be collected as the stdout and it will help you evaluate the code. - Your job is to evaluate the workflow code given by user. You should concern about whether the code executes successfully, generate prediction correctly and satisfy other requirements in specification. + Your job is to evaluate the workflow code given by the user. You should be concerned about whether the code executes successfully, generates predictions correctly, and satisfies other requirements in the specification. + The components have already been evaluated by the user, so you only need to evaluate and improve the workflow code unless there are very serious issues with the components. 
- Please respond with your feedback in the following JSON format and order + Please respond with your feedback in the following JSON format and order: ```json { - "execution": "Describe whether the model execute successfully, including any errors or issues encountered.", - "return_checking": "Checks about the generated value, including whether the value generated and comparing the shape of model output and the requirement in spec.md.". You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model. - "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code", compare the parameters name in stdout and if it is used in retraining part of code. + "execution": "Describe whether the model executed successfully, including any errors or issues encountered.", + "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in the specification. You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code.", "final_decision": } ``` - user: |- --------------Code generated by user:--------------- {{ code }} diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index 9ba15cc2b..4891bf73b 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -44,9 +44,16 @@ def develop_one_competition(competition: str): new_code = es.implement_one_task(target_task=wt, queried_knowledge=None, workspace = workflowexp) print(new_code)""" - eva = WorkflowGeneralCaseSpecEvaluator(scen=scen) + """eva = WorkflowGeneralCaseSpecEvaluator(scen=scen) exp.feedback = eva.evaluate(target_task=wt, queried_knowledge=None, implementation=workflowexp, gt_implementation=None) - print(exp.feedback) + print(exp.feedback)""" + + # Run the experiment + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) + + exp = workflow_coder.develop(exp) if __name__ == "__main__": From c2ed6e13b788f316dd2ce6ed9965c01498f0df94 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Fri, 20 Dec 2024 07:07:34 +0000 Subject: [PATCH 100/304] workflow spec --- .../data_science/raw_data_loader/prompts.yaml | 51 +++++++++++++------ 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 638d1a8ad..c575fff1b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -130,7 +130,7 @@ spec: - `pred_test`: Predictions on test data (`np.ndarray` of shape `(num_test_samples, 1)` or `None`). - `hyper_params`: A dictionary of important hyperparameters for model configuration. 
- - Include a clear and concise docstring to explain the function’s purpose, its input parameters, and its expected return values. + - Include a clear and concise docstring to explain the function's purpose, its input parameters, and its expected return values. 2. Precautions: - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have the correct shapes and consistent dimensions. @@ -152,7 +152,7 @@ spec: ensemble: |- - Ensemble specification text should include two parts: + Ensemble specification text adhere to the following requirements: 1. Function Interface: - The function name must be `ens_and_decision`. - The function should include: @@ -182,20 +182,39 @@ spec: } workflow: |- - Workflow specification text should include one parts: - 1. Precautions: - some precautions for workflow. - - {% if latest_spec %} - 2. Former Specification: - {{ latest_spec }} - You should follow the provided specifications to improve this task. - {% endif %} - - Please response the specification in the following json format. Here is an example structure for the JSON output: - { - "spec": "The specification as a string." - } + Your task is to implement the main workflow script (`main.py`) for a Kaggle-style machine learning competition project. + Follow the provided project structure and specifications to ensure consistency and maintainability: + 1. Workflow Integration: + - Integrate the following components into the workflow: + - Data loading (`load_data.py`). + - Feature engineering (`feat*.py`). + - Model workflow for training and testing (`model*.py`). + - Ensemble and decision-making (`ens.py`). + - Treat each component as a modular and callable Python function. + + 2. Dataset Splitting + - The dataset returned by `load_data` is not split into training and testing sets. + - By default, split the dataset into 80% for training and 20% for testing. + - You can also use cross-validation or other splitting methods as you deem more useful and appropriate based on the Competition Information. + + 3. Submission File: + - Save the final predictions as `submission.csv` in the format required by the competition. + - Present the required submission format explicitly and ensure the output adheres to it. + + 4. Code Standards: + - Use consistent naming conventions and type annotations. + - Document the workflow with clear comments and docstrings. + + {% if latest_spec %} + 5. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + + Please response the specification in the following json format. Here is an example structure for the JSON output: + { + "spec": "The corresponding specification string as described above. You should create the rules based on the competition information instead of copying the requirements." 
+ } data_loader_coder: system: |- From 02ddf811030eaf9662144f14a461850921d664c8 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Fri, 20 Dec 2024 07:23:50 +0000 Subject: [PATCH 101/304] fix ci --- rdagent/app/data_science/loop.py | 31 +++++++-------- .../coder/data_science/ensemble/__init__.py | 39 ++++++------------- .../coder/data_science/ensemble/conf.py | 2 +- .../coder/data_science/ensemble/eval.py | 18 +++++---- .../ensemble/eval_tests/ensemble_test.py | 16 ++++---- .../coder/data_science/ensemble/test.py | 27 +++++++------ .../coder/data_science/feature/__init__.py | 6 ++- .../coder/data_science/model/eval.py | 4 +- .../model/eval_tests/model_execute.py | 6 ++- .../data_science/raw_data_loader/__init__.py | 22 +++++++---- .../data_science/raw_data_loader/eval.py | 8 +++- .../coder/data_science/workflow/__init__.py | 2 +- .../coder/data_science/workflow/es.py | 26 ++++++------- .../coder/data_science/workflow/eval.py | 15 +++---- .../coder/data_science/workflow/test.py | 1 + rdagent/core/evolving_agent.py | 2 +- rdagent/core/experiment.py | 4 +- rdagent/log/ui/llm_st.py | 14 ++++--- .../scenarios/data_science/dev/feedback.py | 34 +++++++--------- rdagent/scenarios/data_science/dev/runner.py | 20 +++++----- .../data_science/experiment/experiment.py | 1 + .../data_science/proposal/exp_gen.py | 5 ++- .../aerial-cactus-identification/model01.py | 2 +- 23 files changed, 156 insertions(+), 149 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index d249760f2..48b94683c 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -2,14 +2,18 @@ from typing import Any, Literal import fire -from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER +from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature import FeatureCoSTEER +from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model import ModelCoSTEER -from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import NextLoopException, RDLoop from rdagent.core.exception import FactorEmptyError, ModelEmptyError @@ -23,15 +27,13 @@ from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger -from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace, DSExpGen -from rdagent.scenarios.kaggle.kaggle_crawler import download_data -from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask -from rdagent.components.coder.data_science.feature.exp import FeatureTask -from rdagent.components.coder.data_science.model.exp import ModelTask -from rdagent.components.coder.data_science.raw_data_loader.exp 
import DataLoaderTask -from rdagent.components.coder.data_science.workflow.exp import WorkflowTask -from rdagent.scenarios.data_science.dev.runner import DSRunner from rdagent.scenarios.data_science.dev.feedback import DSExperiment2Feedback +from rdagent.scenarios.data_science.dev.runner import DSRunner +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace +from rdagent.scenarios.kaggle.kaggle_crawler import download_data + + class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) @@ -103,7 +105,7 @@ def feedback(self, prev_out: dict[str, Any]): prev_out["running"], prev_out["direct_exp_gen"].hypothesis, self.trace ) self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) - + def main(path=None, step_n=None, competition=None): """ @@ -129,8 +131,3 @@ def main(path=None, step_n=None, competition=None): if __name__ == "__main__": fire.Fire(main) - - - - - diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index c9ff760e9..d4ad7ae42 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -22,16 +22,13 @@ from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) -from rdagent.components.coder.data_science.ensemble.eval import ( - EnsembleCoSTEEREvaluator, -) +from rdagent.components.coder.data_science.ensemble.eval import EnsembleCoSTEEREvaluator from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -from rdagent.core.experiment import FBWorkspace - class EnsembleMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( @@ -42,7 +39,7 @@ def implement_one_task( ) -> dict[str, str]: # Get task information for knowledge querying ensemble_information_str = target_task.get_task_information() - + # Query knowledge queried_similar_successful_knowledge = ( queried_knowledge.task_to_similar_task_successful_knowledge[ensemble_information_str] @@ -60,24 +57,22 @@ def implement_one_task( system_prompt = T(".prompts:ensemble_coder.system").r( competition_info=competition_info, queried_similar_successful_knowledge=queried_similar_successful_knowledge, - queried_former_failed_knowledge=queried_former_failed_knowledge[0] if queried_former_failed_knowledge else None - ) - user_prompt = T(".prompts:ensemble_coder.user").r( - ensemble_spec=workspace.code_dict["spec/ensemble.md"] + queried_former_failed_knowledge=( + queried_former_failed_knowledge[0] if queried_former_failed_knowledge else None + ), ) + user_prompt = T(".prompts:ensemble_coder.user").r(ensemble_spec=workspace.code_dict["spec/ensemble.md"]) ensemble_code = json.loads( APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=True + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) )["code"] return { "ensemble.py": ensemble_code, } - + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): """ Assign the code list to the evolving item. 
@@ -102,19 +97,7 @@ def __init__( *args, **kwargs, ) -> None: - eva = CoSTEERMultiEvaluator( - EnsembleCoSTEEREvaluator(scen=scen), scen=scen - ) + eva = CoSTEERMultiEvaluator(EnsembleCoSTEEREvaluator(scen=scen), scen=scen) es = EnsembleMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - super().__init__( - *args, - settings=CoSTEER_SETTINGS, - eva=eva, - es=es, - evolving_version=2, - scen=scen, - **kwargs - ) - - + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/ensemble/conf.py b/rdagent/components/coder/data_science/ensemble/conf.py index 5ba4cd60f..b6c859788 100644 --- a/rdagent/components/coder/data_science/ensemble/conf.py +++ b/rdagent/components/coder/data_science/ensemble/conf.py @@ -1,2 +1,2 @@ # Configuration file for ensemble component -# Currently empty as no specific configuration is needed +# Currently empty as no specific configuration is needed diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index fc2476c53..4ef2f87c1 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -1,6 +1,7 @@ import json from dataclasses import dataclass from pathlib import Path + import numpy as np from rdagent.components.coder.CoSTEER.evaluators import ( @@ -12,12 +13,13 @@ from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -from rdagent.utils.env import DSDockerConf, DockerEnv +from rdagent.utils.env import DockerEnv, DSDockerConf DIRNAME = Path(__file__).absolute().resolve().parent EnsembleEvalFeedback = CoSTEERSingleFeedback + class EnsembleCoSTEEREvaluator(CoSTEEREvaluator): def evaluate( self, @@ -27,10 +29,12 @@ def evaluate( queried_knowledge: QueriedKnowledge = None, **kwargs, ) -> EnsembleEvalFeedback: - + target_task_information = target_task.get_task_information() - if (queried_knowledge is not None and - target_task_information in queried_knowledge.success_task_to_knowledge_dict): + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: return EnsembleEvalFeedback( @@ -51,7 +55,5 @@ def evaluate( system_prompt = T(".prompts:ensemble_eval.system").r(test_code=test_code) user_prompt = T(".prompts:ensemble_eval.user").r(stdout=stdout) - resp = APIBackend().build_messages_and_create_chat_completion( - user_prompt, system_prompt, json_mode=True - ) - return EnsembleEvalFeedback(**json.loads(resp)) + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return EnsembleEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index cc40fc7e0..6f868d357 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -5,12 +5,14 @@ - Have correct shapes for inputs and outputs - Use validation data appropriately """ -import numpy as np + import logging + +import numpy as np 
from ensemble import ens_and_decision # Setup logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") # Create test data n_models = 3 @@ -24,17 +26,17 @@ # Run ensemble try: final_predictions = ens_and_decision(test_pred_l, val_pred_l, val_label) - + # Check shape assert final_predictions.shape == (n_samples, 1), "Wrong output shape" - + # Check binary values assert np.all(np.isin(final_predictions, [0, 1])), "Predictions must be binary (0 or 1)" - + logging.info("Ensemble test passed successfully.") logging.info(f"Output shape: {final_predictions.shape}") logging.info(f"Unique values in predictions: {np.unique(final_predictions)}") - + except Exception as e: logging.error(f"Test failed: {str(e)}") - raise \ No newline at end of file + raise diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py index 66cbd5f29..9986aa978 100644 --- a/rdagent/components/coder/data_science/ensemble/test.py +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -1,23 +1,31 @@ """ Helper functions for testing the ensemble coder(CoSTEER-based) component. """ + import sys from pathlib import Path from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask -from rdagent.scenarios.data_science.scen import DataScienceScen from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.scen import DataScienceScen # Add the competition folder to path -COMPETITION_PATH = Path(__file__).parent.parent.parent.parent.parent / "scenarios" / "kaggle" / "tpl_ex" / "aerial-cactus-identification" +COMPETITION_PATH = ( + Path(__file__).parent.parent.parent.parent.parent + / "scenarios" + / "kaggle" + / "tpl_ex" + / "aerial-cactus-identification" +) sys.path.append(str(COMPETITION_PATH)) -EnsembleExperiment = DSExperiment +EnsembleExperiment = DSExperiment + def load_ensemble_spec(): spec_path = COMPETITION_PATH / "spec" / "ensemble.md" - with open(spec_path, 'r') as f: + with open(spec_path, "r") as f: return f.read() @@ -31,15 +39,12 @@ def develop_ensemble(): # Create the ensemble task with actual data context and specification task = EnsembleTask( name="EnsembleTask", - description= - """ + description=""" Implement ensemble and decision making for model predictions. - """ + """, ) - exp = EnsembleExperiment( - sub_tasks=[task] - ) + exp = EnsembleExperiment(sub_tasks=[task]) # Injecting the corresponding specification exp.experiment_workspace.inject_code(**{"spec/ensemble.md": ensemble_spec}) @@ -50,4 +55,4 @@ def develop_ensemble(): if __name__ == "__main__": - develop_ensemble() + develop_ensemble() diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 30628c646..a4f2d9a3a 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -41,8 +41,10 @@ def implement_one_task( ) # 2. 
code - system_prompt = T(".prompts:feature.system").r(queried_similar_successful_knowledge=queried_similar_successful_knowledge, - queried_former_failed_knowledge=queried_former_failed_knowledge[0]) + system_prompt = T(".prompts:feature.system").r( + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) user_prompt = T(".prompts:feature.user").r( feature_spec=workspace.code_dict["spec/feature.md"], latest_code=workspace.code_dict.get("feat01.py"), diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index d05c5b505..b133d28db 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -71,9 +71,7 @@ def evaluate( implementation.inject_code(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:model_eval.system").r( - test_code=test_code, - scenario="No scenario information yet.", - spec=implementation.code_dict["spec/model.md"] + test_code=test_code, scenario="No scenario information yet.", spec=implementation.code_dict["spec/model.md"] ) user_prompt = T(".prompts:model_eval.user").r( stdout=stdout, diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index 47654e961..30459be77 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -24,7 +24,11 @@ print("The first execution begins.\n") # Call model_workflow val_pred, test_pred, hypers = model_workflow( - X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=None, + X=train_X, + y=train_y, + val_X=val_X, + val_y=val_y, + test_X=None, ) # val_pred = np.random.rand(8, 1) # test_pred = np.random.rand(8, 1) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index aad7b0dac..d19a8adf4 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -66,13 +66,17 @@ def implement_one_task( if queried_knowledge is not None else [] ) - + # 1. specifications # TODO: Why is queried_former_failed_knowledge[0] used here? 
- system_prompt = T(".prompts:spec.system").r(competition_info=competition_info, - queried_similar_successful_knowledge=queried_similar_successful_knowledge, - queried_former_failed_knowledge=queried_former_failed_knowledge[0]) - data_loader_prompt = T(".prompts:spec.user.data_loader").r(latest_spec=workspace.code_dict.get("spec/data_loader.md")) + system_prompt = T(".prompts:spec.system").r( + competition_info=competition_info, + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) + data_loader_prompt = T(".prompts:spec.user.data_loader").r( + latest_spec=workspace.code_dict.get("spec/data_loader.md") + ) feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.code_dict.get("spec/feature.md")) model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.code_dict.get("spec/model.md")) ensemble_prompt = T(".prompts:spec.user.ensemble").r(latest_spec=workspace.code_dict.get("spec/ensemble.md")) @@ -97,10 +101,12 @@ def implement_one_task( # 2. code system_prompt = T(".prompts:data_loader_coder.system").r( queried_similar_successful_knowledge=queried_similar_successful_knowledge, - queried_former_failed_knowledge=queried_former_failed_knowledge[0]) + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) user_prompt = T(".prompts:data_loader_coder.user").r( - competition_info=competition_info, data_loader_spec=data_loader_spec, - latest_code=workspace.code_dict.get("load_data.py") + competition_info=competition_info, + data_loader_spec=data_loader_spec, + latest_code=workspace.code_dict.get("load_data.py"), ) data_loader_code = json.loads( diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 2ba76b59c..cdb3ba6cf 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -8,7 +8,9 @@ CoSTEEREvaluator, CoSTEERSingleFeedback, ) -from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledgeV2 +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledgeV2, +) from rdagent.core.evaluation import Feedback from rdagent.core.experiment import FBWorkspace, Task, Workspace from rdagent.oai.llm_utils import APIBackend @@ -56,7 +58,9 @@ def evaluate( implementation.inject_code(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - system_prompt = T(".prompts:data_loader_eval.system").r(test_code=test_code, code=implementation.code_dict["load_data.py"]) + system_prompt = T(".prompts:data_loader_eval.system").r( + test_code=test_code, code=implementation.code_dict["load_data.py"] + ) user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index e879cca58..8c120a7d9 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -1,13 +1,13 @@ from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -from rdagent.core.scenario import 
Scenario from rdagent.components.coder.data_science.workflow.es import ( WorkflowMultiProcessEvolvingStrategy, ) from rdagent.components.coder.data_science.workflow.eval import ( WorkflowGeneralCaseSpecEvaluator, ) +from rdagent.core.scenario import Scenario class WorkflowCoSTEER(CoSTEER): diff --git a/rdagent/components/coder/data_science/workflow/es.py b/rdagent/components/coder/data_science/workflow/es.py index 6c62b288f..378098cfd 100644 --- a/rdagent/components/coder/data_science/workflow/es.py +++ b/rdagent/components/coder/data_science/workflow/es.py @@ -1,13 +1,17 @@ import json -from rdagent.components.coder.data_science.workflow.exp import WorkflowTask -from rdagent.components.coder.CoSTEER.knowledge_management import CoSTEERQueriedKnowledge -from rdagent.oai.llm_utils import APIBackend + from rdagent.components.coder.CoSTEER.evolving_strategy import ( MultiProcessEvolvingStrategy, ) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import FBWorkspace +from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T + class WorkflowMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( self, @@ -16,10 +20,8 @@ def implement_one_task( workspace: FBWorkspace | None = None, ) -> dict[str, str]: # competition_info = self.scen.competition_descriptions - - system_prompt = T(".prompts:workflow_coder.system").r( - workflow_spec=workspace.code_dict["spec/workflow.md"] - ) + + system_prompt = T(".prompts:workflow_coder.system").r(workflow_spec=workspace.code_dict["spec/workflow.md"]) user_prompt = T(".prompts:workflow_coder.user").r( load_data_code=workspace.code_dict["load_data.py"], feature_code=workspace.code_dict["feat01.py"], @@ -31,11 +33,9 @@ def implement_one_task( user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) )["code"] - - return{ - "main.py": data_loader_code - } - + + return {"main.py": data_loader_code} + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): """ Assign the code list to the evolving item. 
@@ -51,5 +51,3 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): evo.sub_workspace_list[index] = evo.experiment_workspace evo.sub_workspace_list[index].inject_code(**code_list[index]) return evo - - diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index becef9f78..dc85ff471 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -1,23 +1,25 @@ import json -from rdagent.core.experiment import FBWorkspace, Task from pathlib import Path -from rdagent.core.evolving_framework import QueriedKnowledge + +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, CoSTEERSingleFeedback, CoSTEERSingleFeedbackDeprecated, ) +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf -from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.oai.llm_utils import APIBackend DIRNAME = Path(__file__).absolute().resolve().parent WorkflowSingleFeedback = CoSTEERSingleFeedback WorkflowMultiFeedback = CoSTEERMultiFeedback + class WorkflowGeneralCaseSpecEvaluator(CoSTEEREvaluator): """ Motivation case: @@ -26,6 +28,7 @@ class WorkflowGeneralCaseSpecEvaluator(CoSTEEREvaluator): Test workflow: - Build train, valid, and test data to run it, and test the output (e.g., shape, etc.) """ + def evaluate( self, target_task: Task, @@ -53,8 +56,7 @@ def evaluate( fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:workflow_eval.system").r( - scenario="No scenario information yet.", - spec=implementation.code_dict["spec/workflow.md"] + scenario="No scenario information yet.", spec=implementation.code_dict["spec/workflow.md"] ) user_prompt = T(".prompts:workflow_eval.user").r( stdout=stdout, @@ -62,4 +64,3 @@ def evaluate( ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return WorkflowSingleFeedback(**json.loads(resp)) - \ No newline at end of file diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index 4891bf73b..d6e255c1d 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -17,6 +17,7 @@ from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen + def develop_one_competition(competition: str): scen = DataScienceScen(competition=competition) workflow_coder = WorkflowCoSTEER(scen) diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index 05a7aec5d..2f8ea1c14 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -96,7 +96,7 @@ def multistep_evolve( # 6. update trace self.evolving_trace.append(es) - + # 7. 
check if all tasks are completed if self.with_feedback: all_completed = all(es.feedback) if isinstance(es.feedback, list) else es.feedback diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index bd40c2407..e65ec5999 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -231,7 +231,9 @@ def __init__( self.based_experiments: Sequence[ASpecificWSForExperiment] = based_experiments self.result: object = None # The result of the experiment, can be different types in different scenarios. - self.sub_results: dict[str, float] = {} # TODO: in Kaggle, now sub results are all saved in self.result, remove this in the future. + self.sub_results: dict[str, float] = ( + {} + ) # TODO: in Kaggle, now sub results are all saved in self.result, remove this in the future. self.experiment_workspace: ASpecificWSForExperiment | None = None diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 7b6918dcf..17dc8abf9 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -1,6 +1,6 @@ -import re import argparse import json +import re from pathlib import Path import streamlit as st @@ -27,6 +27,7 @@ tlist = [] + def load_data(): try: with open(f"{main_log_path}/{session_state.log_path}/debug_llm.json", "r") as f: @@ -74,23 +75,26 @@ def highlight_prompts_uri(uri): parts = uri.split(":") return f"**{parts[0]}:**:green[**{parts[1]}**]" + def extract_loopid_func_name(tag): - match = re.search(r'Loop_(\d+)\.(\w+)\.', tag) + match = re.search(r"Loop_(\d+)\.(\w+)\.", tag) if match: return match.group(1), match.group(2) return None, None + def extract_evoid(tag): - match = re.search(r'\.evo_loop_(\d+)\.', tag) + match = re.search(r"\.evo_loop_(\d+)\.", tag) if match: return match.group(1) return None + # Display the data for d in session_state.data: tag = d["tag"] obj = d["obj"] - + loop_id, func_name = extract_loopid_func_name(tag) evo_id = extract_evoid(tag) if loop_id: @@ -155,7 +159,7 @@ def extract_evoid(tag): for t in tlist: if t.startswith("L"): et_toc += f"- [{t}](#{t})\n" - elif 'evo_step_' in t: + elif "evo_step_" in t: et_toc += f" - [{t}](#{t})\n" else: et_toc += f" - [{t}](#{t})\n" diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index ab9ca03e1..e19247ccb 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -4,17 +4,14 @@ from rdagent.components.knowledge_management.graph import UndirectedNode from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts -from rdagent.core.proposal import ( - Experiment2Feedback, - HypothesisFeedback, -) +from rdagent.core.proposal import Experiment2Feedback, HypothesisFeedback from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace from rdagent.utils import convert2bool from rdagent.utils.agent.tpl import T -from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace class DSExperiment2Feedback(Experiment2Feedback): def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeedback: @@ -31,7 +28,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeed elif hypothesis.component == "Workflow": modified_file_name = "main.py" modified_code = 
exp.experiment_workspace.code_dict[modified_file_name] - + sota_hypothesis, sota_exp = trace.get_sota_hypothesis_and_experiment() if sota_exp: @@ -46,15 +43,12 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeed else: sota_codes = None sota_results = None - - + last_hypothesis_and_feedback = None if trace.hist and len(trace.hist) > 0: last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) - - system_prompt = T(".prompts:exp_feedback.system").r( - scenario=self.scen.get_scenario_all_desc() - ) + + system_prompt = T(".prompts:exp_feedback.system").r(scenario=self.scen.get_scenario_all_desc()) user_prompt = T(".prompts:exp_feedback.user").r( sota_codes=sota_codes, sota_results=sota_results, @@ -63,19 +57,19 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeed current_results=current_results, last_hypothesis_and_feedback=last_hypothesis_and_feedback, ) - + resp_dict = json.loads( APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=True, - ) + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True, ) - + ) + return HypothesisFeedback( observations=resp_dict.get("Observations", "No observations provided"), hypothesis_evaluation=resp_dict.get("Feedback for Hypothesis", "No feedback provided"), new_hypothesis=resp_dict.get("New Hypothesis", "No new hypothesis provided"), reason=resp_dict.get("Reasoning", "No reasoning provided"), decision=convert2bool(resp_dict.get("Replace Best Result", "no")), - ) \ No newline at end of file + ) diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py index e55ea0c31..d52777638 100644 --- a/rdagent/scenarios/data_science/dev/runner.py +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -1,30 +1,32 @@ -from rdagent.core.developer import Developer import pandas as pd -from rdagent.core.exception import CoderError -from rdagent.utils.env import DockerEnv, DSDockerConf + from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.core.developer import Developer +from rdagent.core.exception import CoderError from rdagent.log import rdagent_logger as logger from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.utils.env import DockerEnv, DSDockerConf + class DSRunner(Developer[DSExperiment]): def develop(self, exp: DSExperiment) -> DSExperiment: ds_docker_conf = DSDockerConf() ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} - + de = DockerEnv(conf=ds_docker_conf) - + # execute workflow exp.experiment_workspace.execute(env=de, entry="python main.py") submission_fp = exp.experiment_workspace.workspace_path / "submission.csv" score_fp = exp.experiment_workspace.workspace_path / "scores.csv" - + if not submission_fp.exists(): logger.error("Submission file (submission.csv) is not generated.") raise CoderError("Submission file (submission.csv) is not generated.") - + if not score_fp.exists(): logger.error("Metrics file (scores.csv) is not generated.") raise CoderError("Metrics file (scores.csv) is not generated.") - + exp.result = pd.read_csv(score_fp, index_col=0) - return exp \ No newline at end of file + return exp diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 9a38ee49a..494a11707 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ 
b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,5 +1,6 @@ from rdagent.core.experiment import Experiment, FBWorkspace, Task + class DSExperiment(Experiment[Task, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index ab388ee09..9206dd3c5 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -61,7 +61,7 @@ def get_sota_hypothesis_and_experiment( continue return h, exp return None, None - + @property def successful_components(self) -> set[COMPONENT]: """ @@ -72,13 +72,14 @@ def successful_components(self) -> set[COMPONENT]: if hf.decision: successful_components.add(h.component) return successful_components - + def all_components_completed(self) -> bool: """ Check if 5 successful components are completed. """ return set(ORDER) == self.successful_components + class DSExpGen(ExpGen): """Data Science Task Generator.""" diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py index 0a22c8d51..4b4f16259 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py @@ -141,7 +141,7 @@ def model_workflow( print(f"Dynamic early_stop_round: {dynamic_early_stop}") hyper_params["early_stop_round"] = dynamic_early_stop - + # Predict on validation data val_pred = model.predict(validation_datagen.flow(validation_images, batch_size=1, shuffle=False), verbose=1) else: From bfa455a82d71506a53b58995627022eab9073801 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 20 Dec 2024 08:56:09 +0000 Subject: [PATCH 102/304] feature task changes --- .../coder/data_science/feature/exp.py | 16 +--------------- .../coder/data_science/feature/test.py | 4 ++-- .../scenarios/data_science/proposal/exp_gen.py | 14 +++++--------- .../data_science/proposal/prompts.yaml | 17 +---------------- 4 files changed, 9 insertions(+), 42 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/exp.py b/rdagent/components/coder/data_science/feature/exp.py index 0e3d6e80e..2ae9a6a07 100644 --- a/rdagent/components/coder/data_science/feature/exp.py +++ b/rdagent/components/coder/data_science/feature/exp.py @@ -13,28 +13,14 @@ def __init__( self, name: str, description: str, - variables: dict = {}, - implementation: bool = False, **kwargs, ) -> None: - self.variables: dict = variables - self.implementation: bool = implementation super().__init__(name=name, description=description, **kwargs) def get_task_information(self): return f"""name: {self.name} description: {self.description} -variables: {str(self.variables)} -spec: {self.spec}""" - - def get_task_information_and_implementation_result(self): - return { - "name": self.factor_name, - "description": self.factor_description, - "variables": str(self.variables), - "spec": self.spec, - "implementation": str(self.implementation), - } +""" @staticmethod def from_dict(dict): diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 801addc46..2f59f17f5 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -20,14 +20,14 @@ def develop_one_competition(competition: str): # -> 
experiment feat_spec = file.read() # Create the experiment - ft = FeatureTask(name="FeatureTask", description=scen.competition_descriptions, spec=feat_spec) + ft = FeatureTask(name="FeatureTask", description=scen.competition_descriptions) exp = DSExperiment( sub_tasks=[ft], ) with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r") as file: load_data_code = file.read() - exp.experiment_workspace.inject_code(**{"load_data.py": load_data_code}) + exp.experiment_workspace.inject_code(**{"load_data.py": load_data_code, "spec/feature.md": feat_spec}) # Develop the experiment exp = feature_coder.develop(exp) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 9206dd3c5..c7e262424 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -173,16 +173,12 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") - tasks = [] - for fn in resp_dict: - ft = FeatureTask( - name=fn, - description=resp_dict[fn].get("description", "Factor description not provided"), - formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), - variables=resp_dict[fn].get("variables", "Variables not provided"), - ) + ft = FeatureTask( + name="Feature Engineering", + description=resp_dict.get("description", "Factor description not provided"), + ) - exp = DSExperiment(sub_tasks=tasks, hypothesis=hypothesis) + exp = DSExperiment(sub_tasks=[ft], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Model": diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 4df8205e6..4bd0b250b 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -94,22 +94,7 @@ output_format: According to the hypothesis, please help user design one or more feature engineering tasks. The output should follow JSON format. The schema is as follows: { - "feature name 1": { - "description": "description of feature name 1", - "formulation": "latex formulation of feature or group name 1", - "variables": { - "variable or function name 1": "description of variable or function 1", - "variable or function name 2": "description of variable or function 2" - } - }, - "feature name 2": { - "description": "description of feature name 2", - "formulation": "latex formulation of feature or group name 2", - "variables": { - "variable or function name 1": "description of variable or function 1", - "variable or function name 2": "description of variable or function 2" - } - } + "description": "description of feature engineering task", # Don't add ellipsis (...) or any filler text that might cause JSON parsing errors here! 
} model: |- From 61f0cb81f2a40b7ff8530560f498e3bdb1373a9b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 03:13:04 +0000 Subject: [PATCH 103/304] ds loop change --- rdagent/app/data_science/loop.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 48b94683c..fafe4e533 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -32,7 +32,7 @@ from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace from rdagent.scenarios.kaggle.kaggle_crawler import download_data - +from rdagent.core.proposal import HypothesisFeedback class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) @@ -92,13 +92,19 @@ def coding(self, prev_out: dict[str, Any]): def running(self, prev_out: dict[str, Any]): if not self.trace.all_components_completed(): - self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["coding"], None)) raise NextLoopException("Not all 5 components are completed, skip running of DataScienceRDLoop.") exp = self.runner.develop(prev_out["coding"]) return exp def feedback(self, prev_out: dict[str, Any]): if not self.trace.all_components_completed(): + self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["coding"], HypothesisFeedback( + observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", + hypothesis_evaluation="", + new_hypothesis="", + reason="", + decision=True + ))) raise NextLoopException("Not all 5 components are completed, skip feedback of DataScienceRDLoop.") feedback = self.summarizer.generate_feedback( From 251688b87e5f8cb47eebda440e94519ff917fe9d Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 23 Dec 2024 06:18:31 +0000 Subject: [PATCH 104/304] fix a bug in feat --- rdagent/components/coder/data_science/feature/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 2f59f17f5..c1f3e50b7 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -20,7 +20,7 @@ def develop_one_competition(competition: str): # -> experiment feat_spec = file.read() # Create the experiment - ft = FeatureTask(name="FeatureTask", description=scen.competition_descriptions) + ft = FeatureTask(name="FeatureTask", description=scen.get_competition_full_desc()) exp = DSExperiment( sub_tasks=[ft], ) From 438a5698af22569fbcd8882a986cb2d27fd6d39f Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 23 Dec 2024 06:57:25 +0000 Subject: [PATCH 105/304] add query knowledge for model and workflow --- rdagent/app/data_science/loop.py | 23 ++-- .../coder/data_science/feature/prompts.yaml | 3 +- .../coder/data_science/model/__init__.py | 72 ++++++++++- .../components/coder/data_science/model/es.py | 120 ------------------ .../coder/data_science/model/prompts.yaml | 42 +++--- .../coder/data_science/model/test.py | 3 - .../coder/data_science/workflow/__init__.py | 73 ++++++++++- .../coder/data_science/workflow/es.py | 53 -------- .../coder/data_science/workflow/exp.py | 5 + .../coder/data_science/workflow/prompts.yaml | 39 +++++- .../coder/data_science/workflow/test.py | 3 - 11 files changed, 212 insertions(+), 224 deletions(-) delete mode 100644 
rdagent/components/coder/data_science/model/es.py delete mode 100644 rdagent/components/coder/data_science/workflow/es.py diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index fafe4e533..2d53e6163 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -21,6 +21,7 @@ Experiment2Feedback, ExpGen, Hypothesis2Experiment, + HypothesisFeedback, HypothesisGen, Trace, ) @@ -32,7 +33,7 @@ from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace from rdagent.scenarios.kaggle.kaggle_crawler import download_data -from rdagent.core.proposal import HypothesisFeedback + class DataScienceRDLoop(RDLoop): skip_loop_error = (NextLoopException,) @@ -98,13 +99,19 @@ def running(self, prev_out: dict[str, Any]): def feedback(self, prev_out: dict[str, Any]): if not self.trace.all_components_completed(): - self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["coding"], HypothesisFeedback( - observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", - hypothesis_evaluation="", - new_hypothesis="", - reason="", - decision=True - ))) + self.trace.hist.append( + ( + prev_out["direct_exp_gen"].hypothesis, + prev_out["coding"], + HypothesisFeedback( + observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", + hypothesis_evaluation="", + new_hypothesis="", + reason="", + decision=True, + ), + ) + ) raise NextLoopException("Not all 5 components are completed, skip feedback of DataScienceRDLoop.") feedback = self.summarizer.generate_feedback( diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index e003686cc..8b63ea1aa 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -1,6 +1,6 @@ feature: system: |- - You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. This project involves implementing feature engineering techniques to prepare data for machine learning models, and this project code will be written by GPT. 
@@ -33,7 +33,6 @@ feature: {% endfor %} {% endif %} - ``` user: |- ---------Feature Processing Specification--------- {{ feature_spec }} diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 973741d1d..f4bfbbd18 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -1,22 +1,88 @@ +import json +from pathlib import Path + +from jinja2 import Environment, StrictUndefined + from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledge, ) -from rdagent.components.coder.data_science.model.es import ( - ModelMultiProcessEvolvingStrategy, -) from rdagent.components.coder.data_science.model.eval import ( ModelGeneralCaseSpecEvaluator, ) from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T # from rdagent.utils.agent.tpl import T # T(".prompts:model_generator.user").r() +class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: ModelTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + model_information_str = target_task.get_task_information() + + # 1. query + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[model_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[model_information_str] + if queried_knowledge is not None + else [] + ) + + # 2. code + system_prompt = T(".prompts:model_coder.system").r( + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) + user_prompt = T(".prompts:model_coder.user").r( + model_spec=workspace.code_dict["spec/model.md"], + latest_code=workspace.code_dict.get("model01.py"), + ) + + model_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + + return { + "model01.py": model_code, + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. 
+ """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_code(**code_list[index]) + return evo + + class ModelCoSTEER(CoSTEER): def __init__( self, diff --git a/rdagent/components/coder/data_science/model/es.py b/rdagent/components/coder/data_science/model/es.py deleted file mode 100644 index 4408828df..000000000 --- a/rdagent/components/coder/data_science/model/es.py +++ /dev/null @@ -1,120 +0,0 @@ -import json -from pathlib import Path - -from jinja2 import Environment, StrictUndefined - -from rdagent.components.coder.CoSTEER.evolving_strategy import ( - MultiProcessEvolvingStrategy, -) -from rdagent.components.coder.CoSTEER.knowledge_management import ( - CoSTEERQueriedKnowledge, - CoSTEERQueriedKnowledgeV2, -) -from rdagent.components.coder.data_science.model.exp import ModelTask -from rdagent.core.experiment import FBWorkspace -from rdagent.core.prompts import Prompts -from rdagent.oai.llm_conf import LLM_SETTINGS -from rdagent.oai.llm_utils import APIBackend - -coder_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - - -class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): - def implement_one_task( - self, - target_task: ModelTask, - queried_knowledge: CoSTEERQueriedKnowledge | None = None, - workspace: FBWorkspace | None = None, - ) -> dict[str, str]: - model_information_str = target_task.get_task_information() - - queried_similar_successful_knowledge = ( - queried_knowledge.task_to_similar_task_successful_knowledge[model_information_str] - if queried_knowledge is not None - else [] - ) - queried_former_failed_knowledge = ( - queried_knowledge.task_to_former_failed_traces[model_information_str] - if queried_knowledge is not None - else [] - ) - - queried_former_failed_knowledge_to_render = ( - queried_former_failed_knowledge[0] - if isinstance(queried_knowledge, CoSTEERQueriedKnowledgeV2) - else queried_former_failed_knowledge - ) - - system_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - coder_prompts["model_coder"]["system"], - ) - .render( - # scenario=self.scen.get_scenario_all_desc(filtered_tag=target_task.model_type), - # TODO: fit new scenario information - spec=workspace.code_dict["spec/model.md"], - queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, - ) - ) - - queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge - for _ in range(10): # max attempt to reduce the length of user_prompt - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string( - coder_prompts["model_coder"]["user"], - ) - .render( - model_information_str=model_information_str, - queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render, - queried_former_failed_knowledge=queried_former_failed_knowledge_to_render, - current_code=target_task.base_code, - ) - .strip("\n") - ) - if ( - APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - ) - < LLM_SETTINGS.chat_token_limit - ): - break - elif len(queried_former_failed_knowledge_to_render) > 1: - queried_former_failed_knowledge_to_render = queried_former_failed_knowledge_to_render[1:] - elif len(queried_similar_successful_knowledge_to_render) > 1: - queried_similar_successful_knowledge_to_render = 
queried_similar_successful_knowledge_to_render[1:] - - model_code = json.loads( - # APIBackend(use_chat_cache=CoSTEER_SETTINGS.coder_use_cache).build_messages_and_create_chat_completion( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=True, - ), - )["code"] - return { - "model01.py": model_code, - } - """ - import pandas as pd - def Model(): - pass - """ - - def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): - """ - Assign the code list to the evolving item. - - The code list is aligned with the evolving item's sub-tasks. - If a task is not implemented, put a None in the list. - """ - for index in range(len(evo.sub_tasks)): - if code_list[index] is None: - continue - if evo.sub_workspace_list[index] is None: - # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) - evo.sub_workspace_list[index] = evo.experiment_workspace - evo.sub_workspace_list[index].inject_code(**code_list[index]) - return evo diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 2185e6e05..c3b8c8f37 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -3,9 +3,6 @@ model_coder: You are tasked with implementing PyTorch models based on specific requirements provided by the user. The user’s ultimate goal is to obtain accurate predictions from the model on input data. Follow the instructions below to ensure your response is correct and aligned with the user’s expectations. Instructions for Code Generation: - Specification Compliance: - The user has provided a detailed framework or set of specifications under {{ spec }}. Your code must strictly adhere to this specification, including any required classes, methods, and organizational structure. Do not implement or add anything outside the scope of the provided specification. 
- Leveraging User Inputs: The user may provide various forms of additional information to guide you: @@ -26,39 +23,38 @@ model_coder: { "code": "Your corrected or newly implemented Python code as a single string" } - user: |- - Here is all the relevant information for this task: - - Target Model Details: - {{ model_information_str }} - + + -----------Here is the relevant information for this task----------- {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- - {% for similar_successful_knowledge in queried_similar_successful_knowledge %} - ===== Model {{loop.index}}: ===== + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} - ===== Code: ===== + =====Code:===== {{ similar_successful_knowledge.implementation.code }} {% endfor %} {% endif %} {% if queried_former_failed_knowledge|length != 0 %} --------------Previous Failed Attempts:-------------- - {% for former_failed_knowledge in queried_former_failed_knowledge %} - Attempt {{ loop.index }}: - ===== Code: ===== + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== {{ former_failed_knowledge.implementation.code }} - ===== Feedback: ===== + =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} - {% endif %} + {% endif %} + + user: |- + ---------Model Specification--------- + {{ model_spec }} + + + {% if latest_code %} + ---------Former Specification--------- + Former Code: {{ latest_code }} + You should follow the former code to improve it. + {% endif %} - {% if current_code is not none %} - --------------Latest Code:-------------- - {{ current_code }} - {% else %} - No prior code has been implemented. 
- {% endif %} model_eval: system: |- diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index f3ced2bf4..ba61d53a3 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -6,9 +6,6 @@ from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.data_science.model import ModelCoSTEER -from rdagent.components.coder.data_science.model.es import ( - ModelMultiProcessEvolvingStrategy, -) from rdagent.components.coder.data_science.model.eval import ( ModelGeneralCaseSpecEvaluator, ) diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 8c120a7d9..e590b05e7 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -1,13 +1,82 @@ +import json + from rdagent.components.coder.CoSTEER import CoSTEER from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -from rdagent.components.coder.data_science.workflow.es import ( - WorkflowMultiProcessEvolvingStrategy, +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, ) from rdagent.components.coder.data_science.workflow.eval import ( WorkflowGeneralCaseSpecEvaluator, ) +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + + +class WorkflowMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: WorkflowTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + # competition_info = self.scen.competition_descriptions + workflow_information_str = target_task.get_task_information() + + # 1. query + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[workflow_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[workflow_information_str] + if queried_knowledge is not None + else [] + ) + + # 2. code + system_prompt = T(".prompts:workflow_coder.system").r( + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) + user_prompt = T(".prompts:workflow_coder.user").r( + load_data_code=workspace.code_dict["load_data.py"], + feature_code=workspace.code_dict["feat01.py"], + model_code=workspace.code_dict["model01.py"], + ensemble_code=workspace.code_dict["ens.py"], + latest_code=workspace.code_dict.get("main.py"), + workflow_spec=workspace.code_dict["spec/workflow.md"], + ) + data_loader_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + + return {"main.py": data_loader_code} + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. 
+ + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. + """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_code(**code_list[index]) + return evo class WorkflowCoSTEER(CoSTEER): diff --git a/rdagent/components/coder/data_science/workflow/es.py b/rdagent/components/coder/data_science/workflow/es.py deleted file mode 100644 index 378098cfd..000000000 --- a/rdagent/components/coder/data_science/workflow/es.py +++ /dev/null @@ -1,53 +0,0 @@ -import json - -from rdagent.components.coder.CoSTEER.evolving_strategy import ( - MultiProcessEvolvingStrategy, -) -from rdagent.components.coder.CoSTEER.knowledge_management import ( - CoSTEERQueriedKnowledge, -) -from rdagent.components.coder.data_science.workflow.exp import WorkflowTask -from rdagent.core.experiment import FBWorkspace -from rdagent.oai.llm_utils import APIBackend -from rdagent.utils.agent.tpl import T - - -class WorkflowMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): - def implement_one_task( - self, - target_task: WorkflowTask, - queried_knowledge: CoSTEERQueriedKnowledge | None = None, - workspace: FBWorkspace | None = None, - ) -> dict[str, str]: - # competition_info = self.scen.competition_descriptions - - system_prompt = T(".prompts:workflow_coder.system").r(workflow_spec=workspace.code_dict["spec/workflow.md"]) - user_prompt = T(".prompts:workflow_coder.user").r( - load_data_code=workspace.code_dict["load_data.py"], - feature_code=workspace.code_dict["feat01.py"], - model_code=workspace.code_dict["model01.py"], - ensemble_code=workspace.code_dict["ens.py"], - ) - data_loader_code = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - )["code"] - - return {"main.py": data_loader_code} - - def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): - """ - Assign the code list to the evolving item. - - The code list is aligned with the evolving item's sub-tasks. - If a task is not implemented, put a None in the list. 
- """ - for index in range(len(evo.sub_tasks)): - if code_list[index] is None: - continue - if evo.sub_workspace_list[index] is None: - # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) - evo.sub_workspace_list[index] = evo.experiment_workspace - evo.sub_workspace_list[index].inject_code(**code_list[index]) - return evo diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py index f2934ee6f..0c4124ede 100644 --- a/rdagent/components/coder/data_science/workflow/exp.py +++ b/rdagent/components/coder/data_science/workflow/exp.py @@ -24,3 +24,8 @@ def from_dict(dict): def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" + + def get_task_information(self): + return f"""name: {self.name} +description: {self.description} +""" diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 4937a4bcb..c0214d6a0 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -1,7 +1,8 @@ workflow_coder: system: |- - You are a Python data scientist working on a new Kaggle competition project. - + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + The user has written different Python functions that can load and preprocess data, execute feature engineering, train models, and ensemble them. These Python codes with different functionalities are written separately in different Python files. @@ -9,11 +10,6 @@ workflow_coder: This workflow code is also a Python file, and it functions similarly to a main process that calls the sub-files for each step and ultimately outputs a prediction file. The user will also provide specifications on how to organize the code and give instructions. - These specifications are as follows: - {{ workflow_spec }} - - The dataset provided by load_data is not split into training and testing sets. In the workflow, you should perform this splitting. - By default, use 80% of the data for training and 20% for testing. If the specification requires a different split ratio, cross-validation, or other splitting methods, follow the specification. The code you implement should align with the framework given in the specifications. After predicting the output, print the shape and other information of the output to stdout to help the evaluator assess the code. @@ -23,7 +19,30 @@ workflow_coder: "code": "The Python code as a string." 
} + -----------Here is the relevant information for this task----------- + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.code }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.code }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + user: |- + ---------Workflow Specification--------- + {{ workflow_spec }} + ---------load data code--------- file: load_data.py {{ load_data_code }} @@ -41,6 +60,12 @@ workflow_coder: file: ens.py {{ ensemble_code }} + {% if latest_code %} + ---------Former Specification--------- + Former Code: {{ latest_code }} + You should follow the former code to improve it. + {% endif %} + workflow_eval: system: |- You are a data scientist. diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index d6e255c1d..c8e6eb3f6 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -6,9 +6,6 @@ from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER -from rdagent.components.coder.data_science.workflow.es import ( - WorkflowMultiProcessEvolvingStrategy, -) from rdagent.components.coder.data_science.workflow.eval import ( WorkflowGeneralCaseSpecEvaluator, ) From 64979576b19568f75a2892e67058a1e03ad1003e Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 07:01:27 +0000 Subject: [PATCH 106/304] llm_debug info(for show) using pickle instead of json --- rdagent/log/logger.py | 13 +++++++------ rdagent/log/ui/llm_st.py | 5 +++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py index ec194f463..87b045aef 100644 --- a/rdagent/log/logger.py +++ b/rdagent/log/logger.py @@ -1,6 +1,7 @@ import json import os import sys +import pickle from contextlib import contextmanager from datetime import datetime, timezone from functools import partial @@ -115,17 +116,17 @@ def log_object(self, obj: object, *, tag: str = "") -> None: tag = f"{self._tag}.{tag}.{self.get_pids()}".strip(".") if "debug_" in tag: - debug_log_path = self.log_trace_path / "debug_llm.json" + debug_log_path = self.log_trace_path / "debug_llm.pkl" debug_data = {"tag": tag, "obj": obj} if debug_log_path.exists(): - with debug_log_path.open("r+", encoding="utf-8") as f: - existing_data = json.load(f) + with debug_log_path.open("rb+") as f: + existing_data = pickle.load(f) existing_data.append(debug_data) f.seek(0) - json.dump(existing_data, f, ensure_ascii=False, indent=4) + pickle.dump(existing_data, f) else: - with debug_log_path.open("w", encoding="utf-8") as f: - json.dump([debug_data], f, ensure_ascii=False, indent=4) + with debug_log_path.open("wb") as f: + pickle.dump([debug_data], f) return logp = self.storage.log(obj, name=tag, save_type="pkl") diff --git a/rdagent/log/ui/llm_st.py 
b/rdagent/log/ui/llm_st.py index 17dc8abf9..c12b965dc 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -1,5 +1,6 @@ import argparse import json +import pickle import re from pathlib import Path @@ -30,8 +31,8 @@ def load_data(): try: - with open(f"{main_log_path}/{session_state.log_path}/debug_llm.json", "r") as f: - session_state.data = json.load(f) + with open(f"{main_log_path}/{session_state.log_path}/debug_llm.pkl", "r") as f: + session_state.data = pickle.load(f) except Exception as e: session_state.data = [{"error": str(e)}] From 3920a5cd7c6500d48a206647a36e901eb01a1579 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 23 Dec 2024 08:16:23 +0000 Subject: [PATCH 107/304] remove NextLoopException --- rdagent/app/data_science/loop.py | 37 ++++++++++---------------- rdagent/components/workflow/rd_loop.py | 6 ----- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 2d53e6163..11316f6c6 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -15,7 +15,7 @@ from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.components.workflow.conf import BasePropSetting -from rdagent.components.workflow.rd_loop import NextLoopException, RDLoop +from rdagent.components.workflow.rd_loop import RDLoop from rdagent.core.exception import FactorEmptyError, ModelEmptyError from rdagent.core.proposal import ( Experiment2Feedback, @@ -36,7 +36,6 @@ class DataScienceRDLoop(RDLoop): - skip_loop_error = (NextLoopException,) def __init__(self, PROP_SETTING: BasePropSetting): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) @@ -92,31 +91,23 @@ def coding(self, prev_out: dict[str, Any]): return exp def running(self, prev_out: dict[str, Any]): - if not self.trace.all_components_completed(): - raise NextLoopException("Not all 5 components are completed, skip running of DataScienceRDLoop.") - exp = self.runner.develop(prev_out["coding"]) + if self.trace.all_components_completed(): + exp = self.runner.develop(prev_out["coding"]) return exp def feedback(self, prev_out: dict[str, Any]): - if not self.trace.all_components_completed(): - self.trace.hist.append( - ( - prev_out["direct_exp_gen"].hypothesis, - prev_out["coding"], - HypothesisFeedback( - observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", - hypothesis_evaluation="", - new_hypothesis="", - reason="", - decision=True, - ), - ) + if self.trace.all_components_completed(): + feedback = self.summarizer.generate_feedback( + prev_out["running"], prev_out["direct_exp_gen"].hypothesis, self.trace + ) + else: + feedback = HypothesisFeedback( + observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", + hypothesis_evaluation="", + new_hypothesis="", + reason="", + decision=True, ) - raise NextLoopException("Not all 5 components are completed, skip feedback of DataScienceRDLoop.") - - feedback = self.summarizer.generate_feedback( - prev_out["running"], prev_out["direct_exp_gen"].hypothesis, self.trace - ) self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py index bde21e349..d1b3dcb9f 100644 --- a/rdagent/components/workflow/rd_loop.py +++ b/rdagent/components/workflow/rd_loop.py @@ -20,12 +20,6 @@ from 
rdagent.utils.workflow import LoopBase, LoopMeta -class NextLoopException(Exception): - """TODO: should we place in in rdagent/core/exception.py?""" - - pass - - class RDLoop(LoopBase, metaclass=LoopMeta): def __init__(self, PROP_SETTING: BasePropSetting): From e8a85a6069695dc02d5ad7775ab657c5a80db605 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 08:19:10 +0000 Subject: [PATCH 108/304] loop change --- rdagent/app/data_science/loop.py | 27 ++++++++----------- .../coder/CoSTEER/evolving_agent.py | 3 ++- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 2d53e6163..19fffed7c 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -92,31 +92,26 @@ def coding(self, prev_out: dict[str, Any]): return exp def running(self, prev_out: dict[str, Any]): - if not self.trace.all_components_completed(): - raise NextLoopException("Not all 5 components are completed, skip running of DataScienceRDLoop.") - exp = self.runner.develop(prev_out["coding"]) + if self.trace.all_components_completed(): + exp = self.runner.develop(prev_out["coding"]) + else: + exp = prev_out["coding"] return exp def feedback(self, prev_out: dict[str, Any]): - if not self.trace.all_components_completed(): - self.trace.hist.append( - ( - prev_out["direct_exp_gen"].hypothesis, - prev_out["coding"], - HypothesisFeedback( + if self.trace.all_components_completed(): + feedback = self.summarizer.generate_feedback( + prev_out["running"], prev_out["direct_exp_gen"].hypothesis, self.trace + ) + else: + feedback = HypothesisFeedback( observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", hypothesis_evaluation="", new_hypothesis="", reason="", decision=True, - ), - ) - ) - raise NextLoopException("Not all 5 components are completed, skip feedback of DataScienceRDLoop.") + ) - feedback = self.summarizer.generate_feedback( - prev_out["running"], prev_out["direct_exp_gen"].hypothesis, self.trace - ) self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index ece9f85c3..560d15694 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -2,7 +2,7 @@ from rdagent.components.coder.CoSTEER.evolvable_subjects import EvolvingItem from rdagent.core.evolving_agent import RAGEvoAgent from rdagent.core.evolving_framework import EvolvableSubjects - +from rdagent.core.exception import CoderError class FilterFailedRAGEvoAgent(RAGEvoAgent): def filter_evolvable_subjects_by_feedback( @@ -15,4 +15,5 @@ def filter_evolvable_subjects_by_feedback( for index in range(len(evo.sub_workspace_list)): if evo.sub_workspace_list[index] is not None and feedback[index] and not feedback[index].final_decision: evo.sub_workspace_list[index].clear() + raise CoderError("Having a failed sub task") return evo From 58451736a50408a73233679ce5d3af3d66c50a1b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 08:30:09 +0000 Subject: [PATCH 109/304] coder raise CoderError when all sub_tasks failed --- rdagent/components/coder/CoSTEER/evolving_agent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index 560d15694..dcfeed78b 100644 --- 
a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -15,5 +15,8 @@ def filter_evolvable_subjects_by_feedback( for index in range(len(evo.sub_workspace_list)): if evo.sub_workspace_list[index] is not None and feedback[index] and not feedback[index].final_decision: evo.sub_workspace_list[index].clear() - raise CoderError("Having a failed sub task") + + if all(not f.final_decision for f in feedback if f): + raise CoderError("All feedbacks of sub tasks are negative.") + return evo From 3db73f0664b492a143157e97e4b53f573dba51c4 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 09:58:53 +0000 Subject: [PATCH 110/304] rename code_dict to file_dict in FBWorkspace --- .../coder/data_science/ensemble/__init__.py | 4 ++-- .../coder/data_science/ensemble/eval.py | 2 +- .../coder/data_science/ensemble/test.py | 2 +- .../coder/data_science/feature/__init__.py | 6 +++--- .../coder/data_science/feature/eval.py | 2 +- .../coder/data_science/feature/test.py | 2 +- .../coder/data_science/model/__init__.py | 6 +++--- .../components/coder/data_science/model/eval.py | 6 +++--- .../components/coder/data_science/model/test.py | 6 +++--- .../data_science/raw_data_loader/__init__.py | 14 +++++++------- .../coder/data_science/raw_data_loader/eval.py | 4 ++-- .../coder/data_science/workflow/__init__.py | 14 +++++++------- .../coder/data_science/workflow/eval.py | 4 ++-- .../coder/data_science/workflow/test.py | 6 +++--- .../coder/factor_coder/evolving_strategy.py | 2 +- rdagent/components/coder/factor_coder/factor.py | 6 +++--- .../coder/model_coder/evolving_strategy.py | 2 +- rdagent/components/coder/model_coder/model.py | 4 ++-- .../coder/model_coder/one_shot/__init__.py | 2 +- rdagent/components/loader/task_loader.py | 2 +- rdagent/core/experiment.py | 17 ++++++++--------- rdagent/log/ui/app.py | 4 ++-- rdagent/log/ui/web.py | 2 +- .../scenarios/data_mining/developer/feedback.py | 2 +- .../data_mining/developer/model_runner.py | 4 ++-- rdagent/scenarios/data_science/dev/feedback.py | 2 +- .../scenarios/data_science/proposal/exp_gen.py | 2 +- rdagent/scenarios/kaggle/developer/coder.py | 2 +- rdagent/scenarios/kaggle/developer/feedback.py | 2 +- rdagent/scenarios/kaggle/developer/runner.py | 8 ++++---- .../kaggle/experiment/kaggle_experiment.py | 4 ++-- .../scenarios/kaggle/experiment/workspace.py | 2 +- rdagent/scenarios/kaggle/proposal/proposal.py | 2 +- rdagent/scenarios/qlib/developer/feedback.py | 2 +- .../scenarios/qlib/developer/model_runner.py | 4 ++-- .../factor_experiment_loader/json_loader.py | 2 +- rdagent/scenarios/qlib/prompts.yaml | 4 ++-- 37 files changed, 80 insertions(+), 81 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index d4ad7ae42..5f225be17 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -61,7 +61,7 @@ def implement_one_task( queried_former_failed_knowledge[0] if queried_former_failed_knowledge else None ), ) - user_prompt = T(".prompts:ensemble_coder.user").r(ensemble_spec=workspace.code_dict["spec/ensemble.md"]) + user_prompt = T(".prompts:ensemble_coder.user").r(ensemble_spec=workspace.file_dict["spec/ensemble.md"]) ensemble_code = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -86,7 +86,7 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): if evo.sub_workspace_list[index] is 
None: # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace - evo.sub_workspace_list[index].inject_code(**code_list[index]) + evo.sub_workspace_list[index].inject_files(**code_list[index]) return evo diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index 4ef2f87c1..b9121767b 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -49,7 +49,7 @@ def evaluate( fname = "ensemble_test.py" with (DIRNAME / "eval_tests" / "ensemble_test.py").open("r") as f: test_code = f.read() - implementation.inject_code(**{fname: test_code}) + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:ensemble_eval.system").r(test_code=test_code) diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py index 9986aa978..4fa672932 100644 --- a/rdagent/components/coder/data_science/ensemble/test.py +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -47,7 +47,7 @@ def develop_ensemble(): exp = EnsembleExperiment(sub_tasks=[task]) # Injecting the corresponding specification - exp.experiment_workspace.inject_code(**{"spec/ensemble.md": ensemble_spec}) + exp.experiment_workspace.inject_files(**{"spec/ensemble.md": ensemble_spec}) # Develop the experiment exp = ensemble_coder.develop(exp) diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index a4f2d9a3a..65d0ebb43 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -46,8 +46,8 @@ def implement_one_task( queried_former_failed_knowledge=queried_former_failed_knowledge[0], ) user_prompt = T(".prompts:feature.user").r( - feature_spec=workspace.code_dict["spec/feature.md"], - latest_code=workspace.code_dict.get("feat01.py"), + feature_spec=workspace.file_dict["spec/feature.md"], + latest_code=workspace.file_dict.get("feat01.py"), ) feature_code = json.loads( @@ -73,7 +73,7 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): if evo.sub_workspace_list[index] is None: # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace - evo.sub_workspace_list[index].inject_code(**code_list[index]) + evo.sub_workspace_list[index].inject_files(**code_list[index]) return evo diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index bff592a31..9f52516cf 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -50,7 +50,7 @@ def evaluate( fname = "feature_test.py" with (DIRNAME / "eval_tests" / "feature_test.py").open("r") as f: test_code = f.read() - implementation.inject_code(**{fname: test_code}) + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:feature_eval.system").r(test_code=test_code) diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index c1f3e50b7..96cd5e590 100644 --- 
a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -27,7 +27,7 @@ def develop_one_competition(competition: str): # -> experiment with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r") as file: load_data_code = file.read() - exp.experiment_workspace.inject_code(**{"load_data.py": load_data_code, "spec/feature.md": feat_spec}) + exp.experiment_workspace.inject_files(**{"load_data.py": load_data_code, "spec/feature.md": feat_spec}) # Develop the experiment exp = feature_coder.develop(exp) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index f4bfbbd18..5615abed0 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -52,8 +52,8 @@ def implement_one_task( queried_former_failed_knowledge=queried_former_failed_knowledge[0], ) user_prompt = T(".prompts:model_coder.user").r( - model_spec=workspace.code_dict["spec/model.md"], - latest_code=workspace.code_dict.get("model01.py"), + model_spec=workspace.file_dict["spec/model.md"], + latest_code=workspace.file_dict.get("model01.py"), ) model_code = json.loads( @@ -79,7 +79,7 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): if evo.sub_workspace_list[index] is None: # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace - evo.sub_workspace_list[index].inject_code(**code_list[index]) + evo.sub_workspace_list[index].inject_files(**code_list[index]) return evo diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index b133d28db..7f542d9e3 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -68,14 +68,14 @@ def evaluate( fname = "model_execute.py" with (DIRNAME / "eval_tests" / "model_execute.py").open("r") as f: test_code = f.read() - implementation.inject_code(**{fname: test_code}) + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:model_eval.system").r( - test_code=test_code, scenario="No scenario information yet.", spec=implementation.code_dict["spec/model.md"] + test_code=test_code, scenario="No scenario information yet.", spec=implementation.file_dict["spec/model.md"] ) user_prompt = T(".prompts:model_eval.user").r( stdout=stdout, - code=implementation.code_dict["model01.py"], + code=implementation.file_dict["model01.py"], ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return ModelSingleFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index ba61d53a3..829e8b3d5 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -36,9 +36,9 @@ def develop_one_competition(competition: str): modelexp = FBWorkspace() for file_name in injected_file_names: file_path = tpl_ex_path / file_name - modelexp.inject_code(**{file_name: file_path.read_text()}) + modelexp.inject_files(**{file_name: file_path.read_text()}) - mt.base_code += modelexp.code_dict["model01.py"] + mt.base_code += modelexp.file_dict["model01.py"] exp = DSExperiment( sub_tasks=[mt], 
) @@ -56,7 +56,7 @@ def develop_one_competition(competition: str): # Run the experiment for file_name in injected_file_names: file_path = tpl_ex_path / file_name - exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) + exp.experiment_workspace.inject_files(**{file_name: file_path.read_text()}) exp = model_coder.develop(exp) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index d19a8adf4..d11fbc691 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -75,12 +75,12 @@ def implement_one_task( queried_former_failed_knowledge=queried_former_failed_knowledge[0], ) data_loader_prompt = T(".prompts:spec.user.data_loader").r( - latest_spec=workspace.code_dict.get("spec/data_loader.md") + latest_spec=workspace.file_dict.get("spec/data_loader.md") ) - feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.code_dict.get("spec/feature.md")) - model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.code_dict.get("spec/model.md")) - ensemble_prompt = T(".prompts:spec.user.ensemble").r(latest_spec=workspace.code_dict.get("spec/ensemble.md")) - workflow_prompt = T(".prompts:spec.user.workflow").r(latest_spec=workspace.code_dict.get("spec/workflow.md")) + feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.file_dict.get("spec/feature.md")) + model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.file_dict.get("spec/model.md")) + ensemble_prompt = T(".prompts:spec.user.ensemble").r(latest_spec=workspace.file_dict.get("spec/ensemble.md")) + workflow_prompt = T(".prompts:spec.user.workflow").r(latest_spec=workspace.file_dict.get("spec/workflow.md")) spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) @@ -106,7 +106,7 @@ def implement_one_task( user_prompt = T(".prompts:data_loader_coder.user").r( competition_info=competition_info, data_loader_spec=data_loader_spec, - latest_code=workspace.code_dict.get("load_data.py"), + latest_code=workspace.file_dict.get("load_data.py"), ) data_loader_code = json.loads( @@ -137,7 +137,7 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): if evo.sub_workspace_list[index] is None: # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace - evo.sub_workspace_list[index].inject_code(**code_list[index]) + evo.sub_workspace_list[index].inject_files(**code_list[index]) return evo diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index cdb3ba6cf..c0ec526d3 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -55,11 +55,11 @@ def evaluate( fname = "data_loader_test.py" with (DIRNAME / "eval_tests" / "data_loader_test.py").open("r") as f: test_code = f.read() - implementation.inject_code(**{fname: test_code}) + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:data_loader_eval.system").r( - test_code=test_code, code=implementation.code_dict["load_data.py"] + test_code=test_code, code=implementation.file_dict["load_data.py"] ) user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout) diff 
--git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index e590b05e7..6d78b480f 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -47,12 +47,12 @@ def implement_one_task( queried_former_failed_knowledge=queried_former_failed_knowledge[0], ) user_prompt = T(".prompts:workflow_coder.user").r( - load_data_code=workspace.code_dict["load_data.py"], - feature_code=workspace.code_dict["feat01.py"], - model_code=workspace.code_dict["model01.py"], - ensemble_code=workspace.code_dict["ens.py"], - latest_code=workspace.code_dict.get("main.py"), - workflow_spec=workspace.code_dict["spec/workflow.md"], + load_data_code=workspace.file_dict["load_data.py"], + feature_code=workspace.file_dict["feat01.py"], + model_code=workspace.file_dict["model01.py"], + ensemble_code=workspace.file_dict["ens.py"], + latest_code=workspace.file_dict.get("main.py"), + workflow_spec=workspace.file_dict["spec/workflow.md"], ) data_loader_code = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -75,7 +75,7 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): if evo.sub_workspace_list[index] is None: # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) evo.sub_workspace_list[index] = evo.experiment_workspace - evo.sub_workspace_list[index].inject_code(**code_list[index]) + evo.sub_workspace_list[index].inject_files(**code_list[index]) return evo diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index dc85ff471..63d9b8148 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -56,11 +56,11 @@ def evaluate( fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:workflow_eval.system").r( - scenario="No scenario information yet.", spec=implementation.code_dict["spec/workflow.md"] + scenario="No scenario information yet.", spec=implementation.file_dict["spec/workflow.md"] ) user_prompt = T(".prompts:workflow_eval.user").r( stdout=stdout, - code=implementation.code_dict["main.py"], + code=implementation.file_dict["main.py"], ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return WorkflowSingleFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index c8e6eb3f6..5e9e7a1b4 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -31,9 +31,9 @@ def develop_one_competition(competition: str): workflowexp = FBWorkspace() for file_name in injected_file_names: file_path = tpl_ex_path / file_name - workflowexp.inject_code(**{file_name: file_path.read_text()}) + workflowexp.inject_files(**{file_name: file_path.read_text()}) - wt.base_code += workflowexp.code_dict["main.py"] + wt.base_code += workflowexp.file_dict["main.py"] exp = DSExperiment( sub_tasks=[wt], ) @@ -49,7 +49,7 @@ def develop_one_competition(competition: str): # Run the experiment for file_name in injected_file_names: file_path = tpl_ex_path / file_name - exp.experiment_workspace.inject_code(**{file_name: file_path.read_text()}) + exp.experiment_workspace.inject_files(**{file_name: file_path.read_text()}) exp = 
workflow_coder.develop(exp) diff --git a/rdagent/components/coder/factor_coder/evolving_strategy.py b/rdagent/components/coder/factor_coder/evolving_strategy.py index 0496d83d9..5ddc938d2 100644 --- a/rdagent/components/coder/factor_coder/evolving_strategy.py +++ b/rdagent/components/coder/factor_coder/evolving_strategy.py @@ -181,5 +181,5 @@ def assign_code_list_to_evo(self, code_list, evo): continue if evo.sub_workspace_list[index] is None: evo.sub_workspace_list[index] = FactorFBWorkspace(target_task=evo.sub_tasks[index]) - evo.sub_workspace_list[index].inject_code(**{"factor.py": code_list[index]}) + evo.sub_workspace_list[index].inject_files(**{"factor.py": code_list[index]}) return evo diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index df41a2937..502901c90 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -92,8 +92,8 @@ def __init__( def hash_func(self, data_type: str = "Debug") -> str: return ( - md5_hash(data_type + self.code_dict["factor.py"]) - if ("factor.py" in self.code_dict and not self.raise_exception) + md5_hash(data_type + self.file_dict["factor.py"]) + if ("factor.py" in self.file_dict and not self.raise_exception) else None ) @@ -118,7 +118,7 @@ def execute(self, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]: """ super().execute() - if self.code_dict is None or "factor.py" not in self.code_dict: + if self.file_dict is None or "factor.py" not in self.file_dict: if self.raise_exception: raise CodeFormatError(self.FB_CODE_NOT_SET) else: diff --git a/rdagent/components/coder/model_coder/evolving_strategy.py b/rdagent/components/coder/model_coder/evolving_strategy.py index 8aa5bca1c..83b7afa3e 100644 --- a/rdagent/components/coder/model_coder/evolving_strategy.py +++ b/rdagent/components/coder/model_coder/evolving_strategy.py @@ -104,5 +104,5 @@ def assign_code_list_to_evo(self, code_list, evo): continue if evo.sub_workspace_list[index] is None: evo.sub_workspace_list[index] = ModelFBWorkspace(target_task=evo.sub_tasks[index]) - evo.sub_workspace_list[index].inject_code(**{"model.py": code_list[index]}) + evo.sub_workspace_list[index].inject_files(**{"model.py": code_list[index]}) return evo diff --git a/rdagent/components/coder/model_coder/model.py b/rdagent/components/coder/model_coder/model.py index ce842459f..e19eeed36 100644 --- a/rdagent/components/coder/model_coder/model.py +++ b/rdagent/components/coder/model_coder/model.py @@ -83,8 +83,8 @@ def hash_func( param_init_value: float = 1.0, ) -> str: target_file_name = f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}" - for code_file_name in sorted(list(self.code_dict.keys())): - target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}" + for code_file_name in sorted(list(self.file_dict.keys())): + target_file_name = f"{target_file_name}_{self.file_dict[code_file_name]}" return md5_hash(target_file_name) @cache_with_pickle(hash_func) diff --git a/rdagent/components/coder/model_coder/one_shot/__init__.py b/rdagent/components/coder/model_coder/one_shot/__init__.py index 7f7fa83e6..29c9ff921 100644 --- a/rdagent/components/coder/model_coder/one_shot/__init__.py +++ b/rdagent/components/coder/model_coder/one_shot/__init__.py @@ -35,7 +35,7 @@ def develop(self, exp: ModelExperiment) -> ModelExperiment: # Extract the code part from the response match = re.search(r".*```[Pp]ython\n(.*)\n```.*", resp, re.DOTALL) code = match.group(1) - 
mti.inject_code(**{"model.py": code}) + mti.inject_files(**{"model.py": code}) mti_l.append(mti) exp.sub_workspace_list = mti_l return exp diff --git a/rdagent/components/loader/task_loader.py b/rdagent/components/loader/task_loader.py index a3b344504..3ef807854 100644 --- a/rdagent/components/loader/task_loader.py +++ b/rdagent/components/loader/task_loader.py @@ -90,5 +90,5 @@ def load(self, task: ModelTask) -> ModelFBWorkspace: mti.prepare() with open(self.path / f"{task.name}.py", "r") as f: code = f.read() - mti.inject_code(**{"model.py": code}) + mti.inject_files(**{"model.py": code}) return mti diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index e65ec5999..029a9efc8 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -99,15 +99,14 @@ class FBWorkspace(Workspace): def run_pipeline(self, **files: str): self.prepare() - self.inject_code(**files) + self.inject_files(**files) self.execute() """ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - # TODO: rename it to file_dict; inject_code -> inject_files - self.code_dict: dict[str, Any] = ( + self.file_dict: dict[str, Any] = ( {} ) # The code injected into the folder, store them in the variable to reproduce the former result self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex @@ -115,7 +114,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: @property def code(self) -> str: code_string = "" - for file_name, code in self.code_dict.items(): + for file_name, code in self.file_dict.items(): code_string += f"File: {file_name}\n{code}\n" return code_string @@ -142,7 +141,7 @@ def link_all_files_in_folder_to_workspace(data_path: Path, workspace_path: Path) if platform.system() == "Windows": os.link(data_file_path, workspace_data_file_path) - def inject_code(self, **files: str) -> None: + def inject_files(self, **files: str) -> None: """ Inject the code into the folder. 
{ @@ -151,7 +150,7 @@ def inject_code(self, **files: str) -> None: """ self.prepare() for k, v in files.items(): - self.code_dict[k] = v + self.file_dict[k] = v target_file_path = self.workspace_path / k target_file_path.parent.mkdir(parents=True, exist_ok=True) target_file_path.write_text(v) @@ -172,7 +171,7 @@ def inject_code_from_folder(self, folder_path: Path) -> None: for file_path in folder_path.rglob("*"): if file_path.suffix in (".py", ".yaml", ".md"): relative_path = file_path.relative_to(folder_path) - self.inject_code(**{str(relative_path): file_path.read_text()}) + self.inject_files(**{str(relative_path): file_path.read_text()}) def copy(self) -> FBWorkspace: """ @@ -185,14 +184,14 @@ def clear(self) -> None: Clear the workspace """ shutil.rmtree(self.workspace_path, ignore_errors=True) - self.code_dict = {} + self.file_dict = {} def execute(self, env: Env | None = None, entry: str | None = None) -> object | None: """ Before each execution, make sure to prepare and inject code """ self.prepare() - self.inject_code(**self.code_dict) + self.inject_files(**self.file_dict) # TODO: env should be not None in new design (no code can run without environment) if env is not None and entry is not None: return env.run(entry, self.workspace_path) diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py index 7e47869d3..d71b30efc 100644 --- a/rdagent/log/ui/app.py +++ b/rdagent/log/ui/app.py @@ -441,7 +441,7 @@ def summary_window(): for j, w in enumerate(ws): with wtabs[j]: # Evolving Code - for k, v in w.code_dict.items(): + for k, v in w.file_dict.items(): with st.expander(f":green[`{k}`]", expanded=False): st.code(v, language="python") @@ -636,7 +636,7 @@ def evolving_window(): with wtabs[j]: # Evolving Code st.markdown(f"**Workspace Path**: {w.workspace_path}") - for k, v in w.code_dict.items(): + for k, v in w.file_dict.items(): with st.expander(f":green[`{k}`]", expanded=True): st.code(v, language="python") diff --git a/rdagent/log/ui/web.py b/rdagent/log/ui/web.py index eb4862b44..452bfab0e 100644 --- a/rdagent/log/ui/web.py +++ b/rdagent/log/ui/web.py @@ -285,7 +285,7 @@ def consume_msg(self, msg: Message | FactorFBWorkspace | ModelFBWorkspace): ModelTaskWindow(self.container.container()).consume_msg(task_msg) # task codes - for k, v in ws.code_dict.items(): + for k, v in ws.file_dict.items(): self.container.markdown(f"`{k}`") self.container.code(v, language="python") diff --git a/rdagent/scenarios/data_mining/developer/feedback.py b/rdagent/scenarios/data_mining/developer/feedback.py index d87862271..271efd16c 100644 --- a/rdagent/scenarios/data_mining/developer/feedback.py +++ b/rdagent/scenarios/data_mining/developer/feedback.py @@ -47,7 +47,7 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback context=context, last_hypothesis=SOTA_hypothesis, last_task=SOTA_experiment.sub_tasks[0].get_task_information() if SOTA_hypothesis else None, - last_code=SOTA_experiment.sub_workspace_list[0].code_dict.get("model.py") if SOTA_hypothesis else None, + last_code=SOTA_experiment.sub_workspace_list[0].file_dict.get("model.py") if SOTA_hypothesis else None, last_result=SOTA_experiment.result if SOTA_hypothesis else None, hypothesis=hypothesis, exp=exp, diff --git a/rdagent/scenarios/data_mining/developer/model_runner.py b/rdagent/scenarios/data_mining/developer/model_runner.py index 2d04149a6..5beedcef8 100644 --- a/rdagent/scenarios/data_mining/developer/model_runner.py +++ b/rdagent/scenarios/data_mining/developer/model_runner.py @@ -7,10 +7,10 @@ class 
DMModelRunner(CachedRunner[DMModelExperiment]): @cache_with_pickle(CachedRunner.get_cache_key, CachedRunner.assign_cached_result) def develop(self, exp: DMModelExperiment) -> DMModelExperiment: - if exp.sub_workspace_list[0].code_dict.get("model.py") is None: + if exp.sub_workspace_list[0].file_dict.get("model.py") is None: raise ModelEmptyError("model.py is empty") # to replace & inject code - exp.experiment_workspace.inject_code(**{"model.py": exp.sub_workspace_list[0].code_dict["model.py"]}) + exp.experiment_workspace.inject_files(**{"model.py": exp.sub_workspace_list[0].file_dict["model.py"]}) env_to_use = {"PYTHONPATH": "./"} diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index e19247ccb..9905d7528 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -27,7 +27,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeed modified_file_name = "ensemble.py" elif hypothesis.component == "Workflow": modified_file_name = "main.py" - modified_code = exp.experiment_workspace.code_dict[modified_file_name] + modified_code = exp.experiment_workspace.file_dict[modified_file_name] sota_hypothesis, sota_exp = trace.get_sota_hypothesis_and_experiment() diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index c7e262424..630bbced5 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -286,7 +286,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng") if last_model_exp := trace.get_sota_hypothesis_and_experiment("Model"): # TODO: model only have one (named "model.py")? 
- base_code = last_model_exp.experiment_workspace.code_dict["model.py"] + base_code = last_model_exp.experiment_workspace.file_dict["model.py"] else: base_code = "" mt = ModelTask( diff --git a/rdagent/scenarios/kaggle/developer/coder.py b/rdagent/scenarios/kaggle/developer/coder.py index 71c390bbb..a04054c57 100644 --- a/rdagent/scenarios/kaggle/developer/coder.py +++ b/rdagent/scenarios/kaggle/developer/coder.py @@ -68,5 +68,5 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment: .from_string(DEFAULT_SELECTION_CODE) .render(feature_index_list=chosen_index_to_list_index) ) - exp.experiment_workspace.inject_code(**{KG_SELECT_MAPPING[target_model_type]: code}) + exp.experiment_workspace.inject_files(**{KG_SELECT_MAPPING[target_model_type]: code}) return exp diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index 7346cff8e..d97ff9820 100644 --- a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -106,7 +106,7 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback if hypothesis.action == "Model tuning": current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code elif hypothesis.action == "Model feature selection": - current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[ + current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.file_dict[ KG_SELECT_MAPPING[exp.sub_tasks[0].model_type] ] else: diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py index 51890086b..9d407e02c 100644 --- a/rdagent/scenarios/kaggle/developer/runner.py +++ b/rdagent/scenarios/kaggle/developer/runner.py @@ -73,11 +73,11 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment: # TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list. 
model_type = sub_ws.target_task.model_type - if sub_ws.code_dict == {}: + if sub_ws.file_dict == {}: raise ModelEmptyError("No model is implemented.") else: model_file_name = f"model/model_{model_type.lower()}.py" - exp.experiment_workspace.inject_code(**{model_file_name: sub_ws.code_dict["model.py"]}) + exp.experiment_workspace.inject_files(**{model_file_name: sub_ws.file_dict["model.py"]}) else: raise ModelEmptyError("No model is implemented.") env_to_use = {"PYTHONPATH": "./"} @@ -102,14 +102,14 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment: current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py"))) implemented_factor_count = 0 for sub_ws in exp.sub_workspace_list: - if sub_ws.code_dict == {}: + if sub_ws.file_dict == {}: continue execued_df = sub_ws.execute()[1] if execued_df is None: continue implemented_factor_count += 1 target_feature_file_name = f"feature/feature_{current_feature_file_count:05d}.py" - exp.experiment_workspace.inject_code(**{target_feature_file_name: sub_ws.code_dict["factor.py"]}) + exp.experiment_workspace.inject_files(**{target_feature_file_name: sub_ws.file_dict["factor.py"]}) feature_shape = execued_df.shape[-1] exp.experiment_workspace.data_description.append((sub_ws.target_task.get_task_information(), feature_shape)) current_feature_file_count += 1 diff --git a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py index 24975d499..207988579 100644 --- a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py +++ b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py @@ -43,7 +43,7 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: / KAGGLE_IMPLEMENT_SETTING.competition ) if len(self.based_experiments) > 0: - self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + self.experiment_workspace.inject_files(**self.based_experiments[-1].experiment_workspace.file_dict) self.experiment_workspace.data_description = deepcopy( self.based_experiments[-1].experiment_workspace.data_description ) @@ -69,7 +69,7 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: / KAGGLE_IMPLEMENT_SETTING.competition ) if len(self.based_experiments) > 0: - self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + self.experiment_workspace.inject_files(**self.based_experiments[-1].experiment_workspace.file_dict) self.experiment_workspace.data_description = deepcopy( self.based_experiments[-1].experiment_workspace.data_description ) diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 1bfaaf866..e8ffc084b 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -34,7 +34,7 @@ def __init__(self, template_folder_path: Path, *args, **kwargs) -> None: @property def model_description(self) -> dict[str, str]: model_description = {} - for k, v in self.code_dict.items(): + for k, v in self.file_dict.items(): if k.startswith("model/"): model_description[k] = v return model_description diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index da6752777..888c57af2 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -404,7 +404,7 @@ def convert_model_experiment(self, response: str, 
hypothesis: Hypothesis, trace: ] model_type = response_dict.get("model_type", "Model type not provided") if model_type in KG_MODEL_MAPPING: - base_code = based_experiments[-1].experiment_workspace.code_dict.get(KG_MODEL_MAPPING[model_type], None) + base_code = based_experiments[-1].experiment_workspace.file_dict.get(KG_MODEL_MAPPING[model_type], None) else: base_code = None diff --git a/rdagent/scenarios/qlib/developer/feedback.py b/rdagent/scenarios/qlib/developer/feedback.py index 8d014fe63..1a80cfe6c 100644 --- a/rdagent/scenarios/qlib/developer/feedback.py +++ b/rdagent/scenarios/qlib/developer/feedback.py @@ -147,7 +147,7 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback context=context, last_hypothesis=SOTA_hypothesis, last_task=SOTA_experiment.sub_tasks[0].get_task_information() if SOTA_hypothesis else None, - last_code=SOTA_experiment.sub_workspace_list[0].code_dict.get("model.py") if SOTA_hypothesis else None, + last_code=SOTA_experiment.sub_workspace_list[0].file_dict.get("model.py") if SOTA_hypothesis else None, last_result=SOTA_experiment.result if SOTA_hypothesis else None, hypothesis=hypothesis, exp=exp, diff --git a/rdagent/scenarios/qlib/developer/model_runner.py b/rdagent/scenarios/qlib/developer/model_runner.py index ed83ac2f3..4a7196bf7 100644 --- a/rdagent/scenarios/qlib/developer/model_runner.py +++ b/rdagent/scenarios/qlib/developer/model_runner.py @@ -19,10 +19,10 @@ class QlibModelRunner(CachedRunner[QlibModelExperiment]): @cache_with_pickle(CachedRunner.get_cache_key, CachedRunner.assign_cached_result) def develop(self, exp: QlibModelExperiment) -> QlibModelExperiment: - if exp.sub_workspace_list[0].code_dict.get("model.py") is None: + if exp.sub_workspace_list[0].file_dict.get("model.py") is None: raise ModelEmptyError("model.py is empty") # to replace & inject code - exp.experiment_workspace.inject_code(**{"model.py": exp.sub_workspace_list[0].code_dict["model.py"]}) + exp.experiment_workspace.inject_files(**{"model.py": exp.sub_workspace_list[0].file_dict["model.py"]}) env_to_use = {"PYTHONPATH": "./"} diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py index 19e19b5bf..b6c97d7f7 100644 --- a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py +++ b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py @@ -57,7 +57,7 @@ def load(self, json_file_path: Path) -> TestCases: ) gt = FactorFBWorkspace(task, raise_exception=False) code = {"factor.py": factor_data["gt_code"]} - gt.inject_code(**code) + gt.inject_files(**code) test_cases.test_case_l.append(TestCase(task, gt)) return test_cases diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml index dc84f8deb..206d4d64e 100644 --- a/rdagent/scenarios/qlib/prompts.yaml +++ b/rdagent/scenarios/qlib/prompts.yaml @@ -1,7 +1,7 @@ hypothesis_and_feedback: |- {% for hypothesis, experiment, feedback in trace.hist[-10:] %} Hypothesis {{ loop.index }}: {{ hypothesis }} - Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].code_dict.get("model.py")}} + Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].file_dict.get("model.py")}} Observation on the result with the hypothesis: {{ feedback.observations }} Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} New Feedback for Context (For you to agree or improve upon): {{ 
feedback.new_hypothesis }} @@ -258,7 +258,7 @@ model_feedback_generation: Now let's come to this round. You will receive the result and you will evaluate if the performance increases or decreases. Hypothesis: {{hypothesis.hypothesis}} Experiment Setup: {{exp.sub_tasks[0]}} - Code Implemented: {{exp.sub_workspace_list[0].code_dict.get("model.py")}} + Code Implemented: {{exp.sub_workspace_list[0].file_dict.get("model.py")}} Relevant Reasoning: {{hypothesis.reason}} Result: {{exp.result}} From 7f85fdfb4c2a2c290d7db7af6acedd2eecd8868b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 12:35:06 +0000 Subject: [PATCH 111/304] add CoSTEER unittest --- .../coder/data_science/ensemble/test.py | 6 ++-- test/utils/coder/test_CoSTEER.py | 36 ++++++++++--------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py index 4fa672932..04f75941d 100644 --- a/rdagent/components/coder/data_science/ensemble/test.py +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -29,9 +29,9 @@ def load_ensemble_spec(): return f.read() -def develop_ensemble(): +def develop_one_competition(competition: str): # Initialize scenario and coder - scen = DataScienceScen(competition="aerial-cactus-identification") + scen = DataScienceScen(competition=competition) ensemble_coder = EnsembleCoSTEER(scen) # Load ensemble specification ensemble_spec = load_ensemble_spec() @@ -55,4 +55,4 @@ def develop_ensemble(): if __name__ == "__main__": - develop_ensemble() + develop_one_competition("aerial-cactus-identification") diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py index 9423917c8..69f5d4850 100644 --- a/test/utils/coder/test_CoSTEER.py +++ b/test/utils/coder/test_CoSTEER.py @@ -1,10 +1,9 @@ import unittest - class CoSTEERTest(unittest.TestCase): def setUp(self): - pass + self.test_competition = "aerial-cactus-identification" def tearDown(self): pass @@ -13,31 +12,36 @@ def to_str(self, obj): return "".join(str(obj).split()) def test_data_loader(self): - # 1) Build the data loader task/experiment - # 2) build an according CoSTEER - # 3) test the results - # - check spec.md - # - check data_loader.py from rdagent.components.coder.data_science.raw_data_loader.test import ( develop_one_competition, ) - exp = develop_one_competition("aerial-cactus-identification") + # if all tasks in exp are failed, will raise CoderError + exp = develop_one_competition(self.test_competition) - pass + def test_feature(self): + from rdagent.components.coder.data_science.feature.test import ( + develop_one_competition, + ) + exp = develop_one_competition(self.test_competition) def test_model(self): - # 1) Build the model experiment/task/workspace from tpl_ex - # 2) build an according CoSTEER - # 3) test the results from rdagent.components.coder.data_science.model.test import ( develop_one_competition, ) + exp = develop_one_competition(self.test_competition) - exp = develop_one_competition("aerial-cactus-identification") - - pass - + def test_ensemble(self): + from rdagent.components.coder.data_science.ensemble.test import ( + develop_one_competition, + ) + exp = develop_one_competition(self.test_competition) + + def test_workflow(self): + from rdagent.components.coder.data_science.workflow.test import ( + develop_one_competition, + ) + exp = develop_one_competition(self.test_competition) if __name__ == "__main__": unittest.main() From 9009b73800c9115ffd2f0d89a1a467c5958853b7 Mon Sep 
17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 13:03:48 +0000 Subject: [PATCH 112/304] now show self.version in Task.get_task_information(), simplify CoSTEER sub tasks definition --- .../coder/data_science/ensemble/exp.py | 23 +------------------ .../coder/data_science/feature/exp.py | 21 +---------------- .../coder/data_science/model/exp.py | 7 ------ .../coder/data_science/raw_data_loader/exp.py | 23 +------------------ .../coder/data_science/workflow/exp.py | 22 +----------------- rdagent/core/experiment.py | 4 +++- .../data_science/proposal/exp_gen.py | 2 +- 7 files changed, 8 insertions(+), 94 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/exp.py b/rdagent/components/coder/data_science/ensemble/exp.py index f6ef9b69a..d3a848254 100644 --- a/rdagent/components/coder/data_science/ensemble/exp.py +++ b/rdagent/components/coder/data_science/ensemble/exp.py @@ -8,25 +8,4 @@ from rdagent.core.utils import cache_with_pickle -class EnsembleTask(CoSTEERTask): - def __init__( - self, - name: str, - description: str, - *args, - **kwargs, - ) -> None: - super().__init__(name=name, description=description, *args, **kwargs) - - def get_task_information(self): - task_desc = f"""name: {self.name} - description: {self.description} - """ - return task_desc - - @staticmethod - def from_dict(dict): - return EnsembleTask(**dict) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" +EnsembleTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/feature/exp.py b/rdagent/components/coder/data_science/feature/exp.py index 2ae9a6a07..2f0cff674 100644 --- a/rdagent/components/coder/data_science/feature/exp.py +++ b/rdagent/components/coder/data_science/feature/exp.py @@ -8,23 +8,4 @@ from rdagent.core.utils import cache_with_pickle -class FeatureTask(CoSTEERTask): - def __init__( - self, - name: str, - description: str, - **kwargs, - ) -> None: - super().__init__(name=name, description=description, **kwargs) - - def get_task_information(self): - return f"""name: {self.name} -description: {self.description} -""" - - @staticmethod - def from_dict(dict): - return FeatureTask(**dict) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" +FeatureTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index 6d246cc8b..e0e93bcb4 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -44,10 +44,3 @@ def get_task_information(self): task_desc += f"hyperparameters: {self.hyperparameters}\n" task_desc += f"model_type: {self.model_type}\n" return task_desc - - @staticmethod - def from_dict(dict): - return ModelTask(**dict) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py index 8bb98baaa..0f08436e5 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/exp.py +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -12,25 +12,4 @@ from rdagent.utils.env import DockerEnv, DSDockerConf -class DataLoaderTask(CoSTEERTask): - def __init__( - self, - name: str, - description: str, - *args, - **kwargs, - ) -> None: - super().__init__(name=name, description=description, *args, **kwargs) - - def get_task_information(self): - task_desc = f"""name: {self.name} -description: 
{self.description} -""" - return task_desc - - @staticmethod - def from_dict(dict): - return DataLoaderTask(**dict) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" +DataLoaderTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py index 0c4124ede..a4a18f720 100644 --- a/rdagent/components/coder/data_science/workflow/exp.py +++ b/rdagent/components/coder/data_science/workflow/exp.py @@ -8,24 +8,4 @@ from rdagent.core.utils import cache_with_pickle -class WorkflowTask(CoSTEERTask): - def __init__( - self, - name: str, - description: str, - *args, - **kwargs, - ) -> None: - super().__init__(name=name, description=description, *args, **kwargs) - - @staticmethod - def from_dict(dict): - return WorkflowTask(**dict) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.name}>" - - def get_task_information(self): - return f"""name: {self.name} -description: {self.description} -""" +WorkflowTask = CoSTEERTask \ No newline at end of file diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 029a9efc8..629f8803d 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -45,8 +45,10 @@ def __init__(self, name: str, version: int = 1, description: str = "") -> None: self.description = description def get_task_information(self) -> str: - return f"{self.name}_{self.version}: {self.description}" + return f"Task Name: {self.name}\nDescription: {self.description}" + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" ASpecificTask = TypeVar("ASpecificTask", bound=Task) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 630bbced5..326b74275 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -131,7 +131,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - resp_dict = json.loads( + resp_dict: dict = json.loads( APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) ) hypothesis = DSHypothesis( From 39abb252d1cfd846220a4dbbaa48315b7be520f6 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 23 Dec 2024 13:16:44 +0000 Subject: [PATCH 113/304] remove some properties in ModelTask, add model_type in it. 
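A minimal sketch of constructing a ModelTask after this change (values are illustrative; the call mirrors the updated rdagent/components/coder/data_science/model/test.py):

```python
from rdagent.components.coder.data_science.model.exp import ModelTask

# `formulation` and `variables` are no longer accepted; `model_type` now
# describes the model family (e.g. "CNN", Tabular, Graph, XGBoost).
mt = ModelTask(
    name="ModelTask",
    description="A CNN model",
    model_type="CNN",
    architecture="convolutional backbone followed by a dense classification head",
    hyperparameters={"lr": "1e-3", "batch_size": "32"},  # illustrative values
    base_code="",
)
print(mt.get_task_information())
```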
--- rdagent/components/coder/data_science/model/__init__.py | 3 --- rdagent/components/coder/data_science/model/exp.py | 6 ------ rdagent/components/coder/data_science/model/test.py | 3 ++- rdagent/scenarios/data_science/proposal/exp_gen.py | 6 +++--- rdagent/scenarios/data_science/proposal/prompts.yaml | 1 + 5 files changed, 6 insertions(+), 13 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 5615abed0..12e12d8b1 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -21,9 +21,6 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -# from rdagent.utils.agent.tpl import T -# T(".prompts:model_generator.user").r() - class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): def implement_one_task( diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index e0e93bcb4..73b917f71 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -19,14 +19,10 @@ def __init__( architecture: str, *args, hyperparameters: Dict[str, str], - formulation: str = None, - variables: Dict[str, str] = None, model_type: Optional[str] = None, **kwargs, ) -> None: - self.formulation: str = formulation self.architecture: str = architecture - self.variables: str = variables self.hyperparameters: str = hyperparameters self.model_type: str = ( model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model @@ -38,9 +34,7 @@ def get_task_information(self): task_desc = f"""name: {self.name} description: {self.description} """ - task_desc += f"formulation: {self.formulation}\n" if self.formulation else "" task_desc += f"architecture: {self.architecture}\n" - task_desc += f"variables: {self.variables}\n" if self.variables else "" task_desc += f"hyperparameters: {self.hyperparameters}\n" task_desc += f"model_type: {self.model_type}\n" return task_desc diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 829e8b3d5..201535187 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -24,8 +24,9 @@ def develop_one_competition(competition: str): mt = ModelTask( name="ModelTask", description="A CNN Model", + model_type="CNN", architecture="\hat{y}_u = CNN(X_u)", - variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", + # variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", hyperparameters="...", base_code="", ) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 326b74275..8bc248b70 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -175,7 +175,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") ft = FeatureTask( name="Feature Engineering", - description=resp_dict.get("description", "Factor description not provided"), + description=resp_dict.get("description", "Feature description not provided"), ) exp = DSExperiment(sub_tasks=[ft], hypothesis=hypothesis) @@ 
-194,6 +194,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), + model_type=resp_dict.get("model_type", "Model type not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), base_code="", @@ -270,8 +271,6 @@ def gen(self, trace: DSTrace) -> DSExperiment: ft = FeatureTask( name=fn, description=resp_dict[fn].get("description", "Factor description not provided"), - formulation=resp_dict[fn].get("formulation", "Feature formulation not provided"), - variables=resp_dict[fn].get("variables", "Variables not provided"), ) tasks.append(ft) exp = DSExperiment(sub_tasks=tasks) @@ -292,6 +291,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), + model_type=resp_dict.get("model_type", "Model type not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), base_code=base_code, diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 4bd0b250b..d52331e88 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -103,6 +103,7 @@ output_format: { "model_name": "model_name", "description": "A detailed description of the model", + "model_type": "The type of the model, e.g., neural network, tree-based model, etc.", "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", "hyperparameters": { "hyperparameter_name_1": "value of hyperparameter 1", From a6505d164550718d6964d0f597c8a9f7f7e3edb4 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 24 Dec 2024 03:15:11 +0000 Subject: [PATCH 114/304] fix llm app bug --- rdagent/log/logger.py | 6 +++--- rdagent/log/ui/llm_st.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py index 87b045aef..8a245703b 100644 --- a/rdagent/log/logger.py +++ b/rdagent/log/logger.py @@ -119,10 +119,10 @@ def log_object(self, obj: object, *, tag: str = "") -> None: debug_log_path = self.log_trace_path / "debug_llm.pkl" debug_data = {"tag": tag, "obj": obj} if debug_log_path.exists(): - with debug_log_path.open("rb+") as f: + with debug_log_path.open("rb") as f: existing_data = pickle.load(f) - existing_data.append(debug_data) - f.seek(0) + existing_data.append(debug_data) + with debug_log_path.open("wb") as f: pickle.dump(existing_data, f) else: with debug_log_path.open("wb") as f: diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index c12b965dc..42a7a879e 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -31,7 +31,7 @@ def load_data(): try: - with open(f"{main_log_path}/{session_state.log_path}/debug_llm.pkl", "r") as f: + with open(f"{main_log_path}/{session_state.log_path}/debug_llm.pkl", "rb") as f: session_state.data = pickle.load(f) except Exception as e: session_state.data = [{"error": str(e)}] From 87dea183f418b352699157ca651987161803f328 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 24 Dec 2024 03:29:48 +0000 Subject: [PATCH 
115/304] llm web app bug fix --- rdagent/log/ui/llm_st.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 42a7a879e..7c70108a1 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -99,15 +99,15 @@ def extract_evoid(tag): loop_id, func_name = extract_loopid_func_name(tag) evo_id = extract_evoid(tag) if loop_id: - if loop_id not in tlist: - tlist.append(loop_id) - st.subheader(f"**Loop_{loop_id}**", anchor=f"Loop_{loop_id}", divider="blue") + if f"Loop_{loop_id}" not in tlist: + tlist.append(f"Loop_{loop_id}") + st.header(f"Loop_{loop_id}", anchor=f"Loop_{loop_id}", divider="blue") if f"loop_{loop_id}.{func_name}" not in tlist: tlist.append(f"loop_{loop_id}.{func_name}") - st.subheader(f"**{func_name}**", anchor=f"loop_{loop_id}.{func_name}", divider="green") - if f"loop_{loop_id}.{evo_id}" not in tlist: + st.header(f"in *{func_name}*", anchor=f"loop_{loop_id}.{func_name}", divider="green") + if evo_id and f"loop_{loop_id}.evo_step_{evo_id}" not in tlist: tlist.append(f"loop_{loop_id}.evo_step_{evo_id}") - st.subheader(f"**evo_step_{evo_id}**", anchor=f"loop_{loop_id}.evo_step_{evo_id}", divider="orange") + st.subheader(f"evo_step_{evo_id}", anchor=f"loop_{loop_id}.evo_step_{evo_id}", divider="orange") if "debug_tpl" in tag: uri = obj["uri"] @@ -161,7 +161,7 @@ def extract_evoid(tag): if t.startswith("L"): et_toc += f"- [{t}](#{t})\n" elif "evo_step_" in t: - et_toc += f" - [{t}](#{t})\n" + et_toc += f" - [{t.split('.')[1]}](#{t})\n" else: - et_toc += f" - [{t}](#{t})\n" + et_toc += f" - [{t.split('.')[1]}](#{t})\n" st.markdown(et_toc, unsafe_allow_html=True) From d2d88d9ae5ee6e59a22e39e9d912f7bab4f2236d Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 24 Dec 2024 08:28:43 +0000 Subject: [PATCH 116/304] ds loop bug fix --- rdagent/app/data_science/loop.py | 31 +++-------- .../data_science/proposal/exp_gen.py | 55 +++++++++---------- 2 files changed, 35 insertions(+), 51 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index e790789d8..212d91d66 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -1,30 +1,16 @@ -import subprocess -from typing import Any, Literal +from typing import Any import fire from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER -from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature import FeatureCoSTEER -from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model import ModelCoSTEER -from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER -from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER -from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import RDLoop -from rdagent.core.exception import FactorEmptyError, ModelEmptyError -from rdagent.core.proposal import ( - Experiment2Feedback, - ExpGen, - Hypothesis2Experiment, - HypothesisFeedback, - HypothesisGen, - Trace, -) +from rdagent.core.proposal import HypothesisFeedback from 
rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger @@ -74,19 +60,18 @@ def direct_exp_gen(self, prev_out: dict[str, Any]): def coding(self, prev_out: dict[str, Any]): exp: DSExperiment = prev_out["direct_exp_gen"] - exp_task = exp.sub_tasks[0] - if isinstance(exp_task, DataLoaderTask): + if exp.hypothesis.component == "DataLoadSpec": exp = self.data_loader_coder.develop(exp) - elif isinstance(exp_task, FeatureTask): + elif exp.hypothesis.component == "FeatureEng": exp = self.feature_coder.develop(exp) - elif isinstance(exp_task, ModelTask): + elif exp.hypothesis.component == "Model": exp = self.model_coder.develop(exp) - elif isinstance(exp_task, EnsembleTask): + elif exp.hypothesis.component == "Ensemble": exp = self.ensemble_coder.develop(exp) - elif isinstance(exp_task, WorkflowTask): + elif exp.hypothesis.component == "Workflow": exp = self.workflow_coder.develop(exp) else: - raise NotImplementedError(f"Unsupported task type in DataScienceRDLoop: {exp_task}") + raise NotImplementedError(f"Unsupported component in DataScienceRDLoop: {exp.hypothesis.component}") return exp diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 8bc248b70..00945b059 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -21,13 +21,13 @@ class DSHypothesis(Hypothesis): def __init__( self, - hypothesis: str, - reason: str, - concise_reason: str, - concise_observation: str, - concise_justification: str, - concise_knowledge: str, component: COMPONENT, + hypothesis: str = "", + reason: str = "", + concise_reason: str = "", + concise_observation: str = "", + concise_justification: str = "", + concise_knowledge: str = "", ) -> None: super().__init__( hypothesis, reason, concise_reason, concise_observation, concise_justification, concise_knowledge @@ -35,6 +35,8 @@ def __init__( self.component = component def __str__(self) -> str: + if self.hypothesis == "": + return f"Chosen Component: {self.component}" return f"""Chosen Component: {self.component} Hypothesis: {self.hypothesis} Reason: {self.reason} @@ -135,13 +137,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) ) hypothesis = DSHypothesis( + component=resp_dict.get("component", "Component not provided"), hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), reason=resp_dict.get("reason", "Reason not provided"), concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"), concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"), concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"), concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), - component=resp_dict.get("component", "Component not provided"), ) # 2. 
gen experiment @@ -172,7 +174,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") + dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec")[1] ft = FeatureTask( name="Feature Engineering", description=resp_dict.get("description", "Feature description not provided"), @@ -190,7 +192,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng") + dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng")[1] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), @@ -212,7 +214,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("Model") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Model")[1] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), @@ -230,7 +232,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble")[1] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), @@ -257,7 +259,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: ), ) - exp = DSExperiment(sub_tasks=[dt]) + exp = DSExperiment(sub_tasks=[dt], hypothesis=DSHypothesis("DataLoadSpec")) return exp elif o == "FeatureEng": resp_dict = self.llm_task_gen( @@ -265,15 +267,12 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.feature").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec") - tasks = [] - for fn in resp_dict: - ft = FeatureTask( - name=fn, - description=resp_dict[fn].get("description", "Factor description not provided"), - ) - tasks.append(ft) - exp = DSExperiment(sub_tasks=tasks) + dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec")[1] + ft = FeatureTask( + name="Feature Engineering", + description=resp_dict.get("description", "Factor description not provided"), + ) + exp = DSExperiment(sub_tasks=[ft], hypothesis=DSHypothesis("FeatureEng")) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Model": @@ -282,8 +281,8 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.model").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng") - if last_model_exp := trace.get_sota_hypothesis_and_experiment("Model"): + dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng")[1] + if last_model_exp := trace.get_sota_hypothesis_and_experiment("Model")[1]: # TODO: model only have one (named "model.py")? 
base_code = last_model_exp.experiment_workspace.file_dict["model.py"] else: @@ -296,7 +295,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), base_code=base_code, ) - exp = DSExperiment(sub_tasks=[mt]) + exp = DSExperiment(sub_tasks=[mt], hypothesis=DSHypothesis("Model")) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Ensemble": @@ -305,12 +304,12 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.ensemble").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("Model") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Model")[1] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), ) - exp = DSExperiment(sub_tasks=[et]) + exp = DSExperiment(sub_tasks=[et], hypothesis=DSHypothesis("Ensemble")) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp elif o == "Workflow": @@ -319,12 +318,12 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.workflow").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble") + dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble")[1] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), ) - exp = DSExperiment(sub_tasks=[wt]) + exp = DSExperiment(sub_tasks=[wt], hypothesis=DSHypothesis("Workflow")) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp From e8c2d6c61f25d34c5703016154bf3ea8611f33a6 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 24 Dec 2024 11:50:32 +0000 Subject: [PATCH 117/304] fix: give component code to feature&ens eval --- .../components/coder/data_science/ensemble/eval.py | 4 +++- .../coder/data_science/ensemble/prompts.yaml | 13 +++++++++++-- .../components/coder/data_science/feature/eval.py | 4 +++- .../coder/data_science/feature/prompts.yaml | 9 ++++++++- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index b9121767b..66fa93188 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -52,7 +52,9 @@ def evaluate( implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - system_prompt = T(".prompts:ensemble_eval.system").r(test_code=test_code) + system_prompt = T(".prompts:ensemble_eval.system").r( + test_code=test_code, code=implementation.file_dict["ensemble.py"] + ) user_prompt = T(".prompts:ensemble_eval.user").r(stdout=stdout) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index 8a043ca4e..bc3b66474 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -39,8 +39,17 @@ ensemble_coder: ensemble_eval: system: |- You are a data scientist evaluating an ensemble implementation. 
- You are testing the ensemble with the following code: ```python - {{test_code}} ``` + + The ensemble code is: + ```python + {{code}} + ``` + + You are testing the ensemble with the following code: + ```python + {{test_code}} + ``` + You'll be given the stdout of your testing scripts. Please respond with your feedback in the following JSON format: { diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index 9f52516cf..546b341b0 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -53,7 +53,9 @@ def evaluate( implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - system_prompt = T(".prompts:feature_eval.system").r(test_code=test_code) + system_prompt = T(".prompts:feature_eval.system").r( + test_code=test_code, code=implementation.file_dict["feat01.py"] + ) user_prompt = T(".prompts:feature_eval.user").r(stdout=stdout) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 8b63ea1aa..4fdfc4707 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -48,10 +48,17 @@ feature: feature_eval: system: |- You are data scientist. - You are testing the feature processing with the following code + + The feature code is: + ```python + {{code}} + ``` + + You are testing the feature processing code with the following code ```python {{test_code}} ``` + You'll be given the stdout of your testing scripts. 
Please respond with your feedback in the following JSON format and order ```json From 0722d778868d214d852113d718f451bfb1684d4d Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 25 Dec 2024 03:10:11 +0000 Subject: [PATCH 118/304] loop catch error bug --- rdagent/app/data_science/loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 212d91d66..dbfff1425 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -22,6 +22,7 @@ class DataScienceRDLoop(RDLoop): + skip_loop_error = () def __init__(self, PROP_SETTING: BasePropSetting): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) From b53e03ebb94dc52ba0a0bbcd0fd511c7077049c9 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 25 Dec 2024 03:13:24 +0000 Subject: [PATCH 119/304] rename load_from_raw_data to load_data --- .../coder/data_science/feature/eval_tests/feature_test.py | 4 ++-- .../coder/data_science/model/eval_tests/model_execute.py | 4 ++-- .../kaggle/tpl_ex/aerial-cactus-identification/load_data.py | 2 +- .../kaggle/tpl_ex/aerial-cactus-identification/main.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index 029de0f52..a9d812567 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -18,9 +18,9 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") # Load data -from load_data import load_from_raw_data +from load_data import load_data -X, y, X_test, test_ids = load_from_raw_data() +X, y, X_test, test_ids = load_data() X, y, X_param = feat_eng(X, y) X_test, _, _ = feat_eng(X_test, param=X_param) diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index 30459be77..1184e99e5 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -7,11 +7,11 @@ import traceback import numpy as np -from load_data import load_from_raw_data +from load_data import load_data from model01 import model_workflow from sklearn.model_selection import train_test_split -X, y, test_X, test_ids = load_from_raw_data() +X, y, test_X, test_ids = load_data() train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py index 3f161f6d3..e110f00a7 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py @@ -32,7 +32,7 @@ def load_images_and_labels(csv_file, image_folder): return np.array(images), np.array(labels) -def load_from_raw_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, list[str]]: +def load_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, list[str]]: """ load raw data from disk to get data in uniform data diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py index f6e2227bc..139c0803f 100644 --- 
a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py @@ -1,8 +1,8 @@ -from load_data import load_from_raw_data +from load_data import load_data from sklearn.model_selection import train_test_split # Load data -train_images, train_labels, test_images, test_ids = load_from_raw_data() +train_images, train_labels, test_images, test_ids = load_data() # feature engineering From 01ad2e900fff0bd4b580ee22eaaf72bf84bf65a8 Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 25 Dec 2024 07:38:18 +0000 Subject: [PATCH 120/304] feat: Add debug data creation functionality for data science scenarios --- rdagent/app/data_science/debug.py | 4 + rdagent/scenarios/data_science/debug/data.py | 133 +++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 rdagent/app/data_science/debug.py create mode 100644 rdagent/scenarios/data_science/debug/data.py diff --git a/rdagent/app/data_science/debug.py b/rdagent/app/data_science/debug.py new file mode 100644 index 000000000..c92f7551b --- /dev/null +++ b/rdagent/app/data_science/debug.py @@ -0,0 +1,4 @@ +""" +entry of debuggging function +- e.g. creating debug datasets +""" diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py new file mode 100644 index 000000000..b31b81d89 --- /dev/null +++ b/rdagent/scenarios/data_science/debug/data.py @@ -0,0 +1,133 @@ +from pathlib import Path + +import pandas as pd + +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING + + +class DataHandler: + + def load(self, path) -> pd.DataFrame: + ... + + def dump(self, df: pd.DataFrame, path): + ... + + +class CSVDataHandler(DataHandler): + + def load(self, path) -> pd.DataFrame: + return pd.read_csv(path) + + def dump(self, df: pd.DataFrame, path): + df.to_csv(path, index=False) + + +class DataReducer: + + def reduce(self, df) -> pd.DataFrame: + ... 
+ + +class RandDataReducer(DataReducer): + + def __init__(self, min_frac=0.05, min_num=100): + self.min_frac = min_frac + self.min_num = min_num + + def reduce(self, df) -> pd.DataFrame: + # Calculate the fraction to sample + frac = max(self.min_frac, self.min_num / len(df)) + # Sample the data + return df.sample(frac=frac, random_state=1) + + +def create_debug_data( + competition, + original_file_name, + dh_cls: type[DataHandler], + dr_cls: type[DataReducer], + dr_cls_kwargs={}, + dataset_path=KAGGLE_IMPLEMENT_SETTING.local_data_path, +): + # Define the path to the original data file + data_path = Path(dataset_path) / competition / original_file_name + + # Automatically generate full and sampled file names based on the original file name + original_suffix = Path(original_file_name).suffix + full_file_name = original_file_name.replace(original_suffix, f'.full{original_suffix}') + sampled_file_name = original_file_name.replace(original_suffix, f'.sampled{original_suffix}') + + # Define the path to the .full data file + full_data_path = data_path.with_name(full_file_name) + + # Check if the .full file exists + if not full_data_path.exists(): + # Initialize handlers + data_handler = dh_cls() + data_reducer = dr_cls(**dr_cls_kwargs) + + # Load the data file + df = data_handler.load(data_path) + + # Reduce the data + df_sampled = data_reducer.reduce(df) + + # Save the sampled data to a new data file + sampled_data_path = data_path.with_name(sampled_file_name) + data_handler.dump(df_sampled, sampled_data_path) + + # Rename the original file with .full + data_path.rename(full_data_path) + + # Move the sampled data to replace the original one + sampled_data_path.rename(data_path) + + +class PickleDataHandler(DataHandler): + + def load(self, path) -> pd.DataFrame: + return pd.read_pickle(path) + + def dump(self, df: pd.DataFrame, path): + df.to_pickle(path) + + +class ColumnReducer(DataReducer): + + def reduce(self, df) -> pd.DataFrame: + return df.iloc[:, :5] + + +def new_york_city_taxi_fare_prediction_creator(): + create_debug_data(competition="new-york-city-taxi-fare-prediction", + original_file_name="train.csv", + dh_cls=CSVDataHandler, + dr_cls=RandDataReducer, + dr_cls_kwargs=dict(min_frac=0.05, min_num=100)) + + +def amc_debug_data_creator(): + create_debug_data( + competition="amc", + original_file_name="train_feature_with_label.pkl", + dh_cls=PickleDataHandler, + dr_cls=ColumnReducer, + ) + + create_debug_data( + competition="amc", + original_file_name="test_feature.pkl", + dh_cls=PickleDataHandler, + dr_cls=ColumnReducer, + ) + + +# competition to data handler & Reducer mapping +# find a place to store reduced data. 
+# - , .debug + +import fire +if __name__ == "__main__": + # fire.Fire(create_debug_data) + fire.Fire(amc_debug_data_creator) From db1455bec10acac224f284476e0cd77d32df8025 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 25 Dec 2024 16:06:19 +0800 Subject: [PATCH 121/304] support local folder (#511) * support local folder * remove unnecessary random * KaggleScen Subclass * small fix * use template for style description * update default scen to kaggle --- rdagent/app/data_science/conf.py | 2 +- rdagent/app/data_science/loop.py | 8 ++- .../coder/data_science/ensemble/test.py | 4 +- .../coder/data_science/feature/test.py | 4 +- .../coder/data_science/model/test.py | 4 +- .../data_science/raw_data_loader/test.py | 4 +- .../coder/data_science/workflow/test.py | 4 +- rdagent/core/experiment.py | 2 +- .../scenarios/data_science/scen/__init__.py | 3 +- rdagent/scenarios/data_science/scen/kaggle.py | 35 ++++++++++++ .../scenarios/data_science/scen/prompts.yaml | 27 ++++++++- rdagent/scenarios/data_science/scen/scen.py | 56 ++++++------------- 12 files changed, 100 insertions(+), 53 deletions(-) create mode 100644 rdagent/scenarios/data_science/scen/kaggle.py diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 156956dbe..d31a93fac 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -7,7 +7,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): # Main components ## Scen - scen: str = "rdagent.scenarios.data_science.scen.DataScienceScen" + scen: str = "rdagent.scenarios.data_science.scen.KaggleScen" """Scenario class for data mining model""" ## proposal diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index dbfff1425..17f468634 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -1,5 +1,6 @@ from typing import Any +from pathlib import Path import fire from rdagent.app.data_science.conf import DS_RD_SETTING @@ -111,7 +112,12 @@ def main(path=None, step_n=None, competition=None): DS_RD_SETTING.competition = competition if DS_RD_SETTING.competition: - download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING) + if DS_RD_SETTING.scen.endswith("KaggleScen"): + download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING) + else: + if not Path(f"{DS_RD_SETTING.local_data_path}/{competition}").exists(): + logger.error(f"Please prepare data for competition {competition} first.") + return else: logger.error("Please specify competition name.") if path is None: diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py index 04f75941d..1a7a3d08b 100644 --- a/rdagent/components/coder/data_science/ensemble/test.py +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -8,7 +8,7 @@ from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen # Add the competition folder to path COMPETITION_PATH = ( @@ -31,7 +31,7 @@ def load_ensemble_spec(): def develop_one_competition(competition: str): # Initialize scenario and coder - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) ensemble_coder = EnsembleCoSTEER(scen) # Load ensemble specification 
ensemble_spec = load_ensemble_spec() diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py index 96cd5e590..74732e756 100644 --- a/rdagent/components/coder/data_science/feature/test.py +++ b/rdagent/components/coder/data_science/feature/test.py @@ -9,11 +9,11 @@ from rdagent.components.coder.data_science.feature import FeatureCoSTEER from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen def develop_one_competition(competition: str): # -> experiment - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) feature_coder = FeatureCoSTEER(scen) with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r") as file: diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index 201535187..ad7995789 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -12,12 +12,12 @@ from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.core.experiment import FBWorkspace from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen # Take tasks, spec.md and feat as input, generate a feedback as output def develop_one_competition(competition: str): - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) model_coder = ModelCoSTEER(scen) # Create the task diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py index 5aacc8b8c..2cd68a790 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -9,11 +9,11 @@ from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen def develop_one_competition(competition: str): # -> experiment - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) data_loader_coder = DataLoaderCoSTEER(scen) # Create the experiment diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index 5e9e7a1b4..f2c2dcade 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -12,11 +12,11 @@ from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import FBWorkspace from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.data_science.scen import KaggleScen def develop_one_competition(competition: str): - scen = DataScienceScen(competition=competition) + scen = KaggleScen(competition=competition) 
workflow_coder = WorkflowCoSTEER(scen) wt = WorkflowTask( diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 629f8803d..2cc25f031 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -223,7 +223,7 @@ def __init__( based_experiments: Sequence[ASpecificWSForExperiment] = [], hypothesis: Optional["Hypothesis"] = None, ) -> None: - self.hypothesis: Optional["Hypothesis"] = hypothesis # Experiment is opptionally generated by hypothesis + self.hypothesis: Optional["Hypothesis"] = hypothesis # Experiment is optionally generated by hypothesis self.sub_tasks: Sequence[ASpecificTask] = sub_tasks self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks) # TODO: diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index 29324c02a..8aaf93146 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -1,3 +1,4 @@ from .scen import DataScienceScen +from .kaggle import KaggleScen -__all__ = ["DataScienceScen"] +__all__ = ["DataScienceScen", "KaggleScen"] diff --git a/rdagent/scenarios/data_science/scen/kaggle.py b/rdagent/scenarios/data_science/scen/kaggle.py new file mode 100644 index 000000000..3a269405f --- /dev/null +++ b/rdagent/scenarios/data_science/scen/kaggle.py @@ -0,0 +1,35 @@ +import json + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.scenarios.kaggle.kaggle_crawler import ( + crawl_descriptions, + leaderboard_scores, +) +from rdagent.utils.agent.tpl import T + + +class KaggleScen(DataScienceScen): + """Kaggle Scenario + It is based on kaggle now. + - But it is not use the same interface with previous kaggle version. + - Ideally, we should reuse previous kaggle scenario. + But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. + So we start from a simple one.... + """ + def _get_description(self): + return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path) + + def _get_direction(self): + leaderboard = leaderboard_scores(self.competition) + return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" + + @property + def rich_style_description(self) -> str: + return T(".prompts:rich_style_description").r( + name="Kaggle", + competition=f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})", + ) + diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index 432f5c4f4..b0601af56 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -34,4 +34,29 @@ competition_background: |- The data type used in this competition is {{ data_type }}. Briefly, the competition involves: {{ brief_description }}. The dataset used in this competition is: {{ data_description }}. - Your goal in this competition is to: {{target_description }}. \ No newline at end of file + Your goal in this competition is to: {{target_description }}. 
+ +rich_style_description: |- + ### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution + + #### [Overview](#_summary) + + In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process. + + #### {{ name }} Competition info + + Current Competition: {{ competition }} + + #### [Automated R&D](#_rdloops) + + - **[R (Research)](#_research)** + - Iteration of ideas and hypotheses. + - Continuous learning and knowledge construction. + + - **[D (Development)](#_development)** + - Evolving code generation, model refinement, and features generation. + - Automated implementation and testing of models/features. + + #### [Objective](#_summary) + + To automatically optimize performance metrics within the validation set, ultimately discovering the most efficient features and models through autonomous research and development. \ No newline at end of file diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 3fbdc8fda..a702c36ac 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -1,32 +1,33 @@ import json +from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.kaggle.kaggle_crawler import ( - crawl_descriptions, - leaderboard_scores, -) from rdagent.utils.agent.tpl import T +from rdagent.log import rdagent_logger as logger class DataScienceScen(Scenario): """Data Science Scenario - It is based on kaggle now. - - But it is not use the same interface with previous kaggle version. - - Ideally, we should reuse previous kaggle scenario. - But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. - So we start from a simple one.... """ def __init__(self, competition: str) -> None: self.competition = competition - self.raw_description = crawl_descriptions(competition, DS_RD_SETTING.local_data_path) + self.raw_description = self._get_description() + self.metric_direction = self._get_direction() + self._analysis_competition_description() - leaderboard = leaderboard_scores(competition) - self.metric_direction = "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" + def _get_description(self): + if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists(): + logger.info(f"Found {self.competition}.json, loading from local file.") + with fp.open("r") as f: + return json.load(f) + else: + logger.error(f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file.") - self._analysis_competition_description() + def _get_direction(self): + return self.raw_description.get("metric_direction", "minimize") def _analysis_competition_description(self): sys_prompt = T(".prompts:competition_description_template.system").r() @@ -75,31 +76,10 @@ def background(self) -> str: @property def rich_style_description(self) -> str: - return f""" -### Kaggle Agent: Automated Feature Engineering & Model Tuning Evolution - -#### [Overview](#_summary) - -In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process. 
- -#### Kaggle Competition info - -Current Competition: [{self.competition}](https://www.kaggle.com/competitions/{self.competition}) - -#### [Automated R&D](#_rdloops) - -- **[R (Research)](#_research)** -- Iteration of ideas and hypotheses. -- Continuous learning and knowledge construction. - -- **[D (Development)](#_development)** -- Evolving code generation, model refinement, and features generation. -- Automated implementation and testing of models/features. - -#### [Objective](#_summary) - -To automatically optimize performance metrics within the validation set or Kaggle Leaderboard, ultimately discovering the most efficient features and models through autonomous research and development. -""" + return T(".prompts:rich_style_description").r( + name="Data Science", + competition=self.competition, + ) def get_scenario_all_desc(self) -> str: return T(".prompts:scenario_description").r( From 12a27ec930da2f452d46dcf06bf548af1028afb8 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 25 Dec 2024 10:07:34 +0000 Subject: [PATCH 122/304] update sample data script --- rdagent/scenarios/data_science/debug/data.py | 207 +++++++++++-------- 1 file changed, 119 insertions(+), 88 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index b31b81d89..91ac92300 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -1,133 +1,164 @@ +import os from pathlib import Path - +import platform import pandas as pd +import fire +import shutil from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING class DataHandler: + """Base DataHandler interface.""" def load(self, path) -> pd.DataFrame: - ... + raise NotImplementedError def dump(self, df: pd.DataFrame, path): - ... + raise NotImplementedError -class CSVDataHandler(DataHandler): +class GenericDataHandler(DataHandler): + """ + A generic data handler that automatically detects file type based on suffix + and uses the correct pandas method for load/dump. + """ def load(self, path) -> pd.DataFrame: - return pd.read_csv(path) + path = Path(path) + suffix = path.suffix.lower() + + if suffix == ".csv": + return pd.read_csv(path) + elif suffix == ".pkl": + return pd.read_pickle(path) + elif suffix == ".parquet": + return pd.read_parquet(path) + elif suffix in [".h5", ".hdf", ".hdf5"]: + # Note: for HDF, you need a 'key' in read_hdf. If you expect a single key, + # you might do: pd.read_hdf(path, key='df') or something similar. + # Adjust as needed based on your HDF structure. + return pd.read_hdf(path, key='data') + else: + raise ValueError(f"Unsupported file type: {suffix}") def dump(self, df: pd.DataFrame, path): - df.to_csv(path, index=False) + path = Path(path) + suffix = path.suffix.lower() + + if suffix == ".csv": + df.to_csv(path, index=False) + elif suffix == ".pkl": + df.to_pickle(path) + elif suffix == ".parquet": + df.to_parquet(path, index=True) + elif suffix in [".h5", ".hdf", ".hdf5"]: + # Similarly, you need a key for HDF. + df.to_hdf(path, key="data", mode="w") + else: + raise ValueError(f"Unsupported file type: {suffix}") class DataReducer: + """Base DataReducer interface.""" - def reduce(self, df) -> pd.DataFrame: - ... + def reduce(self, df: pd.DataFrame) -> pd.DataFrame: + raise NotImplementedError class RandDataReducer(DataReducer): + """ + Example random sampler: ensures at least `min_num` rows + or at least `min_frac` fraction of the data (whichever is larger). 
+ """ def __init__(self, min_frac=0.05, min_num=100): self.min_frac = min_frac self.min_num = min_num - def reduce(self, df) -> pd.DataFrame: - # Calculate the fraction to sample + def reduce(self, df: pd.DataFrame) -> pd.DataFrame: frac = max(self.min_frac, self.min_num / len(df)) - # Sample the data return df.sample(frac=frac, random_state=1) -def create_debug_data( - competition, - original_file_name, - dh_cls: type[DataHandler], - dr_cls: type[DataReducer], - dr_cls_kwargs={}, - dataset_path=KAGGLE_IMPLEMENT_SETTING.local_data_path, -): - # Define the path to the original data file - data_path = Path(dataset_path) / competition / original_file_name - - # Automatically generate full and sampled file names based on the original file name - original_suffix = Path(original_file_name).suffix - full_file_name = original_file_name.replace(original_suffix, f'.full{original_suffix}') - sampled_file_name = original_file_name.replace(original_suffix, f'.sampled{original_suffix}') - - # Define the path to the .full data file - full_data_path = data_path.with_name(full_file_name) - - # Check if the .full file exists - if not full_data_path.exists(): - # Initialize handlers - data_handler = dh_cls() - data_reducer = dr_cls(**dr_cls_kwargs) - - # Load the data file - df = data_handler.load(data_path) - - # Reduce the data - df_sampled = data_reducer.reduce(df) - - # Save the sampled data to a new data file - sampled_data_path = data_path.with_name(sampled_file_name) - data_handler.dump(df_sampled, sampled_data_path) - - # Rename the original file with .full - data_path.rename(full_data_path) - - # Move the sampled data to replace the original one - sampled_data_path.rename(data_path) - - -class PickleDataHandler(DataHandler): - - def load(self, path) -> pd.DataFrame: - return pd.read_pickle(path) - - def dump(self, df: pd.DataFrame, path): - df.to_pickle(path) - - class ColumnReducer(DataReducer): + """ + Example column reducer: keep only the first 5 columns. + """ - def reduce(self, df) -> pd.DataFrame: + def reduce(self, df: pd.DataFrame) -> pd.DataFrame: return df.iloc[:, :5] -def new_york_city_taxi_fare_prediction_creator(): - create_debug_data(competition="new-york-city-taxi-fare-prediction", - original_file_name="train.csv", - dh_cls=CSVDataHandler, - dr_cls=RandDataReducer, - dr_cls_kwargs=dict(min_frac=0.05, min_num=100)) +class RowReducer(DataReducer): + """ + Example row reducer: keep only the first 10% rows. + """ + def reduce(self, df: pd.DataFrame) -> pd.DataFrame: + ten_percent = int(max(len(df) * 0.1, 100)) + return df.iloc[:ten_percent] -def amc_debug_data_creator(): - create_debug_data( - competition="amc", - original_file_name="train_feature_with_label.pkl", - dh_cls=PickleDataHandler, - dr_cls=ColumnReducer, - ) - create_debug_data( - competition="amc", - original_file_name="test_feature.pkl", - dh_cls=PickleDataHandler, - dr_cls=ColumnReducer, - ) +def create_debug_data( + competition: str, + dr_cls: type[DataReducer] = RandDataReducer, + dr_cls_kwargs=None, + dataset_path=None, + sample_path=None, +): + """ + Reads the original data file, creates a reduced sample, + and renames/moves files for easier debugging. + Automatically detects file type (csv, pkl, parquet, hdf, etc.). 
+ """ + if dr_cls_kwargs is None: + dr_cls_kwargs = {} + + if dataset_path is None: + dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path + + if sample_path is None: + sample_path = Path(dataset_path) / "sample" + + data_folder = Path(dataset_path) / competition + sample_folder = Path(sample_path) / competition + + # Traverse the folder and exclude specific file types + included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5"} + files_to_process = [ + file for file in data_folder.rglob("*") + if file.is_file() + ] + + for file_path in files_to_process: + sampled_file_path = sample_folder / file_path.relative_to(data_folder) + if sampled_file_path.exists(): + continue + + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) + if file_path.suffix not in included_extensions: + if platform.system() == "Linux": + os.symlink(file_path, sampled_file_path) + if platform.system() == "Windows": + os.link(file_path, sampled_file_path) + continue + + # Initialize the generic data handler + data_handler = GenericDataHandler() + + # Initialize the data reducer (e.g., RandDataReducer or ColumnReducer) + data_reducer = dr_cls(**dr_cls_kwargs) + # Load the original data + df = data_handler.load(file_path) + + # Create a sampled subset + df_sampled = data_reducer.reduce(df) + + # Dump the sampled data + data_handler.dump(df_sampled, sampled_file_path) -# competition to data handler & Reducer mapping -# find a place to store reduced data. -# - , .debug -import fire if __name__ == "__main__": - # fire.Fire(create_debug_data) - fire.Fire(amc_debug_data_creator) + fire.Fire(create_debug_data) From de2825f80728f788ed655d3d797d2acafa479ae0 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 25 Dec 2024 10:16:46 +0000 Subject: [PATCH 123/304] make sure frac < 1 --- rdagent/scenarios/data_science/debug/data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 91ac92300..597dfb34b 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -78,6 +78,8 @@ def __init__(self, min_frac=0.05, min_num=100): def reduce(self, df: pd.DataFrame) -> pd.DataFrame: frac = max(self.min_frac, self.min_num / len(df)) + if frac >= 1: + return df return df.sample(frac=frac, random_state=1) From 9a4ba5f7928e18fee8bc0ce8298a4625c3dd6a26 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 25 Dec 2024 10:22:37 +0000 Subject: [PATCH 124/304] fix a bug --- rdagent/app/data_science/loop.py | 4 ++-- rdagent/components/coder/CoSTEER/evolving_agent.py | 5 +++-- rdagent/components/coder/data_science/ensemble/exp.py | 1 - rdagent/components/coder/data_science/feature/eval.py | 3 ++- rdagent/components/coder/data_science/feature/exp.py | 1 - rdagent/components/coder/data_science/model/eval.py | 2 ++ .../coder/data_science/raw_data_loader/eval.py | 3 ++- .../components/coder/data_science/raw_data_loader/exp.py | 1 - rdagent/components/coder/data_science/workflow/eval.py | 2 ++ rdagent/components/coder/data_science/workflow/exp.py | 3 +-- rdagent/core/experiment.py | 1 + rdagent/log/logger.py | 2 +- rdagent/scenarios/data_science/scen/__init__.py | 2 +- rdagent/scenarios/data_science/scen/kaggle.py | 2 +- rdagent/scenarios/data_science/scen/scen.py | 9 +++++---- test/utils/coder/test_CoSTEER.py | 8 +++++++- 16 files changed, 30 insertions(+), 19 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 
17f468634..f3560ddd7 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -1,6 +1,6 @@ +from pathlib import Path from typing import Any -from pathlib import Path import fire from rdagent.app.data_science.conf import DS_RD_SETTING @@ -100,7 +100,7 @@ def feedback(self, prev_out: dict[str, Any]): self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) -def main(path=None, step_n=None, competition=None): +def main(path=None, step_n=None, competition="bms-molecular-translation"): """ Auto R&D Evolving loop for models in a kaggle{} scenario. You can continue running session by diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index dcfeed78b..9a67f08c0 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -4,6 +4,7 @@ from rdagent.core.evolving_framework import EvolvableSubjects from rdagent.core.exception import CoderError + class FilterFailedRAGEvoAgent(RAGEvoAgent): def filter_evolvable_subjects_by_feedback( self, evo: EvolvableSubjects, feedback: CoSTEERSingleFeedbackDeprecated @@ -15,8 +16,8 @@ def filter_evolvable_subjects_by_feedback( for index in range(len(evo.sub_workspace_list)): if evo.sub_workspace_list[index] is not None and feedback[index] and not feedback[index].final_decision: evo.sub_workspace_list[index].clear() - + if all(not f.final_decision for f in feedback if f): raise CoderError("All feedbacks of sub tasks are negative.") - + return evo diff --git a/rdagent/components/coder/data_science/ensemble/exp.py b/rdagent/components/coder/data_science/ensemble/exp.py index d3a848254..2850ca0f0 100644 --- a/rdagent/components/coder/data_science/ensemble/exp.py +++ b/rdagent/components/coder/data_science/ensemble/exp.py @@ -7,5 +7,4 @@ from rdagent.components.coder.CoSTEER.task import CoSTEERTask from rdagent.core.utils import cache_with_pickle - EnsembleTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index 546b341b0..b6993c553 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -52,7 +52,8 @@ def evaluate( test_code = f.read() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - + if stdout is None: + stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." 
system_prompt = T(".prompts:feature_eval.system").r( test_code=test_code, code=implementation.file_dict["feat01.py"] ) diff --git a/rdagent/components/coder/data_science/feature/exp.py b/rdagent/components/coder/data_science/feature/exp.py index 2f0cff674..e6658050d 100644 --- a/rdagent/components/coder/data_science/feature/exp.py +++ b/rdagent/components/coder/data_science/feature/exp.py @@ -7,5 +7,4 @@ from rdagent.components.coder.CoSTEER.task import CoSTEERTask from rdagent.core.utils import cache_with_pickle - FeatureTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 7f542d9e3..64a2273c9 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -70,6 +70,8 @@ def evaluate( test_code = f.read() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") + if stdout is None: + stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." system_prompt = T(".prompts:model_eval.system").r( test_code=test_code, scenario="No scenario information yet.", spec=implementation.file_dict["spec/model.md"] ) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index c0ec526d3..b3c78333e 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -57,7 +57,8 @@ def evaluate( test_code = f.read() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - + if stdout is None: + stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." system_prompt = T(".prompts:data_loader_eval.system").r( test_code=test_code, code=implementation.file_dict["load_data.py"] ) diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py index 0f08436e5..54d280719 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/exp.py +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -11,5 +11,4 @@ from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf - DataLoaderTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 63d9b8148..a288cdce5 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -55,6 +55,8 @@ def evaluate( de = DockerEnv(conf=ds_docker_conf) fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") + if stdout is None: + stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." 
system_prompt = T(".prompts:workflow_eval.system").r( scenario="No scenario information yet.", spec=implementation.file_dict["spec/workflow.md"] ) diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py index a4a18f720..e49af8339 100644 --- a/rdagent/components/coder/data_science/workflow/exp.py +++ b/rdagent/components/coder/data_science/workflow/exp.py @@ -7,5 +7,4 @@ from rdagent.components.coder.CoSTEER.task import CoSTEERTask from rdagent.core.utils import cache_with_pickle - -WorkflowTask = CoSTEERTask \ No newline at end of file +WorkflowTask = CoSTEERTask diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 2cc25f031..4c10b7520 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -50,6 +50,7 @@ def get_task_information(self) -> str: def __repr__(self) -> str: return f"<{self.__class__.__name__} {self.name}>" + ASpecificTask = TypeVar("ASpecificTask", bound=Task) diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py index 8a245703b..5b848571c 100644 --- a/rdagent/log/logger.py +++ b/rdagent/log/logger.py @@ -1,7 +1,7 @@ import json import os -import sys import pickle +import sys from contextlib import contextmanager from datetime import datetime, timezone from functools import partial diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index 8aaf93146..21ef7e0e6 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -1,4 +1,4 @@ -from .scen import DataScienceScen from .kaggle import KaggleScen +from .scen import DataScienceScen __all__ = ["DataScienceScen", "KaggleScen"] diff --git a/rdagent/scenarios/data_science/scen/kaggle.py b/rdagent/scenarios/data_science/scen/kaggle.py index 3a269405f..5390582ca 100644 --- a/rdagent/scenarios/data_science/scen/kaggle.py +++ b/rdagent/scenarios/data_science/scen/kaggle.py @@ -19,6 +19,7 @@ class KaggleScen(DataScienceScen): But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. So we start from a simple one.... """ + def _get_description(self): return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path) @@ -32,4 +33,3 @@ def rich_style_description(self) -> str: name="Kaggle", competition=f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})", ) - diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index a702c36ac..680df65bd 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -3,14 +3,13 @@ from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.scenario import Scenario +from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T -from rdagent.log import rdagent_logger as logger class DataScienceScen(Scenario): - """Data Science Scenario - """ + """Data Science Scenario""" def __init__(self, competition: str) -> None: self.competition = competition @@ -24,7 +23,9 @@ def _get_description(self): with fp.open("r") as f: return json.load(f) else: - logger.error(f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file.") + logger.error( + f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file." 
+ ) def _get_direction(self): return self.raw_description.get("metric_direction", "minimize") diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py index 69f5d4850..47176ab15 100644 --- a/test/utils/coder/test_CoSTEER.py +++ b/test/utils/coder/test_CoSTEER.py @@ -1,5 +1,6 @@ import unittest + class CoSTEERTest(unittest.TestCase): def setUp(self): @@ -23,26 +24,31 @@ def test_feature(self): from rdagent.components.coder.data_science.feature.test import ( develop_one_competition, ) + exp = develop_one_competition(self.test_competition) def test_model(self): from rdagent.components.coder.data_science.model.test import ( develop_one_competition, ) + exp = develop_one_competition(self.test_competition) def test_ensemble(self): from rdagent.components.coder.data_science.ensemble.test import ( develop_one_competition, ) + exp = develop_one_competition(self.test_competition) - + def test_workflow(self): from rdagent.components.coder.data_science.workflow.test import ( develop_one_competition, ) + exp = develop_one_competition(self.test_competition) + if __name__ == "__main__": unittest.main() # pytest test/utils/coder/test_CoSTEER.py From 75b40d86f43b89162e0bcdbc9e30c70f15d6ae57 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 25 Dec 2024 10:33:03 +0000 Subject: [PATCH 125/304] feature spec changes --- .../feature/eval_tests/feature_test.py | 14 +++----------- .../data_science/raw_data_loader/prompts.yaml | 10 ++++++++++ .../tpl_ex/aerial-cactus-identification/main.py | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index a9d812567..fb491da8e 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -7,23 +7,15 @@ Please make sure the stdout is rich enough to support informative feedback """ -import logging import pickle import numpy as np import pandas as pd -from feat01 import feat_eng - -# Setup logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") - -# Load data from load_data import load_data +from feat01 import feat_eng X, y, X_test, test_ids = load_data() - -X, y, X_param = feat_eng(X, y) -X_test, _, _ = feat_eng(X_test, param=X_param) +X, y, X_test = feat_eng(X, y, X_test) # Validate the conditions mentioned in the docstring @@ -41,4 +33,4 @@ else: raise TypeError("Unsupported data type for X and y") -logging.info("Data loader test passed successfully. Length of test images matches length of test IDs.") +print("Data loader test passed successfully. Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index c575fff1b..a4415318a 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -76,7 +76,17 @@ spec: feature: |- Feature engineering specification text should adhere to the following requirements: 1. Function Interface: + - Give a python function interface code with docstring. - The function must be named `feat_eng`. + - Parameters: + - `X`: Train data to be transformed. + - `y`: Train label data. + - `X_test`: Test data. + - Output: + - `X_transformed`: Transformed train data. 
+ - `y_transformed`: Transformed train label data. + - `X_test_transformed`: Transformed test data. + - Must include proper and specific annotations for both input and output based on the Competition Information: - Input: Specify the expected input data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). - Output: Specify the transformed output data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py index 139c0803f..3d45b0ec0 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py @@ -8,7 +8,7 @@ # feature engineering from feat01 import feat_eng -train_images, train_lables, train_param = feat_eng(train_images, train_labels) +train_images, train_lables, train_param = feat_eng(train_images, train_labels, train_images, train_labels) test_images, _, _ = feat_eng(test_images, param=train_param) From e8f241010e6e6b76f23e192bddad4bf700dab3ce Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 25 Dec 2024 10:35:22 +0000 Subject: [PATCH 126/304] fix --- rdagent/app/data_science/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index f3560ddd7..50e3f334c 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -100,7 +100,7 @@ def feedback(self, prev_out: dict[str, Any]): self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) -def main(path=None, step_n=None, competition="bms-molecular-translation"): +def main(path=None, step_n=None, competition=None): """ Auto R&D Evolving loop for models in a kaggle{} scenario. You can continue running session by From 418c2ce2cc5645963319faf6ea906d84294bbd3a Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 26 Dec 2024 02:35:27 +0000 Subject: [PATCH 127/304] changeimport order --- rdagent/oai/llm_conf.py | 2 +- rdagent/scenarios/data_science/scen/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/oai/llm_conf.py b/rdagent/oai/llm_conf.py index 15bce4ed8..f3c2498d6 100644 --- a/rdagent/oai/llm_conf.py +++ b/rdagent/oai/llm_conf.py @@ -24,7 +24,7 @@ class LLMSettings(ExtendedBaseSettings): # Behavior of returning answers to the same question when caching is enabled use_auto_chat_cache_seed_gen: bool = False """ - `_create_chat_completion_inner_function` provdies a feature to pass in a seed to affect the cache hash key + `_create_chat_completion_inner_function` provides a feature to pass in a seed to affect the cache hash key We want to enable a auto seed generator to get different default seed for `_create_chat_completion_inner_function` if seed is not given. So the cache will only not miss you ask the same question on same round. 
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index 21ef7e0e6..8aaf93146 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -1,4 +1,4 @@ -from .kaggle import KaggleScen from .scen import DataScienceScen +from .kaggle import KaggleScen __all__ = ["DataScienceScen", "KaggleScen"] From fe10da7870581650a6c188545473176043a24451 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 26 Dec 2024 03:51:43 +0000 Subject: [PATCH 128/304] clear unnecessary std outputs --- .../components/coder/data_science/raw_data_loader/prompts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index a4415318a..6d8f95d46 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -148,6 +148,7 @@ spec: - Return hyperparameters for retrain if not exists. - Perform model training on `X` and `y`, and evaluate using `val_X` and `val_y`. - If `test_X` is provided, generate predictions for it. + - If there is a training progress bar, do not output it to stdout {% if latest_spec %} 3. Former Specification: @@ -214,6 +215,7 @@ spec: 4. Code Standards: - Use consistent naming conventions and type annotations. - Document the workflow with clear comments and docstrings. + - If there is a training progress bar, do not output it to stdout {% if latest_spec %} 5. Former Specification: From a4e3cedeb736cbf83954e4302ec30acae7cf5b30 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 26 Dec 2024 03:55:21 +0000 Subject: [PATCH 129/304] fix a typo --- .../coder/data_science/feature/eval_tests/feature_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index fb491da8e..0c255f7bb 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -33,4 +33,4 @@ else: raise TypeError("Unsupported data type for X and y") -print("Data loader test passed successfully. Length of test images matches length of test IDs.") +print("Feature Engineering test passed successfully. 
Length of test images matches length of test IDs.") From e009fd7808914d38ce44fc3a04d76de1f55e9c88 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 26 Dec 2024 06:30:33 +0000 Subject: [PATCH 130/304] create sample folder after unzip kaggle data --- rdagent/scenarios/data_science/debug/data.py | 8 ++++++-- rdagent/scenarios/kaggle/kaggle_crawler.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 597dfb34b..9a21f234b 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -121,6 +121,7 @@ def create_debug_data( dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path if sample_path is None: + # Create a sample folder under the dataset folder, which should be available in docker container sample_path = Path(dataset_path) / "sample" data_folder = Path(dataset_path) / competition @@ -159,8 +160,11 @@ def create_debug_data( df_sampled = data_reducer.reduce(df) # Dump the sampled data - data_handler.dump(df_sampled, sampled_file_path) - + try: + data_handler.dump(df_sampled, sampled_file_path) + except Exception as e: + print(f"Error processing {file_path}: {e}") + continue if __name__ == "__main__": fire.Fire(create_debug_data) diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 069ca655a..df78c3609 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -22,6 +22,7 @@ from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend from rdagent.utils.env import MLEBDockerEnv +from rdagent.scenarios.data_science.debug.data import create_debug_data # %% options = webdriver.ChromeOptions() @@ -161,6 +162,9 @@ def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPL unzip_data(unzip_file_path=f"{zipfile_path}/{competition}.zip", unzip_target_path=unzip_path) for sub_zip_file in Path(unzip_path).rglob("*.zip"): unzip_data(sub_zip_file, unzip_target_path=unzip_path) + + # sample data + create_debug_data(competition, dataset_path=local_path) def unzip_data(unzip_file_path: str, unzip_target_path: str) -> None: From 36d26eeacb76709a52c3742094e552ee72428c97 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 26 Dec 2024 06:56:45 +0000 Subject: [PATCH 131/304] feature/model test script update --- .../data_science/feature/eval_tests/feature_test.py | 11 ----------- .../data_science/model/eval_tests/model_execute.py | 7 ++++++- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index 0c255f7bb..e94fb58ec 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -21,16 +21,5 @@ # Validate the conditions mentioned in the docstring assert len(X_test) == len(test_ids), "Mismatch in length of test images and test IDs" assert len(X) == len(y), "Mismatch in length of training images and labels" -# Check for missing values -if isinstance(X, pd.DataFrame): - assert not X.isnull().values.any(), "Missing values found in training data" - assert not X_test.isnull().values.any(), "Missing values found in test data" - assert not y.isnull().values.any(), "Missing values found in labels" -elif isinstance(X, np.ndarray): - assert 
not np.isnan(X).any(), "Missing values found in training data" - assert not np.isnan(X_test).any(), "Missing values found in test data" - assert not np.isnan(y).any(), "Missing values found in labels" -else: - raise TypeError("Unsupported data type for X and y") print("Feature Engineering test passed successfully. Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index 1184e99e5..ea5756248 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -7,11 +7,14 @@ import traceback import numpy as np +from sklearn.model_selection import train_test_split from load_data import load_data from model01 import model_workflow -from sklearn.model_selection import train_test_split +from feat01 import feat_eng X, y, test_X, test_ids = load_data() +X, y, test_X = feat_eng(X, y, test_X) + train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) @@ -64,3 +67,5 @@ else: execution_feedback_str += "Hyperparameters are None.\n" print(execution_feedback_str) + +print("Model code test passed successfully.") From 08df71af12588839a5f2ea353ed092baa0d4b58b Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 26 Dec 2024 07:20:09 +0000 Subject: [PATCH 132/304] Align the data types across modules. --- rdagent/app/data_science/loop.py | 2 +- .../data_science/raw_data_loader/prompts.yaml | 32 +++++++++++-------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 50e3f334c..f3560ddd7 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -100,7 +100,7 @@ def feedback(self, prev_out: dict[str, Any]): self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) -def main(path=None, step_n=None, competition=None): +def main(path=None, step_n=None, competition="bms-molecular-translation"): """ Auto R&D Evolving loop for models in a kaggle{} scenario. You can continue running session by diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 6d8f95d46..5793f2228 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -40,6 +40,7 @@ spec: data_loader: |- Data loader specification text should follow these detailed requirements: 1. Function Interface: + - Give a python function interface code with docstring. - The function must be named `load_data`. - All raw data files are located in the /kaggle/input/ directory; therefore, the function should not take any input arguments. - The function must include proper and specific annotations for the output, specifying the expected data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). @@ -47,20 +48,23 @@ spec: - Describes the purpose of the function. - Mentions the source of the data (e.g., data location or structure). - Explains the expected output format. + - Input: None + - Output: + - The function should return four objects: `X`, `y`, `X_test`, and `test_ids`. + - `X`: The feature matrix for the training data. + - `y`: The target vector for the training data. + - `X_test`: The feature matrix for the test data. 
+ - `test_ids`: The identifiers for the test data. 2. Precautions for Data Loading and Preprocessing: - Handle potential issues such as (You should depend on the competition information to make a concise specification): - - File encoding (e.g., UTF-8) and data delimiters (e.g., CSV comma-separated). - - Missing values in datasets: describe how they should be handled (e.g., fill with a specific value, drop rows, etc.). - - Data types: ensure proper type conversion (e.g., numeric columns, date parsing). - - Memory efficiency for large datasets: consider techniques such as downcasting types or reading data in chunks. - - Multiple files: if the dataset includes multiple files, specify how they should be combined or processed. - - Add any domain-specific handling (e.g., date formatting, specific transformations) relevant to the competition dataset. - 3. Output: - - The function should return four objects: `X`, `y`, `X_test`, and `test_ids`. - - `X`: The feature matrix for the training data. - - `y`: The target vector for the training data. - - `X_test`: The feature matrix for the test data. - - `test_ids`: The identifiers for the test data. + - File encoding (e.g., UTF-8) and data delimiters (e.g., CSV comma-separated). + - Missing values in datasets: describe how they should be handled (e.g., fill with a specific value, drop rows, etc.). + - Data types: ensure proper type conversion (e.g., numeric columns, date parsing). + - Memory efficiency for large datasets: consider techniques such as downcasting types or reading data in chunks. + - Multiple files: if the dataset includes multiple files, specify how they should be combined or processed. + - Add any domain-specific handling (e.g., date formatting, specific transformations) relevant to the competition dataset. + - Do not use progress bars (e.g., tqdm) in the code. + {% if latest_spec %} 4. Former Specification: @@ -148,7 +152,7 @@ spec: - Return hyperparameters for retrain if not exists. - Perform model training on `X` and `y`, and evaluate using `val_X` and `val_y`. - If `test_X` is provided, generate predictions for it. - - If there is a training progress bar, do not output it to stdout + - Do not use progress bars (e.g., tqdm) in the code. {% if latest_spec %} 3. Former Specification: @@ -215,7 +219,7 @@ spec: 4. Code Standards: - Use consistent naming conventions and type annotations. - Document the workflow with clear comments and docstrings. - - If there is a training progress bar, do not output it to stdout + - Do not use progress bars (e.g., tqdm) in the code. {% if latest_spec %} 5. Former Specification: From c02fd79dd644a04386f55cd10e016db5610e5eb5 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 26 Dec 2024 07:25:06 +0000 Subject: [PATCH 133/304] fix a bug in model eval --- rdagent/components/coder/data_science/model/prompts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index c3b8c8f37..97eb93ffd 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -78,6 +78,8 @@ model_eval: You should evaluate the code given by the user. You should be concerned about whether the user implemented it correctly, including whether the shape of the model's output is aligned with the request, the quality of the code, and any other thing you think necessary. 
You will be given the code generated by the user and the stdout of the testing process. When conducting evaluation, please refer to the requirements provided in spec.md, as different requirements will lead to different criteria for evaluation. + + Only if there is "Model code test passed successfully." in the stdout, then the model is considered successfu, or else there must be some issues with the model. Please respond with your feedback in the following JSON format and order: ```json From d3e3f60f5633c3bcbd01ac3d701c8cdaee05c915 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 26 Dec 2024 07:40:44 +0000 Subject: [PATCH 134/304] show line number --- rdagent/log/ui/llm_st.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 7c70108a1..cbcb54fc5 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -67,7 +67,7 @@ def show_text(text, lang=None): if lang is not None: st.code(text, language=lang, wrap_lines=True) elif "```py" in text: - st.code(text, language="python", wrap_lines=True) + st.code(text, language="python", wrap_lines=True, line_numbers=True) else: st.code(text, language="html", wrap_lines=True) @@ -143,7 +143,7 @@ def extract_evoid(tag): if "code" in rdict: code = rdict["code"] st.markdown(":red[**Code in response dict:**]") - st.code(code, language="python", wrap_lines=True) + st.code(code, language="python", wrap_lines=True, line_numbers=True) rdict.pop("code") elif "spec" in rdict: spec = rdict["spec"] From fa21c04422f3fd6bf1d800113659730d4caea25c Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 26 Dec 2024 07:54:10 +0000 Subject: [PATCH 135/304] move sample entry point to app --- rdagent/app/data_science/debug.py | 11 +++++++---- rdagent/scenarios/data_science/debug/data.py | 3 --- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rdagent/app/data_science/debug.py b/rdagent/app/data_science/debug.py index c92f7551b..06b0af6e4 100644 --- a/rdagent/app/data_science/debug.py +++ b/rdagent/app/data_science/debug.py @@ -1,4 +1,7 @@ -""" -entry of debuggging function -- e.g. 
creating debug datasets -""" +import fire + +from rdagent.scenarios.data_science.debug.data import create_debug_data + + +if __name__ == "__main__": + fire.Fire(create_debug_data) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 9a21f234b..120b0ae08 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -2,7 +2,6 @@ from pathlib import Path import platform import pandas as pd -import fire import shutil from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING @@ -166,5 +165,3 @@ def create_debug_data( print(f"Error processing {file_path}: {e}") continue -if __name__ == "__main__": - fire.Fire(create_debug_data) From 668271171442a9f30eda66d2190a2507dad279a5 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 27 Dec 2024 10:06:12 +0000 Subject: [PATCH 136/304] spec & model prompt changes --- .../model/eval_tests/model_execute.py | 2 +- .../coder/data_science/model/prompts.yaml | 2 +- .../data_science/raw_data_loader/prompts.yaml | 39 ++++++++++--------- .../coder/data_science/workflow/__init__.py | 2 +- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py index ea5756248..e6ea885fa 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_execute.py @@ -52,7 +52,7 @@ print(execution_feedback_str) print("The second execution begins.\n") -val_pred, test_pred, finalhypers = model_workflow(X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, **hypers) +val_pred, test_pred, finalhypers = model_workflow(X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, hyper_params=hypers) execution_feedback_str = "The second Execution successful.\n" if val_pred is not None: execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 97eb93ffd..9f5899d06 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -42,7 +42,7 @@ model_coder: =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} - {% endif %} + {% endif %} user: |- ---------Model Specification--------- diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 5793f2228..09fda366d 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -13,6 +13,8 @@ spec: - Ensemble - The overall workflow + The specifications for each step should refer to the specifications in the previous sections and be tailored to the competition information provided. + -----------Competition Information----------- {{ competition_info }} @@ -123,31 +125,28 @@ spec: model: |- Model building specification text should adhere to the following requirements: 1. Function Interface: + - Give a python function interface code with docstring. - The function name must be `model_workflow`. - - The function should include: - - Type annotations for all inputs and outputs. 
- - Input and output shapes: - - Input: - - `X`: A 4D NumPy array of shape `(num_samples, height, width, channels)`, where: - - `num_samples`: Number of training samples. - - `height` and `width`: Dimensions of the image (e.g., `224 x 224`). - - `channels`: Number of color channels (e.g., `3` for RGB). - - `y`: A 2D NumPy array of shape `(num_samples, 1)`, where `1` represents binary classification labels. - - Optional: - - `val_X`: Validation features of shape `(num_val_samples, height, width, channels)`. - - `val_y`: Validation labels of shape `(num_val_samples, 1)`. - - `test_X`: Test features of shape `(num_test_samples, height, width, channels)`. - - `hyper_params`: A dictionary of important hyperparameters for model configuration. - - Output: - - A tuple consisting of: - - `pred_val`: Predictions on validation data (`np.ndarray` of shape `(num_val_samples, 1)` or `None`). - - `pred_test`: Predictions on test data (`np.ndarray` of shape `(num_test_samples, 1)` or `None`). - - `hyper_params`: A dictionary of important hyperparameters for model configuration. + - Provide annotations for all inputs and outputs. + - Input: + - `X`: training features. + - `y`: training labels. + - Optional: + - `val_X`: Validation features. + - `val_y`: Validation labels. + - `test_X`: Test features. + - `hyper_params`: A dictionary of important hyperparameters for model configuration. + - Output: + - A tuple consisting of: + - `pred_val`: Predictions on validation data. + - `pred_test`: Predictions on test data. + - `hyper_params`: A dictionary of important hyperparameters for model configuration. - Include a clear and concise docstring to explain the function's purpose, its input parameters, and its expected return values. 2. Precautions: - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have the correct shapes and consistent dimensions. + - You should check and handle outliers in your input data. - Use default values for hyperparameters if none are provided in `hyper_params`. - Return hyperparameters for retrain if not exists. - Perform model training on `X` and `y`, and evaluate using `val_X` and `val_y`. @@ -276,6 +275,8 @@ data_loader_coder: You should follow the former code to improve it. {% endif %} + You should strictly follow the function interface specifications provided by the specification to implement the function. + data_loader_eval: system: |- diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 6d78b480f..454732b1f 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -50,7 +50,7 @@ def implement_one_task( load_data_code=workspace.file_dict["load_data.py"], feature_code=workspace.file_dict["feat01.py"], model_code=workspace.file_dict["model01.py"], - ensemble_code=workspace.file_dict["ens.py"], + ensemble_code=workspace.file_dict["ensemble.py"], latest_code=workspace.file_dict.get("main.py"), workflow_spec=workspace.file_dict["spec/workflow.md"], ) From f8113b2c0fc5a4ddc4e860b601f7534fd3c7b98d Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Fri, 27 Dec 2024 10:06:14 +0000 Subject: [PATCH 137/304] Refine the competition specification to address the data type problem and the coherence issue. 
--- .../data_science/raw_data_loader/prompts.yaml | 219 +++++++++++------- 1 file changed, 130 insertions(+), 89 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 09fda366d..b25f6d051 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -42,34 +42,38 @@ spec: data_loader: |- Data loader specification text should follow these detailed requirements: 1. Function Interface: - - Give a python function interface code with docstring. - - The function must be named `load_data`. - - All raw data files are located in the /kaggle/input/ directory; therefore, the function should not take any input arguments. - - The function must include proper and specific annotations for the output, specifying the expected data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). - - A clear docstring should be provided that: - - Describes the purpose of the function. - - Mentions the source of the data (e.g., data location or structure). - - Explains the expected output format. - - Input: None + - Function Name: `load_data` + - Input: No input arguments. - Output: - - The function should return four objects: `X`, `y`, `X_test`, and `test_ids`. - - `X`: The feature matrix for the training data. - - `y`: The target vector for the training data. - - `X_test`: The feature matrix for the test data. - - `test_ids`: The identifiers for the test data. - 2. Precautions for Data Loading and Preprocessing: - - Handle potential issues such as (You should depend on the competition information to make a concise specification): - - File encoding (e.g., UTF-8) and data delimiters (e.g., CSV comma-separated). - - Missing values in datasets: describe how they should be handled (e.g., fill with a specific value, drop rows, etc.). - - Data types: ensure proper type conversion (e.g., numeric columns, date parsing). - - Memory efficiency for large datasets: consider techniques such as downcasting types or reading data in chunks. - - Multiple files: if the dataset includes multiple files, specify how they should be combined or processed. - - Add any domain-specific handling (e.g., date formatting, specific transformations) relevant to the competition dataset. - - Do not use progress bars (e.g., tqdm) in the code. + - `X` (DT, define based on competition information): Feature matrix for training data. + - `y` (DT): Target vector for training data. + - `X_test` (DT): Feature matrix for test data. + - `test_ids` (DT): Identifiers for the test data. + - Docstring Requirements: + - Describe the purpose of the function. + - Specify the data source location (`/kaggle/input/`). + - Clearly define the structure and type of the output. + 2. Precautions for Data Loading and Preprocessing: + - File Handling: + - Ensure proper file encoding (e.g., UTF-8) and delimiters (e.g., CSV comma-separated). + - Combine or process multiple files if necessary. + - Data Preprocessing: + - Convert data types correctly (e.g., numeric, categorical, date parsing). + - Handle missing values appropriately (e.g., impute, drop rows/columns). + - Optimize memory usage for large datasets using techniques like downcasting or reading data in chunks if necessary. + - Domain-Specific Handling: + - Apply competition-specific preprocessing steps as needed (e.g., text tokenization, image resizing). + + 3. 
Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 4. Notes: + - Update `DT` (data type) based on the specific competition dataset. This can include `pd.DataFrame`, `np.array`, `torch.Tensor`, etc. + - Extend domain-specific handling steps based on the competition information. {% if latest_spec %} - 4. Former Specification: + 5. Former Specification: {{ latest_spec }} You should follow the provided specifications to improve this task. {% endif %} @@ -82,37 +86,41 @@ spec: feature: |- Feature engineering specification text should adhere to the following requirements: 1. Function Interface: - - Give a python function interface code with docstring. - - The function must be named `feat_eng`. + - Function Name: `feat_eng` - Parameters: - - `X`: Train data to be transformed. - - `y`: Train label data. - - `X_test`: Test data. + - `X` (DT): Train data to be transformed. + - `y` (DT): Train label data. + - `X_test` (DT): Test data. - Output: - - `X_transformed`: Transformed train data. - - `y_transformed`: Transformed train label data. - - `X_test_transformed`: Transformed test data. - - - Must include proper and specific annotations for both input and output based on the Competition Information: - - Input: Specify the expected input data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). - - Output: Specify the transformed output data type (e.g., `pd.DataFrame`, `dict`, `np.array`, etc.). - - You should depend on the competition information to make a concise specification. - - A comprehensive docstring must be provided that: - - Describes the purpose of the function. - - Clarifies the input parameters and their types. - - Defines the structure and format of the output. - 2. Precautions for Feature Engineering (You should depend on the competition information to make a concise specification): - - If feature engineering is strictly part of the model pipeline and should not be done here, explicitly state that feature engineering will be handled at the model stage. - - If the competition requirements or modeling strategy dictate that feature engineering must be integrated into the model pipeline, this function will remain as a placeholder and return the input data unchanged. - - When feature engineering is applied, consider the following precautions: + - `X_transformed` (DT): Transformed train data. + - `y_transformed` (DT): Transformed train label data. + - `X_test_transformed` (DT): Transformed test data. + - Docstring Requirements: + - Describe the purpose of the function. + - Clarify the input parameters and their data types. + - Define the structure and format of the output. + + 2. Precautions for Feature Engineering: + - Integration with Model Pipeline + - If feature engineering is strictly part of the model pipeline, state explicitly that it will be handled at the model stage. + - If integrated here, ensure this function applies all required transformations while avoiding data leakage. + - General Considerations: - Ensure scalability for large datasets. - - Handle missing values and outliers appropriately during feature transformation. - - Feature types: Ensure consistency between feature data types and transformations. - - Custom features: Provide logic for domain-specific features, if applicable. + - Handle missing values and outliers appropriately (e.g., impute, remove, or replace). + - Ensure consistency between feature data types and transformations. 
- Avoid data leakage: Only use features derived from training data, excluding information from test or validation sets. + - Domain-Specific Features: + - Apply logic for competition-specific features (e.g., text vectorization, image augmentations, categorical encoding). + + 3. Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 4. Notes: + - Align `DT` (data type) definitions with those in the Data Loader specification. + - Extend or adjust domain-specific transformations based on competition requirements. {% if latest_spec %} - 3. Former Specification: + 5. Former Specification: {{ latest_spec }} You should follow the provided specifications to improve this task. {% endif %} @@ -124,37 +132,58 @@ spec: model: |- Model building specification text should adhere to the following requirements: + 1. Function Interface: - - Give a python function interface code with docstring. - - The function name must be `model_workflow`. - - Provide annotations for all inputs and outputs. - - Input: - - `X`: training features. - - `y`: training labels. - - Optional: - - `val_X`: Validation features. - - `val_y`: Validation labels. - - `test_X`: Test features. - - `hyper_params`: A dictionary of important hyperparameters for model configuration. + - Function Name: `model_workflow` + - Parameters: + - `X` (DT): Training feature data. + - `y` (DT): Training label data. + - `val_X` (Optional[DT]): Validation feature data. + - `val_y` (Optional[DT]): Validation label data. + - `test_X` (Optional[DT]): Test feature data. + - `hyper_params` (dict): Dictionary of hyperparameters for model configuration. - Output: - - A tuple consisting of: - - `pred_val`: Predictions on validation data. - - `pred_test`: Predictions on test data. - - `hyper_params`: A dictionary of important hyperparameters for model configuration. - - - Include a clear and concise docstring to explain the function's purpose, its input parameters, and its expected return values. - - 2. Precautions: - - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have the correct shapes and consistent dimensions. - - You should check and handle outliers in your input data. - - Use default values for hyperparameters if none are provided in `hyper_params`. - - Return hyperparameters for retrain if not exists. - - Perform model training on `X` and `y`, and evaluate using `val_X` and `val_y`. + - `pred_val` (Optional[DT]): Predictions on validation data. + - `pred_test` (Optional[DT]): Predictions on test data. + - `hyper_params` (dict): Updated dictionary of hyperparameters after training. + - Docstring Requirements: + - Describe the purpose of the function. + - Clarify the input parameters and their data types. + - Define the structure and format of the output. + + 2. Function Details: + - Input Shapes: + - `X`: A 4D array with shape `(num_samples, height, width, channels)`. + - `num_samples`: Number of training samples. + - `height` and `width`: Dimensions of the feature (e.g., `224 x 224` for images). + - `channels`: Number of channels (e.g., `3` for RGB). + - `y`: A 2D array with shape `(num_samples, 1)`. + - Binary classification labels, where `1` represents the target variable. + - Optional inputs: + - `val_X`: Validation features with shape `(num_val_samples, height, width, channels)`. + - `val_y`: Validation labels with shape `(num_val_samples, 1)`. + - `test_X`: Test features with shape `(num_test_samples, height, width, channels)`. 
+ - Output Details: + - `pred_val`: Predictions for validation data as a 2D array `(num_val_samples, 1)` or `None` if no validation data is provided. + - `pred_test`: Predictions for test data as a 2D array `(num_test_samples, 1)` or `None` if no test data is provided. + - `hyper_params`: Updated dictionary of hyperparameters. + + 3. Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 4. Precautions: + - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have consistent dimensions and shapes. + - Use default values for hyperparameters if `hyper_params` is not provided. + - Train the model on `X` and `y`. + - Evaluate the model using `val_X` and `val_y` if validation data is available. - If `test_X` is provided, generate predictions for it. - - Do not use progress bars (e.g., tqdm) in the code. + - Do not use progress bars (e.g., `tqdm`) in the implementation. + + 5. Notes: + - Align `DT` (data type) with the definitions used in Feature Engineering specifications. {% if latest_spec %} - 3. Former Specification: + 6. Former Specification: {{ latest_spec }} You should follow the provided specifications to improve this task. {% endif %} @@ -168,24 +197,36 @@ spec: ensemble: |- Ensemble specification text adhere to the following requirements: 1. Function Interface: - - The function name must be `ens_and_decision`. - - The function should include: - - Type annotations for both inputs and outputs. - - Input (for example): - - `test_pred_l`: A list of NumPy arrays (as an example, if you think predictions should be represented as Pandas DataFrames, use `pd.DataFrame`) containing predictions for the test data. - - `val_pred_l`: A list of NumPy arrays containing predictions for the validation data. - - `val_label`: A 1D NumPy array of true labels for the validation data. - - Output: - - A 1D NumPy array containing the final binary predictions for the test data. - - Include a docstring that describes the purpose of the function, the parameters, and the expected return value. + - Function Name: `ens_and_decision` + - Parameters: + - `test_pred_l` (List[DT]): A list of predictions for the test data. + - `val_pred_l` (List[DT]): A list of predictions for the validation data. + - `val_label` (DT): A 1D array or series of true labels for the validation data. + - Output: + - `final_predictions` (DT): A 1D array or series containing the final binary predictions for the test data. + - Docstring Requirements: + - Describe the purpose of the function. + - Clarify the input parameters and their data types. + - Define the structure and format of the output. 2. Precautions: - - Ensure all predictions in `test_pred_l` and `val_pred_l` have the same shape and dimensions. - - Validate that `val_label` is provided and has the same length as `val_pred_l` predictions. - - Perform checks to handle empty or invalid inputs gracefully. + - Validation of Inputs: + - Ensure all predictions in `test_pred_l` and `val_pred_l` have consistent shapes and dimensions. + - Verify that `val_label` is provided and matches the length of `val_pred_l` predictions. + - Handle empty or invalid inputs gracefully with appropriate error messages. + - Consensus Strategy: + - Clearly define how the ensemble predictions are aggregated (e.g., majority voting, weighted average). + - Avoid introducing biases or overfitting during decision-making. + + 3. Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 4. 
Notes: + - Align `DT` (data type) definitions with those used in model specifications. + - Ensure flexibility to handle multiple ensemble strategies based on competition requirements. {% if latest_spec %} - 3. Former Specification: + 5. Former Specification: {{ latest_spec }} You should follow the provided specifications to improve this task. {% endif %} From 36b419167563003c34287b83bceebd2b823891c7 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Fri, 27 Dec 2024 10:20:05 +0000 Subject: [PATCH 138/304] fix some bugs --- .../data_science/raw_data_loader/prompts.yaml | 29 ++++--------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index b25f6d051..767ae5578 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -80,7 +80,7 @@ spec: Please respond with a JSON structure as follows: { - "spec": "The corresponding specification string as described above. You should create the rules based on the competition information instead of copying the requirements." + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." } feature: |- @@ -127,7 +127,7 @@ spec: Please respond with a JSON structure as follows: { - "spec": "The corresponding specification string as described above. You should create the rules based on the competition information instead of copying the requirements." + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." } model: |- @@ -151,27 +151,10 @@ spec: - Clarify the input parameters and their data types. - Define the structure and format of the output. - 2. Function Details: - - Input Shapes: - - `X`: A 4D array with shape `(num_samples, height, width, channels)`. - - `num_samples`: Number of training samples. - - `height` and `width`: Dimensions of the feature (e.g., `224 x 224` for images). - - `channels`: Number of channels (e.g., `3` for RGB). - - `y`: A 2D array with shape `(num_samples, 1)`. - - Binary classification labels, where `1` represents the target variable. - - Optional inputs: - - `val_X`: Validation features with shape `(num_val_samples, height, width, channels)`. - - `val_y`: Validation labels with shape `(num_val_samples, 1)`. - - `test_X`: Test features with shape `(num_test_samples, height, width, channels)`. - - Output Details: - - `pred_val`: Predictions for validation data as a 2D array `(num_val_samples, 1)` or `None` if no validation data is provided. - - `pred_test`: Predictions for test data as a 2D array `(num_test_samples, 1)` or `None` if no test data is provided. - - `hyper_params`: Updated dictionary of hyperparameters. - - 3. Code Standards: + 2. Code Standards: - Avoid using progress bars (e.g., `tqdm`) in the implementation. - 4. Precautions: + 3. Precautions: - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have consistent dimensions and shapes. - Use default values for hyperparameters if `hyper_params` is not provided. - Train the model on `X` and `y`. @@ -179,11 +162,11 @@ spec: - If `test_X` is provided, generate predictions for it. - Do not use progress bars (e.g., `tqdm`) in the implementation. - 5. Notes: + 4. 
Notes: - Align `DT` (data type) with the definitions used in Feature Engineering specifications. {% if latest_spec %} - 6. Former Specification: + 5. Former Specification: {{ latest_spec }} You should follow the provided specifications to improve this task. {% endif %} From 34aa750ad1e8e9b8abff922518c5b5c103cf22df Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 27 Dec 2024 10:27:30 +0000 Subject: [PATCH 139/304] add file filter in FBworkspace.code property --- rdagent/components/coder/data_science/model/eval.py | 4 ++-- .../model/eval_tests/{model_execute.py => model_test.py} | 0 rdagent/core/experiment.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) rename rdagent/components/coder/data_science/model/eval_tests/{model_execute.py => model_test.py} (100%) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 64a2273c9..517865357 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -65,8 +65,8 @@ def evaluate( ds_docker_conf = DSDockerConf() ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) - fname = "model_execute.py" - with (DIRNAME / "eval_tests" / "model_execute.py").open("r") as f: + fname = "model_test.py" + with (DIRNAME / "eval_tests" / fname).open("r") as f: test_code = f.read() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_execute.py b/rdagent/components/coder/data_science/model/eval_tests/model_test.py similarity index 100% rename from rdagent/components/coder/data_science/model/eval_tests/model_execute.py rename to rdagent/components/coder/data_science/model/eval_tests/model_test.py diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 4c10b7520..6fd23a7f0 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -118,7 +118,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: def code(self) -> str: code_string = "" for file_name, code in self.file_dict.items(): - code_string += f"File: {file_name}\n{code}\n" + if file_name.endswith(".py") and 'test' not in file_name: + code_string += f"File: {file_name}\n{code}\n" return code_string def prepare(self) -> None: From d30ff40fbcafe19afc25b98a7b5c1add6d387020 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 27 Dec 2024 10:31:52 +0000 Subject: [PATCH 140/304] support non-binary prediction --- .../data_science/ensemble/eval_tests/ensemble_test.py | 5 +---- rdagent/components/coder/data_science/model/prompts.yaml | 8 ++++---- .../coder/data_science/raw_data_loader/prompts.yaml | 2 +- .../tpl_ex/aerial-cactus-identification/spec/ensemble.md | 6 +++--- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index 6f868d357..7bc685b3f 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -1,7 +1,7 @@ """ A qualified ensemble implementation should: - Successfully run -- Return binary predictions +- Return predictions - Have correct shapes for inputs and outputs - Use validation data appropriately """ @@ -30,9 +30,6 @@ # Check shape 
assert final_predictions.shape == (n_samples, 1), "Wrong output shape" - # Check binary values - assert np.all(np.isin(final_predictions, [0, 1])), "Predictions must be binary (0 or 1)" - logging.info("Ensemble test passed successfully.") logging.info(f"Output shape: {final_predictions.shape}") logging.info(f"Unique values in predictions: {np.unique(final_predictions)}") diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 9f5899d06..fcd532edb 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -70,8 +70,8 @@ model_eval: ``` The first time you execute it, you will not provide test inputs, only train, valid inputs, and empty hyperparameters. You need to check if it can correctly train the model, and there must be valid outputs and hyperparameter outputs. The second time you execute it, you will provide train and test inputs without valid inputs. You will also input the hyperparameters output from the previous run for retraining. - Therefore, during the evaluation you must check: - - The hyperparameters returned must not be none. It should have parameters that will be useful for retraining later. It must include the early stop round. + Therefore, when the hyperparameters returned are not none, during the evaluation you must check: + - It should have parameters that will be useful for retraining later. It must include the early stop round. - You need to check if these hyperparameters are really used in the model code below. The early stop round must be used if given. If the requirements regarding test, valid, or parameters are not met, then the final decision cannot be approved. @@ -79,14 +79,14 @@ model_eval: You will be given the code generated by the user and the stdout of the testing process. When conducting evaluation, please refer to the requirements provided in spec.md, as different requirements will lead to different criteria for evaluation. - Only if there is "Model code test passed successfully." in the stdout, then the model is considered successfu, or else there must be some issues with the model. + Only if there is "Model code test passed successfully." in the stdout, then the model is considered successful, or else there must be some issues with the model. Please respond with your feedback in the following JSON format and order: ```json { "execution": "Describe whether the model executed successfully, including any errors or issues encountered.", "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in spec.md. You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", - "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code. 
It is acceptable when hyperparameters is None.", "final_decision": } ``` diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 767ae5578..7ab18e1d7 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -186,7 +186,7 @@ spec: - `val_pred_l` (List[DT]): A list of predictions for the validation data. - `val_label` (DT): A 1D array or series of true labels for the validation data. - Output: - - `final_predictions` (DT): A 1D array or series containing the final binary predictions for the test data. + - `final_predictions` (DT): A 1D array or series containing the final predictions for the test data. - Docstring Requirements: - Describe the purpose of the function. - Clarify the input parameters and their data types. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md index e3d04236e..68326d19f 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md @@ -7,7 +7,7 @@ def ens_and_decision(test_pred_l: list[np.ndarray], val_pred_l: list[np.ndarray] """ Handle the following: 1) Ensemble predictions using a simple average. - 2) Make final decision after ensemble (convert the predictions to final binary form). + 2) Make final decision after ensemble (convert the predictions to final form). Parameters ---------- @@ -21,8 +21,8 @@ def ens_and_decision(test_pred_l: list[np.ndarray], val_pred_l: list[np.ndarray] Returns ------- np.ndarray - Binary predictions on the test data. + Predictions on the test data. """ ``` -- The function should combine predictions and convert them to a binary format. +- The function should combine predictions and convert them to a proper format. From 72bfa9067ef8fc700d2079bea0dd1a65f137e544 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 27 Dec 2024 10:49:59 +0000 Subject: [PATCH 141/304] avoid too much warnings --- rdagent/utils/env.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index 61ce00f6a..b26955937 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -311,6 +311,7 @@ def __run( ) -> str: if env is None: env = {} + env["PYTHONWARNINGS"] = "ignore" client = docker.from_env() volumns = {} From d8b5a4cf65e064b7a3e990764c667c968b94a838 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 30 Dec 2024 07:02:07 +0000 Subject: [PATCH 142/304] fix a bug in ensemble module --- .../components/coder/data_science/raw_data_loader/prompts.yaml | 2 +- rdagent/components/coder/data_science/workflow/prompts.yaml | 2 +- rdagent/components/coder/data_science/workflow/test.py | 2 +- .../kaggle/tpl_ex/aerial-cactus-identification/main.py | 2 +- .../kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 7ab18e1d7..0f4a47eda 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -227,7 +227,7 @@ spec: - Data loading (`load_data.py`). - Feature engineering (`feat*.py`). 
- Model workflow for training and testing (`model*.py`). - - Ensemble and decision-making (`ens.py`). + - Ensemble and decision-making (`ensemble.py`). - Treat each component as a modular and callable Python function. 2. Dataset Splitting diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index c0214d6a0..403cea9ba 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -57,7 +57,7 @@ workflow_coder: {{ model_code }} ---------ensemble code--------- - file: ens.py + file: ensemble.py {{ ensemble_code }} {% if latest_code %} diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index f2c2dcade..210ca1642 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -26,7 +26,7 @@ def develop_one_competition(competition: str): ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition - injected_file_names = ["spec/workflow.md", "load_data.py", "feat01.py", "model01.py", "ens.py", "main.py"] + injected_file_names = ["spec/workflow.md", "load_data.py", "feat01.py", "model01.py", "ensemble.py", "main.py"] workflowexp = FBWorkspace() for file_name in injected_file_names: diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py index 3d45b0ec0..3e060628a 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py @@ -25,7 +25,7 @@ # Ensemble -from ens import ens_and_decision +from ensemble import ens_and_decision pred_binary = ens_and_decision([test_pred], [val_pred], validation_labels) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md index 10ee6de06..7ea16f693 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md @@ -9,7 +9,7 @@ The project should be organized into the following components: 1. **Data Loading** (`load_data.py`): A module responsible for loading and preprocessing raw data. 2. **Feature Engineering**(`feat*.py`): A module for transforming raw data into features suitable for model training. 3. **Model Workflow**(`model*.py`): A module that manages the training, validation, and testing of machine learning models. -4. **Ensemble and Decision Making**(`ens.py`): A module for combining predictions from multiple models and making final decisions. +4. **Ensemble and Decision Making**(`ensemble.py`): A module for combining predictions from multiple models and making final decisions. 5. 
**Workflow**(`main.py`): A script to put the above component together to get the final submission(`submission.csv`) ## Submission From ea39d9f7277363621f2370a0154eed33a8c4c904 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 30 Dec 2024 08:10:21 +0000 Subject: [PATCH 143/304] filtered the knowledge query in all modules --- .../components/coder/data_science/ensemble/prompts.yaml | 4 ++-- .../components/coder/data_science/feature/prompts.yaml | 4 ++-- rdagent/components/coder/data_science/model/prompts.yaml | 9 ++++++--- .../coder/data_science/raw_data_loader/prompts.yaml | 4 ++-- .../components/coder/data_science/workflow/prompts.yaml | 4 ++-- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index bc3b66474..23923980e 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -17,7 +17,7 @@ ensemble_coder: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.file_dict["ensemble.py"] }} {% endfor %} {% endif %} @@ -25,7 +25,7 @@ ensemble_coder: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.file_dict["ensemble.py"] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 4fdfc4707..4bd7f0159 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -19,7 +19,7 @@ feature: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.file_dict["feat01.py"] }} {% endfor %} {% endif %} @@ -27,7 +27,7 @@ feature: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.file_dict["feat01.py"] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index fcd532edb..cc14a2739 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -1,6 +1,9 @@ model_coder: system: |- - You are tasked with implementing PyTorch models based on specific requirements provided by the user. The user’s ultimate goal is to obtain accurate predictions from the model on input data. Follow the instructions below to ensure your response is correct and aligned with the user’s expectations. 
+ You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + + The user's ultimate goal is to obtain accurate predictions from the model on input data. Follow the instructions below to ensure your response is correct and aligned with the user's expectations. Instructions for Code Generation: Leveraging User Inputs: @@ -30,7 +33,7 @@ model_coder: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.file_dict["model01.py"] }} {% endfor %} {% endif %} @@ -38,7 +41,7 @@ model_coder: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.file_dict["model01.py"] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 0f4a47eda..8fa9dff84 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -24,7 +24,7 @@ spec: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.file_dict["load_data.py"] }} {% endfor %} {% endif %} @@ -32,7 +32,7 @@ spec: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.file_dict["load_data.py"] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 403cea9ba..c7b1ac4a0 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -25,7 +25,7 @@ workflow_coder: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.file_dict["main.py"] }} {% endfor %} {% endif %} @@ -33,7 +33,7 @@ workflow_coder: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.file_dict["main.py"] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} From 
ed305c15b5da1c8c787396e2f5ee5009c6777373 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 30 Dec 2024 08:11:08 +0000 Subject: [PATCH 144/304] delete RAG in idea proposal --- rdagent/scenarios/data_science/proposal/prompts.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index d52331e88..e6cb43dd8 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -18,12 +18,6 @@ hypothesis_gen: The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree): {{ hypothesis_and_feedback }} {% endif %} - {% if RAG %} - To assist you in generating new {{targets}}, we have provided the following information: {{RAG}}. - **Note:** The provided RAG is for reference only. - You must carefully assess whether the RAG aligns with the {{targets}}. - If it does not, it should not be used. Exercise caution and make your own judgment. - {% endif %} Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. task_gen: From d9d29b3d01d33f66f7822ad88231c0a8abd8eb96 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 30 Dec 2024 08:43:10 +0000 Subject: [PATCH 145/304] refine the code in ensemble --- .../data_science/raw_data_loader/prompts.yaml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 8fa9dff84..0ea99e89c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -176,7 +176,6 @@ spec: "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." } - ensemble: |- Ensemble specification text adhere to the following requirements: 1. Function Interface: @@ -197,12 +196,28 @@ spec: - Ensure all predictions in `test_pred_l` and `val_pred_l` have consistent shapes and dimensions. - Verify that `val_label` is provided and matches the length of `val_pred_l` predictions. - Handle empty or invalid inputs gracefully with appropriate error messages. + - You should calculate the metric for each model and ensemble strategy, and save the results in a CSV file, e.g.: + ```python + scores = [] + for id, val_pred in enumerate(val_pred_l): + scores.append(calculate_metric(val_label, val_pred)) + + ... some code about ensemble strategy ... + + scores_df = pd.DataFrame( + { + "Model": list(range(len(val_pred_l))) + [""], + "": scores + [], + } + ) + scores_df.to_csv("scores.csv", index=False) + ``` - Consensus Strategy: - Clearly define how the ensemble predictions are aggregated (e.g., majority voting, weighted average). - Avoid introducing biases or overfitting during decision-making. 3. Code Standards: - - Avoid using progress bars (e.g., `tqdm`) in the implementation. + - Avoid using progress bars (e.g., `tqdm`) in the implementation. 4. Notes: - Align `DT` (data type) definitions with those used in model specifications. 
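The ensemble-spec change in PATCH 145 above only sketches the scores.csv convention (the embedded snippet elides the metric and the ensemble strategy). Below is a minimal, self-contained sketch of the `ens_and_decision` contract that spec describes; the AUC metric and the simple-average strategy are illustrative assumptions, not choices the patch prescribes.

```python
# Hedged sketch only: roc_auc_score and the simple-average ensemble are
# assumptions for illustration; the spec leaves both choices open.
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score


def ens_and_decision(
    test_pred_l: list[np.ndarray],
    val_pred_l: list[np.ndarray],
    val_label: np.ndarray,
) -> np.ndarray:
    """Average model predictions and record per-model validation scores.

    Writes ``scores.csv`` with one row per model plus a final ensemble row,
    mirroring the convention described in the ensemble specification.
    """
    # Score each model on the validation split.
    scores = [roc_auc_score(val_label.ravel(), p.ravel()) for p in val_pred_l]

    # Simple-average ensemble over validation and test predictions.
    val_ens = np.mean(val_pred_l, axis=0)
    test_ens = np.mean(test_pred_l, axis=0)
    scores.append(roc_auc_score(val_label.ravel(), val_ens.ravel()))

    # One row per model, final row for the ensemble.
    pd.DataFrame(
        {
            "Model": [str(i) for i in range(len(val_pred_l))] + ["ensemble"],
            "AUC": scores,
        }
    ).to_csv("scores.csv", index=False)

    return test_ens
```

Appending the ensemble row last keeps the file layout consistent with the fragment shown in the spec, where the model rows are followed by a single aggregate row.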
From 593854ceb063eed317a775b710679bcdf64a28c9 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 30 Dec 2024 09:22:04 +0000 Subject: [PATCH 146/304] show exp workspace in llm_st --- rdagent/app/data_science/loop.py | 1 + rdagent/log/ui/llm_st.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index f3560ddd7..d22d7d312 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -58,6 +58,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): def direct_exp_gen(self, prev_out: dict[str, Any]): exp = self.exp_gen.gen(self.trace) + logger.log_object(exp, tag="debug_exp_gen") return exp def coding(self, prev_out: dict[str, Any]): diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index cbcb54fc5..a1f4e16f7 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -109,7 +109,10 @@ def extract_evoid(tag): tlist.append(f"loop_{loop_id}.evo_step_{evo_id}") st.subheader(f"evo_step_{evo_id}", anchor=f"loop_{loop_id}.evo_step_{evo_id}", divider="orange") - if "debug_tpl" in tag: + if "debug_exp_gen" in tag: + with st.expander(f"Exp in:violet[**{obj.experiment_workspace.workspace_path}**]", expanded=expand_all, icon="🧩"): + st.write(obj) + elif "debug_tpl" in tag: uri = obj["uri"] tpl = obj["template"] cxt = obj["context"] From 77d8b8b00d3c1e1bec21621a42c91bc529a5af03 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 30 Dec 2024 10:46:54 +0000 Subject: [PATCH 147/304] exp_gen bug fix --- .../data_science/proposal/exp_gen.py | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 00945b059..33f2186c5 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -53,14 +53,10 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = self.hist: list[tuple[DSHypothesis, DSExperiment, HypothesisFeedback]] = [] self.knowledge_base = knowledge_base - def get_sota_hypothesis_and_experiment( - self, component: COMPONENT | None = None - ) -> tuple[DSHypothesis | None, Experiment | None]: + def get_sota_hypothesis_and_experiment(self) -> tuple[DSHypothesis | None, Experiment | None]: """Access the last experiment result, sub-task, and the corresponding hypothesis.""" for h, exp, hf in self.hist[::-1]: if hf.decision: - if component and h.component != component: - continue return h, exp return None, None @@ -174,7 +170,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec")[1] + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] ft = FeatureTask( name="Feature Engineering", description=resp_dict.get("description", "Feature description not provided"), @@ -192,7 +188,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng")[1] + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), @@ -214,7 +210,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = 
trace.get_sota_hypothesis_and_experiment("Model")[1] + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), @@ -232,7 +228,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble")[1] + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), @@ -267,7 +263,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.feature").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("DataLoadSpec")[1] + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] ft = FeatureTask( name="Feature Engineering", description=resp_dict.get("description", "Factor description not provided"), @@ -281,19 +277,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.model").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("FeatureEng")[1] - if last_model_exp := trace.get_sota_hypothesis_and_experiment("Model")[1]: - # TODO: model only have one (named "model.py")? - base_code = last_model_exp.experiment_workspace.file_dict["model.py"] - else: - base_code = "" + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), model_type=resp_dict.get("model_type", "Model type not provided"), architecture=resp_dict.get("architecture", "Model architecture not provided"), hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), - base_code=base_code, ) exp = DSExperiment(sub_tasks=[mt], hypothesis=DSHypothesis("Model")) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) @@ -304,7 +294,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.ensemble").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("Model")[1] + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), @@ -318,7 +308,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.workflow").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment("Ensemble")[1] + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), From 6eb92abff9c0cbbe93ece87561f566df7606d64c Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 30 Dec 2024 12:24:28 +0000 Subject: [PATCH 148/304] feedback bug fix --- rdagent/app/data_science/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index d22d7d312..a137c57c6 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -88,7 +88,7 @@ def running(self, prev_out: dict[str, Any]): def feedback(self, prev_out: dict[str, Any]): if 
self.trace.all_components_completed(): feedback = self.summarizer.generate_feedback( - prev_out["running"], prev_out["direct_exp_gen"].hypothesis, self.trace + prev_out["running"], self.trace ) else: feedback = HypothesisFeedback( From ab2aab4fc625e0b4bd44260fe2f873a7f4f28a98 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 30 Dec 2024 12:32:59 +0000 Subject: [PATCH 149/304] use `feature` instead of `feat01` --- rdagent/components/coder/data_science/feature/__init__.py | 4 ++-- rdagent/components/coder/data_science/feature/eval.py | 2 +- .../coder/data_science/feature/eval_tests/feature_test.py | 2 +- rdagent/components/coder/data_science/feature/prompts.yaml | 4 ++-- .../coder/data_science/model/eval_tests/model_test.py | 2 +- rdagent/components/coder/data_science/model/test.py | 2 +- rdagent/components/coder/data_science/workflow/__init__.py | 2 +- rdagent/components/coder/data_science/workflow/prompts.yaml | 2 +- rdagent/components/coder/data_science/workflow/test.py | 2 +- rdagent/scenarios/data_science/dev/feedback.py | 2 +- .../kaggle/tpl_ex/aerial-cactus-identification/README.md | 2 +- .../aerial-cactus-identification/{ens.py => ensemble.py} | 0 .../aerial-cactus-identification/{feat01.py => feature.py} | 0 .../kaggle/tpl_ex/aerial-cactus-identification/main.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) rename rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/{ens.py => ensemble.py} (100%) rename rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/{feat01.py => feature.py} (100%) diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 65d0ebb43..6b30263d5 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -47,7 +47,7 @@ def implement_one_task( ) user_prompt = T(".prompts:feature.user").r( feature_spec=workspace.file_dict["spec/feature.md"], - latest_code=workspace.file_dict.get("feat01.py"), + latest_code=workspace.file_dict.get("feature.py"), ) feature_code = json.loads( @@ -57,7 +57,7 @@ def implement_one_task( )["code"] return { - "feat01.py": feature_code, + "feature.py": feature_code, } def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index b6993c553..45e894c18 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -55,7 +55,7 @@ def evaluate( if stdout is None: stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." 
system_prompt = T(".prompts:feature_eval.system").r( - test_code=test_code, code=implementation.file_dict["feat01.py"] + test_code=test_code, code=implementation.file_dict["feature.py"] ) user_prompt = T(".prompts:feature_eval.user").r(stdout=stdout) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index e94fb58ec..bdcd2b258 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -12,7 +12,7 @@ import numpy as np import pandas as pd from load_data import load_data -from feat01 import feat_eng +from feature import feat_eng X, y, X_test, test_ids = load_data() X, y, X_test = feat_eng(X, y, X_test) diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 4bd7f0159..b0831e20c 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -19,7 +19,7 @@ feature: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.file_dict["feat01.py"] }} + {{ similar_successful_knowledge.implementation.file_dict["feature.py"] }} {% endfor %} {% endif %} @@ -27,7 +27,7 @@ feature: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.file_dict["feat01.py"] }} + {{ former_failed_knowledge.implementation.file_dict["feature.py"] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.py b/rdagent/components/coder/data_science/model/eval_tests/model_test.py index e6ea885fa..217945e28 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_test.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_test.py @@ -10,7 +10,7 @@ from sklearn.model_selection import train_test_split from load_data import load_data from model01 import model_workflow -from feat01 import feat_eng +from feature import feat_eng X, y, test_X, test_ids = load_data() X, y, test_X = feat_eng(X, y, test_X) diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py index ad7995789..268bdda1b 100644 --- a/rdagent/components/coder/data_science/model/test.py +++ b/rdagent/components/coder/data_science/model/test.py @@ -32,7 +32,7 @@ def develop_one_competition(competition: str): ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition - injected_file_names = ["spec/model.md", "load_data.py", "feat01.py", "model01.py"] + injected_file_names = ["spec/model.md", "load_data.py", "feature.py", "model01.py"] modelexp = FBWorkspace() for file_name in injected_file_names: diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 454732b1f..cd697873f 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -48,7 +48,7 @@ def implement_one_task( ) 
user_prompt = T(".prompts:workflow_coder.user").r( load_data_code=workspace.file_dict["load_data.py"], - feature_code=workspace.file_dict["feat01.py"], + feature_code=workspace.file_dict["feature.py"], model_code=workspace.file_dict["model01.py"], ensemble_code=workspace.file_dict["ensemble.py"], latest_code=workspace.file_dict.get("main.py"), diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index c7b1ac4a0..cc880b4c5 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -48,7 +48,7 @@ workflow_coder: {{ load_data_code }} ---------feature engineering code--------- - file: feat01.py + file: feature.py {{ feature_code }} ---------model training code--------- diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py index 210ca1642..99b6cb3d4 100644 --- a/rdagent/components/coder/data_science/workflow/test.py +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -26,7 +26,7 @@ def develop_one_competition(competition: str): ) tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition - injected_file_names = ["spec/workflow.md", "load_data.py", "feat01.py", "model01.py", "ensemble.py", "main.py"] + injected_file_names = ["spec/workflow.md", "load_data.py", "feature.py", "model01.py", "ensemble.py", "main.py"] workflowexp = FBWorkspace() for file_name in injected_file_names: diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index 9905d7528..7defbecb2 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -22,7 +22,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeed elif hypothesis.component == "FeatureEng": modified_file_name = "feature.py" elif hypothesis.component == "Model": - modified_file_name = "model.py" + modified_file_name = "model01.py" elif hypothesis.component == "Ensemble": modified_file_name = "ensemble.py" elif hypothesis.component == "Workflow": diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md index 0faf7dd2a..f5e48c7a0 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md @@ -26,7 +26,7 @@ We use a runnable concrete example to demonstrate what the project should be lik ## Step1: write the feature engineering code -- We can generate some file like [[feat01.py]] that match the pattern `feat.*\.py` +- We can generate some file like [[feature.py]] that match the pattern `feat.*\.py` ## Step2: Model training diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ens.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ensemble.py similarity index 100% rename from rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ens.py rename to rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ensemble.py diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feat01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feature.py similarity index 100% rename from rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feat01.py rename to 
rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feature.py diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py index 3e060628a..d879bcb32 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py @@ -6,7 +6,7 @@ # feature engineering -from feat01 import feat_eng +from feature import feat_eng train_images, train_lables, train_param = feat_eng(train_images, train_labels, train_images, train_labels) test_images, _, _ = feat_eng(test_images, param=train_param) From 7ef6a0e5a656a655fa7c7d60c8900023b9a280bd Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 31 Dec 2024 12:43:51 +0000 Subject: [PATCH 150/304] Trace & method of judging if exp is completed change --- rdagent/app/data_science/loop.py | 15 +- .../coder/data_science/ensemble/eval.py | 6 + .../coder/data_science/workflow/eval.py | 10 + rdagent/core/proposal.py | 28 ++- .../data_science/experiment/experiment.py | 18 +- .../data_science/proposal/exp_gen.py | 181 ++++++++---------- 6 files changed, 141 insertions(+), 117 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index a137c57c6..ac7e69da0 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -79,17 +79,16 @@ def coding(self, prev_out: dict[str, Any]): return exp def running(self, prev_out: dict[str, Any]): - if self.trace.all_components_completed(): - exp = self.runner.develop(prev_out["coding"]) + exp: DSExperiment = prev_out["coding"] + if exp.next_component_required() is None: + return self.runner.run(exp) else: - exp = prev_out["coding"] - return exp + return exp def feedback(self, prev_out: dict[str, Any]): - if self.trace.all_components_completed(): - feedback = self.summarizer.generate_feedback( - prev_out["running"], self.trace - ) + exp: DSExperiment = prev_out["running"] + if exp.next_component_required() is None: + feedback = self.summarizer.generate_feedback(exp, self.trace) else: feedback = HypothesisFeedback( observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index 66fa93188..1dc742d46 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -14,6 +14,7 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.core.exception import CoderError DIRNAME = Path(__file__).absolute().resolve().parent @@ -51,6 +52,11 @@ def evaluate( test_code = f.read() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") + + # Check if the metrics file is generated + score_fp = implementation.experiment_workspace.workspace_path / "scores.csv" + if not score_fp.exists(): + raise CoderError("Metrics file (scores.csv) is not generated.") system_prompt = T(".prompts:ensemble_eval.system").r( test_code=test_code, code=implementation.file_dict["ensemble.py"] diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index a288cdce5..6f41f5535 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ 
b/rdagent/components/coder/data_science/workflow/eval.py @@ -13,6 +13,7 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.core.exception import CoderError DIRNAME = Path(__file__).absolute().resolve().parent @@ -55,6 +56,15 @@ def evaluate( de = DockerEnv(conf=ds_docker_conf) fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") + + # Check if the submission file and score file are generated + submission_fp = implementation.experiment_workspace.workspace_path / "submission.csv" + score_fp = implementation.experiment_workspace.workspace_path / "scores.csv" + if not submission_fp.exists(): + raise CoderError("Submission file (submission.csv) is not generated.") + if not score_fp.exists(): + raise CoderError("Metrics file (scores.csv) is not generated.") + if stdout is None: stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." system_prompt = T(".prompts:workflow_eval.system").r( diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index 093c34e78..925d4c3fd 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -56,8 +56,22 @@ def __str__(self) -> str: # Origin(path of repo/data/feedback) => view/summarization => generated Hypothesis +class ExperimentFeedback(Feedback): + def __init__( + self, + decision: bool, + reason: bool, + ) -> None: + self.decision = decision + self.reason = reason + + def __bool__(self) -> bool: + return self.decision + + def __str__(self) -> str: + return f"Decision: {self.decision}\nReason: {self.reason}" -class HypothesisFeedback(Feedback): +class HypothesisFeedback(ExperimentFeedback): def __init__( self, observations: str, @@ -66,14 +80,10 @@ def __init__( reason: str, decision: bool, ) -> None: + super().__init__(decision, reason) self.observations = observations self.hypothesis_evaluation = hypothesis_evaluation self.new_hypothesis = new_hypothesis - self.reason = reason - self.decision = decision - - def __bool__(self) -> bool: - return self.decision def __str__(self) -> str: return f"""Observations: {self.observations} @@ -90,7 +100,8 @@ def __str__(self) -> str: class Trace(Generic[ASpecificScen, ASpecificKB]): def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None: self.scen: ASpecificScen = scen - self.hist: list[tuple[Hypothesis, Experiment, HypothesisFeedback]] = [] + self.hist: list[tuple[Experiment, ExperimentFeedback]] = [] + # TODO: self.hist is 2-tuple now, remove hypothesis from it, change old code for this later. self.knowledge_base: ASpecificKB | None = knowledge_base def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]: @@ -170,12 +181,11 @@ def __init__(self, scen: Scenario) -> None: self.scen = scen @abstractmethod - def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> ExperimentFeedback: """ The `exp` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. """ - # TODO: return a hypothesis feedback seems wierd now. Maybe we should return an ExerimentFeedback? error_message = "generate_feedback method is not implemented." 
raise NotImplementedError(error_message) diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 494a11707..c6a0c9584 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,7 +1,23 @@ from rdagent.core.experiment import Experiment, FBWorkspace, Task - +import re +from rdagent.scenarios.data_science.proposal.exp_gen import COMPONENT class DSExperiment(Experiment[Task, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() + + def next_component_required(self) -> COMPONENT | None: + files = list(self.experiment_workspace.file_dict.keys()) + if "load_data.py" not in files: + return "DataLoadSpec" + if "feature.py" not in files: + return "FeatureEng" + if not any(re.match(r'model.*\.py', file) for file in files): + return "Model" + if "ensemble.py" not in files: + return "Ensemble" + if "main.py" not in files: + return "Workflow" + return None + \ No newline at end of file diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 33f2186c5..799810cb5 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -8,7 +8,7 @@ from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import Experiment from rdagent.core.knowledge_base import KnowledgeBase -from rdagent.core.proposal import ExpGen, Hypothesis, HypothesisFeedback, Trace +from rdagent.core.proposal import ExpGen, Hypothesis, HypothesisFeedback, Trace, ExperimentFeedback from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen @@ -50,7 +50,7 @@ def __str__(self) -> str: class DSTrace(Trace[DataScienceScen, KnowledgeBase]): def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = None) -> None: self.scen: DataScienceScen = scen - self.hist: list[tuple[DSHypothesis, DSExperiment, HypothesisFeedback]] = [] + self.hist: list[tuple[DSExperiment, ExperimentFeedback]] = [] self.knowledge_base = knowledge_base def get_sota_hypothesis_and_experiment(self) -> tuple[DSHypothesis | None, Experiment | None]: @@ -60,23 +60,6 @@ def get_sota_hypothesis_and_experiment(self) -> tuple[DSHypothesis | None, Exper return h, exp return None, None - @property - def successful_components(self) -> set[COMPONENT]: - """ - Get successful components. - """ - successful_components = set() - for h, _, hf in self.hist: - if hf.decision: - successful_components.add(h.component) - return successful_components - - def all_components_completed(self) -> bool: - """ - Check if 5 successful components are completed. 
- """ - return set(ORDER) == self.successful_components - class DSExpGen(ExpGen): """Data Science Task Generator.""" @@ -110,8 +93,87 @@ def llm_task_gen( return resp_dict def gen(self, trace: DSTrace) -> DSExperiment: + if len(trace.hist) == 0: + next_component = "DataLoadSpec" + else: + next_component = trace.hist[-1][0].next_component_required() + scenario_desc = trace.scen.get_scenario_all_desc() - if trace.all_components_completed(): + if next_component == "DataLoadSpec": + resp_dict = self.llm_task_gen( + targets="Data loader and specification generation", + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.data_loader").r(), + ) + dt = DataLoaderTask( + name="Data loader and specification generation", + description=resp_dict.get( + "description", "Data loader and specification generation description not provided" + ), + ) + + exp = DSExperiment(sub_tasks=[dt], hypothesis=DSHypothesis("DataLoadSpec")) + return exp + elif next_component == "FeatureEng": + resp_dict = self.llm_task_gen( + targets="Feature Engineering", + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.feature").r(), + ) + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + ft = FeatureTask( + name="Feature Engineering", + description=resp_dict.get("description", "Factor description not provided"), + ) + exp = DSExperiment(sub_tasks=[ft], hypothesis=DSHypothesis("FeatureEng")) + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp + elif next_component == "Model": + resp_dict = self.llm_task_gen( + targets="Models", + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.model").r(), + ) + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + mt = ModelTask( + name=resp_dict.get("model_name", "Model name not provided"), + description=resp_dict.get("description", "Model description not provided"), + model_type=resp_dict.get("model_type", "Model type not provided"), + architecture=resp_dict.get("architecture", "Model architecture not provided"), + hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), + ) + exp = DSExperiment(sub_tasks=[mt], hypothesis=DSHypothesis("Model")) + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp + elif next_component == "Ensemble": + resp_dict = self.llm_task_gen( + targets="Ensemble", + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.ensemble").r(), + ) + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + et = EnsembleTask( + name="Ensemble", + description=resp_dict.get("description", "Ensemble description not provided"), + ) + exp = DSExperiment(sub_tasks=[et], hypothesis=DSHypothesis("Ensemble")) + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp + elif next_component == "Workflow": + resp_dict = self.llm_task_gen( + targets="Workflow", + scenario_desc=scenario_desc, + task_output_format=T(".prompts:output_format.workflow").r(), + ) + dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + wt = WorkflowTask( + name="Workflow", + description=resp_dict.get("description", "Workflow description not provided"), + ) + exp = DSExperiment(sub_tasks=[wt], hypothesis=DSHypothesis("Workflow")) + exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + return exp + else: # propose new 
component by LLM # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) @@ -237,84 +299,5 @@ def gen(self, trace: DSTrace) -> DSExperiment: exp = DSExperiment(sub_tasks=[wt], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp - else: - for o in ORDER: - if o in trace.successful_components: - # we already have the component, then skip - continue - elif o == "DataLoadSpec": - resp_dict = self.llm_task_gen( - targets="Data loader and specification generation", - scenario_desc=scenario_desc, - task_output_format=T(".prompts:output_format.data_loader").r(), - ) - dt = DataLoaderTask( - name="Data loader and specification generation", - description=resp_dict.get( - "description", "Data loader and specification generation description not provided" - ), - ) - - exp = DSExperiment(sub_tasks=[dt], hypothesis=DSHypothesis("DataLoadSpec")) - return exp - elif o == "FeatureEng": - resp_dict = self.llm_task_gen( - targets="Feature Engineering", - scenario_desc=scenario_desc, - task_output_format=T(".prompts:output_format.feature").r(), - ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] - ft = FeatureTask( - name="Feature Engineering", - description=resp_dict.get("description", "Factor description not provided"), - ) - exp = DSExperiment(sub_tasks=[ft], hypothesis=DSHypothesis("FeatureEng")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) - return exp - elif o == "Model": - resp_dict = self.llm_task_gen( - targets="Models", - scenario_desc=scenario_desc, - task_output_format=T(".prompts:output_format.model").r(), - ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] - mt = ModelTask( - name=resp_dict.get("model_name", "Model name not provided"), - description=resp_dict.get("description", "Model description not provided"), - model_type=resp_dict.get("model_type", "Model type not provided"), - architecture=resp_dict.get("architecture", "Model architecture not provided"), - hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), - ) - exp = DSExperiment(sub_tasks=[mt], hypothesis=DSHypothesis("Model")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) - return exp - elif o == "Ensemble": - resp_dict = self.llm_task_gen( - targets="Ensemble", - scenario_desc=scenario_desc, - task_output_format=T(".prompts:output_format.ensemble").r(), - ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] - et = EnsembleTask( - name="Ensemble", - description=resp_dict.get("description", "Ensemble description not provided"), - ) - exp = DSExperiment(sub_tasks=[et], hypothesis=DSHypothesis("Ensemble")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) - return exp - elif o == "Workflow": - resp_dict = self.llm_task_gen( - targets="Workflow", - scenario_desc=scenario_desc, - task_output_format=T(".prompts:output_format.workflow").r(), - ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] - wt = WorkflowTask( - name="Workflow", - description=resp_dict.get("description", "Workflow description not provided"), - ) - exp = DSExperiment(sub_tasks=[wt], hypothesis=DSHypothesis("Workflow")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) - return exp return super().gen(trace) From 
5da62ac274eb90e9875864c64ec7d7440f3ece1b Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 2 Jan 2025 03:09:58 +0000 Subject: [PATCH 151/304] fix a bug in package calling and execute ci --- rdagent/app/data_science/debug.py | 1 - .../coder/data_science/ensemble/eval.py | 4 +-- .../feature/eval_tests/feature_test.py | 2 +- .../model/eval_tests/model_test.py | 8 +++--- .../coder/data_science/workflow/eval.py | 4 +-- rdagent/core/experiment.py | 2 +- rdagent/core/proposal.py | 2 ++ rdagent/log/ui/llm_st.py | 4 ++- rdagent/scenarios/data_science/debug/data.py | 25 ++++++++----------- .../data_science/experiment/experiment.py | 13 ++++++---- .../data_science/proposal/exp_gen.py | 15 ++++++----- rdagent/scenarios/kaggle/kaggle_crawler.py | 4 +-- 12 files changed, 46 insertions(+), 38 deletions(-) diff --git a/rdagent/app/data_science/debug.py b/rdagent/app/data_science/debug.py index 06b0af6e4..e5ea7da7b 100644 --- a/rdagent/app/data_science/debug.py +++ b/rdagent/app/data_science/debug.py @@ -2,6 +2,5 @@ from rdagent.scenarios.data_science.debug.data import create_debug_data - if __name__ == "__main__": fire.Fire(create_debug_data) diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index 1dc742d46..70ae04c5f 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -10,11 +10,11 @@ ) from rdagent.core.evaluation import Feedback from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf -from rdagent.core.exception import CoderError DIRNAME = Path(__file__).absolute().resolve().parent @@ -52,7 +52,7 @@ def evaluate( test_code = f.read() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - + # Check if the metrics file is generated score_fp = implementation.experiment_workspace.workspace_path / "scores.csv" if not score_fp.exists(): diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index bdcd2b258..d4b277c7c 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -11,8 +11,8 @@ import numpy as np import pandas as pd -from load_data import load_data from feature import feat_eng +from load_data import load_data X, y, X_test, test_ids = load_data() X, y, X_test = feat_eng(X, y, X_test) diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.py b/rdagent/components/coder/data_science/model/eval_tests/model_test.py index 217945e28..b9db686c2 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_test.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_test.py @@ -7,10 +7,10 @@ import traceback import numpy as np -from sklearn.model_selection import train_test_split +from feature import feat_eng from load_data import load_data from model01 import model_workflow -from feature import feat_eng +from sklearn.model_selection import train_test_split X, y, test_X, test_ids = load_data() X, y, test_X = feat_eng(X, y, test_X) @@ -52,7 +52,9 @@ 
print(execution_feedback_str) print("The second execution begins.\n") -val_pred, test_pred, finalhypers = model_workflow(X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, hyper_params=hypers) +val_pred, test_pred, finalhypers = model_workflow( + X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, hyper_params=hypers +) execution_feedback_str = "The second Execution successful.\n" if val_pred is not None: execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 6f41f5535..210272bac 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -9,11 +9,11 @@ CoSTEERSingleFeedbackDeprecated, ) from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf -from rdagent.core.exception import CoderError DIRNAME = Path(__file__).absolute().resolve().parent @@ -56,7 +56,7 @@ def evaluate( de = DockerEnv(conf=ds_docker_conf) fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") - + # Check if the submission file and score file are generated submission_fp = implementation.experiment_workspace.workspace_path / "submission.csv" score_fp = implementation.experiment_workspace.workspace_path / "scores.csv" diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 6fd23a7f0..adc71a77d 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -118,7 +118,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: def code(self) -> str: code_string = "" for file_name, code in self.file_dict.items(): - if file_name.endswith(".py") and 'test' not in file_name: + if file_name.endswith(".py") and "test" not in file_name: code_string += f"File: {file_name}\n{code}\n" return code_string diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index 925d4c3fd..bfc955ddc 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -56,6 +56,7 @@ def __str__(self) -> str: # Origin(path of repo/data/feedback) => view/summarization => generated Hypothesis + class ExperimentFeedback(Feedback): def __init__( self, @@ -71,6 +72,7 @@ def __bool__(self) -> bool: def __str__(self) -> str: return f"Decision: {self.decision}\nReason: {self.reason}" + class HypothesisFeedback(ExperimentFeedback): def __init__( self, diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index a1f4e16f7..354197557 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -110,7 +110,9 @@ def extract_evoid(tag): st.subheader(f"evo_step_{evo_id}", anchor=f"loop_{loop_id}.evo_step_{evo_id}", divider="orange") if "debug_exp_gen" in tag: - with st.expander(f"Exp in:violet[**{obj.experiment_workspace.workspace_path}**]", expanded=expand_all, icon="🧩"): + with st.expander( + f"Exp in:violet[**{obj.experiment_workspace.workspace_path}**]", expanded=expand_all, icon="🧩" + ): st.write(obj) elif "debug_tpl" in tag: uri = obj["uri"] diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 120b0ae08..90ad40803 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -1,8 +1,9 
@@ import os -from pathlib import Path import platform -import pandas as pd import shutil +from pathlib import Path + +import pandas as pd from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING @@ -19,7 +20,7 @@ def dump(self, df: pd.DataFrame, path): class GenericDataHandler(DataHandler): """ - A generic data handler that automatically detects file type based on suffix + A generic data handler that automatically detects file type based on suffix and uses the correct pandas method for load/dump. """ @@ -37,7 +38,7 @@ def load(self, path) -> pd.DataFrame: # Note: for HDF, you need a 'key' in read_hdf. If you expect a single key, # you might do: pd.read_hdf(path, key='df') or something similar. # Adjust as needed based on your HDF structure. - return pd.read_hdf(path, key='data') + return pd.read_hdf(path, key="data") else: raise ValueError(f"Unsupported file type: {suffix}") @@ -67,7 +68,7 @@ def reduce(self, df: pd.DataFrame) -> pd.DataFrame: class RandDataReducer(DataReducer): """ - Example random sampler: ensures at least `min_num` rows + Example random sampler: ensures at least `min_num` rows or at least `min_frac` fraction of the data (whichever is larger). """ @@ -109,7 +110,7 @@ def create_debug_data( sample_path=None, ): """ - Reads the original data file, creates a reduced sample, + Reads the original data file, creates a reduced sample, and renames/moves files for easier debugging. Automatically detects file type (csv, pkl, parquet, hdf, etc.). """ @@ -118,7 +119,7 @@ def create_debug_data( if dataset_path is None: dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path - + if sample_path is None: # Create a sample folder under the dataset folder, which should be available in docker container sample_path = Path(dataset_path) / "sample" @@ -127,11 +128,8 @@ def create_debug_data( sample_folder = Path(sample_path) / competition # Traverse the folder and exclude specific file types - included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5"} - files_to_process = [ - file for file in data_folder.rglob("*") - if file.is_file() - ] + included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5"} + files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] for file_path in files_to_process: sampled_file_path = sample_folder / file_path.relative_to(data_folder) @@ -145,7 +143,7 @@ def create_debug_data( if platform.system() == "Windows": os.link(file_path, sampled_file_path) continue - + # Initialize the generic data handler data_handler = GenericDataHandler() @@ -164,4 +162,3 @@ def create_debug_data( except Exception as e: print(f"Error processing {file_path}: {e}") continue - diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index c6a0c9584..565d45931 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,23 +1,26 @@ -from rdagent.core.experiment import Experiment, FBWorkspace, Task import re -from rdagent.scenarios.data_science.proposal.exp_gen import COMPONENT +from typing import Literal + +from rdagent.core.experiment import Experiment, FBWorkspace, Task + +COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] + class DSExperiment(Experiment[Task, FBWorkspace, FBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.experiment_workspace = FBWorkspace() - + def next_component_required(self) -> COMPONENT | 
None: files = list(self.experiment_workspace.file_dict.keys()) if "load_data.py" not in files: return "DataLoadSpec" if "feature.py" not in files: return "FeatureEng" - if not any(re.match(r'model.*\.py', file) for file in files): + if not any(re.match(r"model.*\.py", file) for file in files): return "Model" if "ensemble.py" not in files: return "Ensemble" if "main.py" not in files: return "Workflow" return None - \ No newline at end of file diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 799810cb5..d99d81c60 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -8,15 +8,18 @@ from rdagent.components.coder.data_science.workflow.exp import WorkflowTask from rdagent.core.experiment import Experiment from rdagent.core.knowledge_base import KnowledgeBase -from rdagent.core.proposal import ExpGen, Hypothesis, HypothesisFeedback, Trace, ExperimentFeedback +from rdagent.core.proposal import ( + ExperimentFeedback, + ExpGen, + Hypothesis, + HypothesisFeedback, + Trace, +) from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.experiment.experiment import COMPONENT, DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen from rdagent.utils.agent.tpl import T -COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] -ORDER = COMPONENT.__args__ - class DSHypothesis(Hypothesis): def __init__( @@ -173,7 +176,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: exp = DSExperiment(sub_tasks=[wt], hypothesis=DSHypothesis("Workflow")) exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) return exp - else: # propose new component by LLM + else: # propose new component by LLM # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index df78c3609..0ee16362a 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -21,8 +21,8 @@ from rdagent.core.prompts import Prompts from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend -from rdagent.utils.env import MLEBDockerEnv from rdagent.scenarios.data_science.debug.data import create_debug_data +from rdagent.utils.env import MLEBDockerEnv # %% options = webdriver.ChromeOptions() @@ -162,7 +162,7 @@ def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPL unzip_data(unzip_file_path=f"{zipfile_path}/{competition}.zip", unzip_target_path=unzip_path) for sub_zip_file in Path(unzip_path).rglob("*.zip"): unzip_data(sub_zip_file, unzip_target_path=unzip_path) - + # sample data create_debug_data(competition, dataset_path=local_path) From 0d59c6c7d1493c806bcdbcc91bdfe0ae32307198 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 2 Jan 2025 03:41:57 +0000 Subject: [PATCH 152/304] fix code --- rdagent/components/coder/data_science/workflow/eval.py | 4 ++-- rdagent/scenarios/data_science/proposal/exp_gen.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 210272bac..099a68e76 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ 
b/rdagent/components/coder/data_science/workflow/eval.py @@ -58,8 +58,8 @@ def evaluate( stdout = implementation.execute(env=de, entry=f"python {fname}") # Check if the submission file and score file are generated - submission_fp = implementation.experiment_workspace.workspace_path / "submission.csv" - score_fp = implementation.experiment_workspace.workspace_path / "scores.csv" + submission_fp = implementation.workspace_path / "submission.csv" + score_fp = implementation.workspace_path / "scores.csv" if not submission_fp.exists(): raise CoderError("Submission file (submission.csv) is not generated.") if not score_fp.exists(): diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index d99d81c60..bf533104a 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -99,7 +99,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: if len(trace.hist) == 0: next_component = "DataLoadSpec" else: - next_component = trace.hist[-1][0].next_component_required() + next_component = trace.hist[-1][1].next_component_required() scenario_desc = trace.scen.get_scenario_all_desc() if next_component == "DataLoadSpec": From 716ba1d98db1193a7fb60e6014cde16df115f1bd Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 2 Jan 2025 05:28:51 +0000 Subject: [PATCH 153/304] bug fix --- rdagent/app/data_science/loop.py | 2 +- rdagent/core/proposal.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index ac7e69da0..f487c0b52 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -97,7 +97,7 @@ def feedback(self, prev_out: dict[str, Any]): reason="", decision=True, ) - self.trace.hist.append((prev_out["direct_exp_gen"].hypothesis, prev_out["running"], feedback)) + self.trace.hist.append((prev_out["running"], feedback)) def main(path=None, step_n=None, competition="bms-molecular-translation"): diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index 925d4c3fd..790392973 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -86,11 +86,10 @@ def __init__( self.new_hypothesis = new_hypothesis def __str__(self) -> str: - return f"""Observations: {self.observations} + return f"""{super().__str__()} +Observations: {self.observations} Hypothesis Evaluation: {self.hypothesis_evaluation} -New Hypothesis: {self.new_hypothesis} -Decision: {self.decision} -Reason: {self.reason}""" +New Hypothesis: {self.new_hypothesis}""" ASpecificScen = TypeVar("ASpecificScen", bound=Scenario) From 7c046cf392253bbd7097a24f70581b731e7fcc0d Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 2 Jan 2025 05:30:45 +0000 Subject: [PATCH 154/304] bug fix --- rdagent/scenarios/data_science/proposal/exp_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index bf533104a..d99d81c60 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -99,7 +99,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: if len(trace.hist) == 0: next_component = "DataLoadSpec" else: - next_component = trace.hist[-1][1].next_component_required() + next_component = trace.hist[-1][0].next_component_required() scenario_desc = trace.scen.get_scenario_all_desc() if next_component == "DataLoadSpec": From 
f722e6c20774b3b094d4ce92285545a860054460 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 2 Jan 2025 05:41:00 +0000 Subject: [PATCH 155/304] fix a bug --- rdagent/scenarios/data_science/proposal/exp_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index d99d81c60..a33b24743 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -58,9 +58,9 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = def get_sota_hypothesis_and_experiment(self) -> tuple[DSHypothesis | None, Experiment | None]: """Access the last experiment result, sub-task, and the corresponding hypothesis.""" - for h, exp, hf in self.hist[::-1]: + for exp, hf in self.hist[::-1]: if hf.decision: - return h, exp + return exp.hypothesis, exp return None, None From d44942db4556615e797a776edafcd490eb1909b9 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 2 Jan 2025 06:09:55 +0000 Subject: [PATCH 156/304] fix some bugs --- rdagent/components/coder/data_science/ensemble/eval.py | 2 +- .../coder/data_science/raw_data_loader/prompts.yaml | 6 ------ rdagent/scenarios/data_science/proposal/prompts.yaml | 4 ++-- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index 70ae04c5f..b969b5db0 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -54,7 +54,7 @@ def evaluate( stdout = implementation.execute(env=de, entry=f"python {fname}") # Check if the metrics file is generated - score_fp = implementation.experiment_workspace.workspace_path / "scores.csv" + score_fp = implementation.workspace_path / "scores.csv" if not score_fp.exists(): raise CoderError("Metrics file (scores.csv) is not generated.") diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 0ea99e89c..1ee1acbac 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -308,12 +308,6 @@ data_loader_coder: ---------Data Loader Specification--------- {{ data_loader_spec }} - {% if latest_code %} - ---------Former Specification--------- - Former Code: {{ latest_code }} - You should follow the former code to improve it. - {% endif %} - You should strictly follow the function interface specifications provided by the specification to implement the function. diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index e6cb43dd8..e34276afa 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -53,8 +53,8 @@ task_gen: {% endif %} hypothesis_and_feedback: |- - {% for hypothesis, experiment, feedback in trace.hist[-10:] %} - Hypothesis {{ loop.index }}: {{ hypothesis }} + {% for experiment, feedback in trace.hist[-10:] %} + Hypothesis {{ loop.index }}: {{ experiment.hypothesis }} Observation on the result with the hypothesis: {{ feedback.observations }} Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} Did changing to this hypothesis work? 
(focus on the change): {{ feedback.decision }} From 4e9aff3324f293d02838eaffb6af3da8b7f37eb8 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 2 Jan 2025 09:15:33 +0000 Subject: [PATCH 157/304] fix a bug --- .../components/coder/data_science/raw_data_loader/prompts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 1ee1acbac..6cc5b59d1 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -240,7 +240,7 @@ spec: 1. Workflow Integration: - Integrate the following components into the workflow: - Data loading (`load_data.py`). - - Feature engineering (`feat*.py`). + - Feature engineering (`feature.py`). - Model workflow for training and testing (`model*.py`). - Ensemble and decision-making (`ensemble.py`). - Treat each component as a modular and callable Python function. From 288777d78ca171180d49531a6dbebdb693ddd45e Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 2 Jan 2025 10:07:43 +0000 Subject: [PATCH 158/304] refactor: Enhance error handling and feedback in data science loop --- rdagent/app/data_science/loop.py | 16 +++++++++++++--- rdagent/core/exception.py | 7 ++++++- rdagent/core/proposal.py | 11 ++++++++++- rdagent/utils/workflow.py | 24 +++++++++++++++--------- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index f487c0b52..3ada6e0b8 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -11,7 +11,8 @@ from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import RDLoop -from rdagent.core.proposal import HypothesisFeedback +from rdagent.core.exception import CoderError, RunnerError +from rdagent.core.proposal import ExperimentFeedback, HypothesisFeedback from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger @@ -23,7 +24,7 @@ class DataScienceRDLoop(RDLoop): - skip_loop_error = () + skip_loop_error = (CoderError, RunnerError) def __init__(self, PROP_SETTING: BasePropSetting): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) @@ -97,7 +98,16 @@ def feedback(self, prev_out: dict[str, Any]): reason="", decision=True, ) - self.trace.hist.append((prev_out["running"], feedback)) + return feedback + + def record(self, prev_out: dict[str, Any]): + e = prev_out.get(self.EXCEPTION_KEY, None) + if e is None: + self.trace.hist.append((prev_out["running"], prev_out["feedback"])) + else: + # TODO: Please judge the type of the exception. + # Record the `experiment` when raising the exception. + self.trace.hist.append((prev_out.get("direct_exp_gen", None) or prev_out.get("running", None), ExperimentFeedback.from_exception(e))) def main(path=None, step_n=None, competition="bms-molecular-translation"): diff --git a/rdagent/core/exception.py b/rdagent/core/exception.py index 2167ab9dc..0e12084f9 100644 --- a/rdagent/core/exception.py +++ b/rdagent/core/exception.py @@ -6,6 +6,7 @@ class CoderError(Exception): The more detailed evaluation in dataframe values are managed by the evaluator. 
""" + # NOTE: it corresponds to the error of **component** class CodeFormatError(CoderError): @@ -26,10 +27,14 @@ class NoOutputError(CoderError): """ -class CustomRunnerError(Exception): +class RunnerError(Exception): """ Exceptions raised when running the code output. """ + # NOTE: it corresponds to the error of whole **project** + def __init__(self, message, ws: ): + super().__init__(message) + self.message = message class FactorEmptyError(Exception): diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index 0686c6f8d..edc50e28e 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -61,10 +61,12 @@ class ExperimentFeedback(Feedback): def __init__( self, decision: bool, - reason: bool, + reason: str, + exception: Exception | None = None, ) -> None: self.decision = decision self.reason = reason + self.exception: Exception | None = exception # if the experiment raises exception, it will be integrated into part of the feedback. def __bool__(self) -> bool: return self.decision @@ -72,6 +74,13 @@ def __bool__(self) -> bool: def __str__(self) -> str: return f"Decision: {self.decision}\nReason: {self.reason}" + @classmethod + def from_exception(cls, e: Exception) -> ExperimentFeedback: + """ + A convenient method to create Feedback from an exception. + """ + return cls(False, f"The experiment fails due to {str(e)}", e) + class HypothesisFeedback(ExperimentFeedback): def __init__( diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 07c4c39c4..d11355365 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -67,10 +67,15 @@ def __new__(cls, clsname, bases, attrs): class LoopTrace: start: datetime.datetime # the start time of the trace end: datetime.datetime # the end time of the trace + step_idx: int # TODO: more information about the trace class LoopBase: + """ + Assumption: + - The last step is responsible for recording information!!!! + """ steps: list[Callable] # a list of steps to work on loop_trace: dict[int, list[LoopTrace]] @@ -78,6 +83,8 @@ class LoopBase: default_factory=tuple ) # you can define a list of error that will skip current loop + EXCEPTION_KEY = "_EXCEPTION" + def __init__(self): self.loop_idx = 0 # current loop index self.step_idx = 0 # the index of next step to be run @@ -112,22 +119,21 @@ def run(self, step_n: int | None = None): self.loop_prev_out[name] = func(self.loop_prev_out) # TODO: Fix the error logger.exception(f"Skip loop {li} due to {e}") except self.skip_loop_error as e: + # FIXME: This does not support previous instance logger.warning(f"Skip loop {li} due to {e}") - self.loop_idx += 1 - self.step_idx = 0 - continue - except CoderError as e: - logger.warning(f"Traceback loop {li} due to {e}") - self.step_idx = 0 + # NOTE: strong assumption! The last step is responsible for recording information + self.step_idx = len(self.steps) - 1 # directly jump to the last step. 
+ self.loop_prev_out[self.EXCEPTION_KEY] = e continue end = datetime.datetime.now(datetime.timezone.utc) - self.loop_trace[li].append(LoopTrace(start, end)) + self.loop_trace[li].append(LoopTrace(start, end, step_idx=si)) - # Update tqdm progress bar + # Update tqdm progress bar directly to step_idx + pbar.n = len(self.steps) - self.step_idx # FIXME: check it's correctness pbar.set_postfix(loop_index=li, step_index=si, step_name=name) - pbar.update(1) + pbar.update(0) # Refresh the display # index increase and save session self.step_idx = (self.step_idx + 1) % len(self.steps) From 5e2adfa9f19e77b7484a9edb81c09806b40af796 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 2 Jan 2025 10:25:52 +0000 Subject: [PATCH 159/304] support different use_azure on chat and embedding models --- rdagent/oai/llm_conf.py | 9 ++++++++- rdagent/oai/llm_utils.py | 40 ++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/rdagent/oai/llm_conf.py b/rdagent/oai/llm_conf.py index f3c2498d6..75acec4c7 100644 --- a/rdagent/oai/llm_conf.py +++ b/rdagent/oai/llm_conf.py @@ -2,13 +2,18 @@ from pathlib import Path +from pydantic import Field + from rdagent.core.conf import ExtendedBaseSettings class LLMSettings(ExtendedBaseSettings): log_llm_chat_content: bool = True - use_azure: bool = False + use_azure: bool = Field(default=False, deprecated=True) + chat_use_azure: bool = False + embedding_use_azure: bool = False + chat_use_azure_token_provider: bool = False embedding_use_azure_token_provider: bool = False managed_identity_client_id: str | None = None @@ -34,6 +39,7 @@ class LLMSettings(ExtendedBaseSettings): # Chat configs openai_api_key: str = "" # TODO: simplify the key design. chat_openai_api_key: str = "" + chat_openai_base_url: str = "" chat_azure_api_base: str = "" chat_azure_api_version: str = "" chat_model: str = "gpt-4-turbo" @@ -50,6 +56,7 @@ class LLMSettings(ExtendedBaseSettings): # Embedding configs embedding_openai_api_key: str = "" + embedding_openai_base_url: str = "" embedding_azure_api_base: str = "" embedding_azure_api_version: str = "" embedding_model: str = "" diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index 2182d0ef3..eafbb6d0d 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -307,7 +307,11 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.chat_model = LLM_SETTINGS.chat_model if chat_model is None else chat_model self.encoder = None else: - self.use_azure = LLM_SETTINGS.use_azure + if LLM_SETTINGS.use_azure: + self.chat_use_azure = self.embedding_use_azure = LLM_SETTINGS.use_azure + else: + self.chat_use_azure = LLM_SETTINGS.chat_use_azure + self.embedding_use_azure = LLM_SETTINGS.embedding_use_azure self.chat_use_azure_token_provider = LLM_SETTINGS.chat_use_azure_token_provider self.embedding_use_azure_token_provider = LLM_SETTINGS.embedding_use_azure_token_provider self.managed_identity_client_id = LLM_SETTINGS.managed_identity_client_id @@ -330,6 +334,8 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.chat_model = LLM_SETTINGS.chat_model if chat_model is None else chat_model self.chat_model_map = json.loads(LLM_SETTINGS.chat_model_map) self.encoder = self._get_encoder() + self.chat_openai_base_url = LLM_SETTINGS.chat_openai_base_url + self.embedding_openai_base_url = LLM_SETTINGS.embedding_openai_base_url self.chat_api_base = LLM_SETTINGS.chat_azure_api_base if chat_api_base is None else chat_api_base self.chat_api_version = ( LLM_SETTINGS.chat_azure_api_version if chat_api_version is None 
else chat_api_version @@ -345,16 +351,18 @@ def __init__( # noqa: C901, PLR0912, PLR0915 LLM_SETTINGS.embedding_azure_api_version if embedding_api_version is None else embedding_api_version ) - if self.use_azure: - if self.chat_use_azure_token_provider or self.embedding_use_azure_token_provider: - dac_kwargs = {} - if self.managed_identity_client_id is not None: - dac_kwargs["managed_identity_client_id"] = self.managed_identity_client_id - credential = DefaultAzureCredential(**dac_kwargs) - token_provider = get_bearer_token_provider( - credential, - "https://cognitiveservices.azure.com/.default", - ) + if (self.chat_use_azure or self.embedding_use_azure) and ( + self.chat_use_azure_token_provider or self.embedding_use_azure_token_provider + ): + dac_kwargs = {} + if self.managed_identity_client_id is not None: + dac_kwargs["managed_identity_client_id"] = self.managed_identity_client_id + credential = DefaultAzureCredential(**dac_kwargs) + token_provider = get_bearer_token_provider( + credential, + "https://cognitiveservices.azure.com/.default", + ) + if self.chat_use_azure: if self.chat_use_azure_token_provider: self.chat_client = openai.AzureOpenAI( azure_ad_token_provider=token_provider, @@ -367,7 +375,10 @@ def __init__( # noqa: C901, PLR0912, PLR0915 api_version=self.chat_api_version, azure_endpoint=self.chat_api_base, ) + else: + self.chat_client = openai.OpenAI(api_key=self.chat_api_key, base_url=self.chat_openai_base_url) + if self.embedding_use_azure: if self.embedding_use_azure_token_provider: self.embedding_client = openai.AzureOpenAI( azure_ad_token_provider=token_provider, @@ -381,8 +392,9 @@ def __init__( # noqa: C901, PLR0912, PLR0915 azure_endpoint=self.embedding_api_base, ) else: - self.chat_client = openai.OpenAI(api_key=self.chat_api_key) - self.embedding_client = openai.OpenAI(api_key=self.embedding_api_key) + self.embedding_client = openai.OpenAI( + api_key=self.embedding_api_key, base_url=self.embedding_openai_base_url + ) self.dump_chat_cache = LLM_SETTINGS.dump_chat_cache if dump_chat_cache is None else dump_chat_cache self.use_chat_cache = LLM_SETTINGS.use_chat_cache if use_chat_cache is None else use_chat_cache @@ -587,7 +599,7 @@ def _create_embedding_inner_function( filtered_input_content_list[i : i + LLM_SETTINGS.embedding_max_str_num] for i in range(0, len(filtered_input_content_list), LLM_SETTINGS.embedding_max_str_num) ]: - if self.use_azure: + if self.embedding_use_azure: response = self.embedding_client.embeddings.create( model=self.embedding_model, input=sliced_filtered_input_content_list, From 2abdb966da5d73ad270cb7174ba985b329f67918 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 2 Jan 2025 10:34:31 +0000 Subject: [PATCH 160/304] multi-model proposal logic --- .../coder/data_science/model/__init__.py | 4 +- .../data_science/proposal/exp_gen.py | 103 +++++++++++++++--- .../data_science/proposal/prompts.yaml | 67 +++++++++++- 3 files changed, 155 insertions(+), 19 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 12e12d8b1..3b217a49e 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -50,7 +50,7 @@ def implement_one_task( ) user_prompt = T(".prompts:model_coder.user").r( model_spec=workspace.file_dict["spec/model.md"], - latest_code=workspace.file_dict.get("model01.py"), + latest_code=workspace.file_dict.get(f"{target_task.name}.py", ""), ) model_code = 
json.loads( @@ -60,7 +60,7 @@ def implement_one_task( )["code"] return { - "model01.py": model_code, + f"{target_task.name}.py": model_code, } def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index a33b24743..b138d8321 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -19,6 +19,7 @@ from rdagent.scenarios.data_science.experiment.experiment import COMPONENT, DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen from rdagent.utils.agent.tpl import T +import pandas as pd class DSHypothesis(Hypothesis): @@ -62,6 +63,20 @@ def get_sota_hypothesis_and_experiment(self) -> tuple[DSHypothesis | None, Exper if hf.decision: return exp.hypothesis, exp return None, None + + def get_models_information(self) -> tuple[str, int]: + for exp, hf in self.hist[::-1]: + if hf.decision: + wp = exp.experiment_workspace.workspace_path + score_df = pd.read_csv(f"{wp}/score.csv") + filtered_df = score_df.iloc[:-1] + models = filtered_df.to_dict(orient="records") + # TODO: fix name + model_code = exp.sub_workspace_list[0].file_dict.get('spec/model.py', '') + # TODO: 组合模型名,模型代码,模型表现 + models_info = "" + return models_info, len(models) + return "", 0 class DSExpGen(ExpGen): @@ -180,33 +195,90 @@ def gen(self, trace: DSTrace) -> DSExperiment: # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) - # 1. hypothesis gen + # Step 1: Generate component # TODO: how to generate sota solution sota_solution = "" - system_prompt = T(".prompts:hypothesis_gen.system").r( + component_sys_prompt = T(".prompts:component_gen").r( targets="data science project", scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), + hypothesis_output_format=T(".prompts:output_format.component").r(), hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), ) - user_prompt = T(".prompts:hypothesis_gen.user").r( + + component_user_prompt = T(".prompts:hypothesis_gen.user").r( targets="data science project", hypothesis_and_feedback=hypothesis_and_feedback, ) - resp_dict: dict = json.loads( - APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) - ) - hypothesis = DSHypothesis( - component=resp_dict.get("component", "Component not provided"), - hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), - reason=resp_dict.get("reason", "Reason not provided"), - concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"), - concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"), - concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"), - concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), + resp_dict_component: dict = json.loads( + APIBackend().build_messages_and_create_chat_completion(component_user_prompt, component_sys_prompt, json_mode=True) ) + component = resp_dict_component.get("component", "Component not provided") + if component != "Model": + # Step 2: Generate the rest of the hypothesis + hypothesis_sys_prompt = T(".prompts:hypothesis_gen.system").r( + targets="data science project", + scenario=scenario_desc, + hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), + 
hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), + component=component, + ) + hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( + targets="data science project", + hypothesis_and_feedback=hypothesis_and_feedback, + ) + + resp_dict: dict = json.loads( + APIBackend().build_messages_and_create_chat_completion(hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True) + ) + hypothesis = DSHypothesis( + component=resp_dict.get("component", "Component not provided"), + hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), + reason=resp_dict.get("reason", "Reason not provided"), + concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"), + concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"), + concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"), + concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), + ) + else: + # + model_info, model_num = trace.get_models_information() + if model_num >= 3: + hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( + targets="data science project", + scenario=scenario_desc, + hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), + hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), + model_info=model_info, + model_enough=True, + ) + else: + hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( + targets="data science project", + scenario=scenario_desc, + hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), + hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), + model_info=model_info, + model_enough=False, + ) + hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( + targets="data science project", + hypothesis_and_feedback=hypothesis_and_feedback, + ) + resp_dict: dict = json.loads( + APIBackend().build_messages_and_create_chat_completion(hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True) + ) + hypothesis = DSHypothesis( + component=resp_dict.get("component", "Component not provided"), + hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), + reason=resp_dict.get("reason", "Reason not provided"), + concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"), + concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"), + concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"), + concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), + ) + # 2. gen experiment if hypothesis.component == "DataLoadSpec": resp_dict = self.llm_task_gen( @@ -246,7 +318,6 @@ def gen(self, trace: DSTrace) -> DSExperiment: return exp elif hypothesis.component == "Model": resp_dict = self.llm_task_gen( - targets="Models", scenario_desc=scenario_desc, hypothesis=hypothesis, task_output_format=T(".prompts:output_format.model").r(), diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index e34276afa..142c2131b 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -5,9 +5,10 @@ hypothesis_gen: {{scenario}} The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you. 
Your task is to check whether a similar hypothesis has already been generated. If one exists and you agree with it, feel free to use it. If you disagree, please generate an improved version. + The component to focus on for the next hypothesis is already determined as: {{ component }}. {% if hypothesis_specification %} To assist you in formulating new hypotheses, the user has provided some additional information: {{hypothesis_specification}}. - **Important:** If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. + Important: If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. {% endif %} Please generate the output using the following format and specifications: {{ hypothesis_output_format }} @@ -20,6 +21,34 @@ hypothesis_gen: {% endif %} Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. + +hypothesis_model: + system: |- + The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. + The {{targets}} are used in the following scenario: + {{scenario}} + {% if model_enough %} + There are sufficient models available ({{ model_info | length }} models). Your task is to choose one of the existing models for further tuning or optimization. Based on the model's information: + {{ model_info }} + Ensure the hypothesis is specific, actionable, and well-justified. + {% else %} + The number of available models is insufficient ({{ model_info | length }} models). Your task is to first decide whether to: + - Tune an existing model: Select one of the current models for further tuning and improvement. + - Add a new model: Introduce a new model to expand the hypothesis space. + Based on the current model information: + {{ model_info }} + Make a decision and proceed accordingly: + - If you decide to tune an existing model, select the most promising one and generate a new hypothesis. + - If you decide to add a new model, specify the type of model you would add and generate a new hypothesis related to the new model. + {% endif %} + {% if hypothesis_specification %} + To assist you in formulating new hypotheses, the user has provided some additional information: {{hypothesis_specification}}. + Important: If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. + {% endif %} + Please generate the output using the following format and specifications: + {{ hypothesis_output_format }} + + task_gen: system: |- {% if hypothesis is not none %} @@ -52,6 +81,38 @@ task_gen: Please generate the new {{targets}} task. {% endif %} +task_gen_model: + system: |- + {% if hypothesis is not none %} + The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. + {% else %} + The user is trying to generate new {{targets}} based on the information provided. + {% endif %} + The {{targets}} are used in certain scenario, the scenario is as follows: + {{ scenario }} + + {% if hypothesis is not none %} + The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{targets}} for. + 2. 
The hypothesis generated in the previous steps and their corresponding feedbacks. + 3. Former proposed {{targets}} on similar hypothesis. + 4. Some additional information to help you generate new {{targets}}. + {% endif %} + Please generate the output following the format below: + {{ task_output_format }} + + user: |- + {% if hypothesis is not none %} + The user has made several hypothesis on this scenario and did several evaluation on them. + The target hypothesis you are targeting to generate {{targets}} for is as follows: + {{ hypothesis }} + The former hypothesis and the corresponding feedbacks are as follows: + {{ hypothesis_and_feedback }} + Please generate the new {{targets}} based on the information above. + {% else %} + Please generate the new {{targets}} task. + {% endif %} + hypothesis_and_feedback: |- {% for experiment, feedback in trace.hist[-10:] %} Hypothesis {{ loop.index }}: {{ experiment.hypothesis }} @@ -66,6 +127,10 @@ hypothesis_specification: |- {{ sota_solution}} output_format: + component: |- + { + "component": "The component you suggest to focus on. It must be one of ['DataLoadSpec', 'FeatureEng', 'Model', 'Ensemble', 'Workflow']." + } hypothesis: |- The output should follow JSON format. The schema is as follows: { From e4af4119f70e875a68fb113044ca7285e55c1c53 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 2 Jan 2025 11:01:23 +0000 Subject: [PATCH 161/304] fix a small syntax error --- rdagent/core/exception.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rdagent/core/exception.py b/rdagent/core/exception.py index 0e12084f9..c789f5663 100644 --- a/rdagent/core/exception.py +++ b/rdagent/core/exception.py @@ -6,6 +6,7 @@ class CoderError(Exception): The more detailed evaluation in dataframe values are managed by the evaluator. """ + # NOTE: it corresponds to the error of **component** @@ -31,8 +32,9 @@ class RunnerError(Exception): """ Exceptions raised when running the code output. 
""" + # NOTE: it corresponds to the error of whole **project** - def __init__(self, message, ws: ): + def __init__(self, message, ws): super().__init__(message) self.message = message From f97092fb2fb65f96012f54eddafe4d62ef64d8ee Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 2 Jan 2025 12:12:47 +0000 Subject: [PATCH 162/304] loopBase and some changes --- rdagent/app/data_science/loop.py | 18 +++++++++--------- .../coder/CoSTEER/evolving_agent.py | 2 +- .../coder/data_science/ensemble/eval.py | 14 ++------------ .../ensemble/eval_tests/ensemble_test.py | 19 +++++++++---------- .../coder/data_science/feature/eval.py | 6 +++--- .../coder/data_science/model/eval.py | 7 +++---- .../data_science/raw_data_loader/eval.py | 5 ++--- .../coder/data_science/workflow/eval.py | 14 ++++++++------ rdagent/core/exception.py | 3 --- rdagent/scenarios/data_science/dev/runner.py | 15 ++++++++------- rdagent/utils/workflow.py | 1 - 11 files changed, 45 insertions(+), 59 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 3ada6e0b8..b7b2faae4 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -86,16 +86,13 @@ def running(self, prev_out: dict[str, Any]): else: return exp - def feedback(self, prev_out: dict[str, Any]): + def feedback(self, prev_out: dict[str, Any]) -> ExperimentFeedback: exp: DSExperiment = prev_out["running"] if exp.next_component_required() is None: feedback = self.summarizer.generate_feedback(exp, self.trace) else: - feedback = HypothesisFeedback( - observations="Not all 5 components are completed, skip feedback of DataScienceRDLoop.", - hypothesis_evaluation="", - new_hypothesis="", - reason="", + feedback = ExperimentFeedback( + reason=f"{exp.hypothesis.component} is completed.", decision=True, ) return feedback @@ -105,9 +102,12 @@ def record(self, prev_out: dict[str, Any]): if e is None: self.trace.hist.append((prev_out["running"], prev_out["feedback"])) else: - # TODO: Please judge the type of the exception. - # Record the `experiment` when raising the exception. 
- self.trace.hist.append((prev_out.get("direct_exp_gen", None) or prev_out.get("running", None), ExperimentFeedback.from_exception(e))) + self.trace.hist.append( + ( + prev_out["direct_exp_gen"] if isinstance(e, CoderError) else prev_out["coding"], + ExperimentFeedback.from_exception(e) + ) + ) def main(path=None, step_n=None, competition="bms-molecular-translation"): diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index 9a67f08c0..50b22a5f7 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -18,6 +18,6 @@ def filter_evolvable_subjects_by_feedback( evo.sub_workspace_list[index].clear() if all(not f.final_decision for f in feedback if f): - raise CoderError("All feedbacks of sub tasks are negative.") + raise CoderError("All tasks are failed") return evo diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index b969b5db0..cef32bf42 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -2,15 +2,11 @@ from dataclasses import dataclass from pathlib import Path -import numpy as np - from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERSingleFeedback, ) -from rdagent.core.evaluation import Feedback from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T @@ -48,16 +44,10 @@ def evaluate( de = DockerEnv(conf=DSDockerConf()) fname = "ensemble_test.py" - with (DIRNAME / "eval_tests" / "ensemble_test.py").open("r") as f: - test_code = f.read() - implementation.inject_files(**{fname: test_code}) + test_code = (DIRNAME / "eval_tests" / "ensemble_test.py").read_text() + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - # Check if the metrics file is generated - score_fp = implementation.workspace_path / "scores.csv" - if not score_fp.exists(): - raise CoderError("Metrics file (scores.csv) is not generated.") - system_prompt = T(".prompts:ensemble_eval.system").r( test_code=test_code, code=implementation.file_dict["ensemble.py"] ) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index 7bc685b3f..63e8d1659 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -6,13 +6,9 @@ - Use validation data appropriately """ -import logging - import numpy as np from ensemble import ens_and_decision - -# Setup logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +from pathlib import Path # Create test data n_models = 3 @@ -30,10 +26,13 @@ # Check shape assert final_predictions.shape == (n_samples, 1), "Wrong output shape" - logging.info("Ensemble test passed successfully.") - logging.info(f"Output shape: {final_predictions.shape}") - logging.info(f"Unique values in predictions: {np.unique(final_predictions)}") + # check if scores.csv is generated + if not Path("scores.csv").exists(): + raise Exception("scores.csv is not generated") + + print("Ensemble test passed 
successfully.") + print(f"Output shape: {final_predictions.shape}") + print(f"Unique values in predictions: {np.unique(final_predictions)}") except Exception as e: - logging.error(f"Test failed: {str(e)}") - raise + print(f"Test failed: {str(e)}") diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index 45e894c18..82408b608 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -48,9 +48,9 @@ def evaluate( # TODO: do we need to clean the generated temporary content? fname = "feature_test.py" - with (DIRNAME / "eval_tests" / "feature_test.py").open("r") as f: - test_code = f.read() - implementation.inject_files(**{fname: test_code}) + test_code = (DIRNAME / "eval_tests" / "feature_test.py").read_text() + implementation.inject_files(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") if stdout is None: stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 517865357..728d87516 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -66,14 +66,13 @@ def evaluate( ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) fname = "model_test.py" - with (DIRNAME / "eval_tests" / fname).open("r") as f: - test_code = f.read() - implementation.inject_files(**{fname: test_code}) + test_code = (DIRNAME / "eval_tests" / fname).read_text() + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") if stdout is None: stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." system_prompt = T(".prompts:model_eval.system").r( - test_code=test_code, scenario="No scenario information yet.", spec=implementation.file_dict["spec/model.md"] + test_code=test_code, scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/model.md"] ) user_prompt = T(".prompts:model_eval.user").r( stdout=stdout, diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index b3c78333e..4d1031fed 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -53,9 +53,8 @@ def evaluate( # TODO: do we need to clean the generated tempory content? fname = "data_loader_test.py" - with (DIRNAME / "eval_tests" / "data_loader_test.py").open("r") as f: - test_code = f.read() - implementation.inject_files(**{fname: test_code}) + test_code = (DIRNAME / "eval_tests" / "data_loader_test.py").read_text() + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") if stdout is None: stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." 
diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 099a68e76..e809d39d8 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -58,17 +58,19 @@ def evaluate( stdout = implementation.execute(env=de, entry=f"python {fname}") # Check if the submission file and score file are generated - submission_fp = implementation.workspace_path / "submission.csv" score_fp = implementation.workspace_path / "scores.csv" - if not submission_fp.exists(): - raise CoderError("Submission file (submission.csv) is not generated.") + submission_fp = implementation.workspace_path / "submission.csv" if not score_fp.exists(): - raise CoderError("Metrics file (scores.csv) is not generated.") + stdout += "Metrics file (scores.csv) is not generated." + if not submission_fp.exists(): + stdout += "Submission file (submission.csv) is not generated." if stdout is None: - stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." + stdout = "The execution exceeded the time limit." + system_prompt = T(".prompts:workflow_eval.system").r( - scenario="No scenario information yet.", spec=implementation.file_dict["spec/workflow.md"] + scenario=self.scen.get_scenario_all_desc(), + spec=implementation.file_dict["spec/workflow.md"] ) user_prompt = T(".prompts:workflow_eval.user").r( stdout=stdout, diff --git a/rdagent/core/exception.py b/rdagent/core/exception.py index 0e12084f9..25e6eb5fb 100644 --- a/rdagent/core/exception.py +++ b/rdagent/core/exception.py @@ -32,9 +32,6 @@ class RunnerError(Exception): Exceptions raised when running the code output. """ # NOTE: it corresponds to the error of whole **project** - def __init__(self, message, ws: ): - super().__init__(message) - self.message = message class FactorEmptyError(Exception): diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py index d52777638..dee050886 100644 --- a/rdagent/scenarios/data_science/dev/runner.py +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -2,7 +2,7 @@ from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.developer import Developer -from rdagent.core.exception import CoderError +from rdagent.core.exception import RunnerError from rdagent.log import rdagent_logger as logger from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.utils.env import DockerEnv, DSDockerConf @@ -16,17 +16,18 @@ def develop(self, exp: DSExperiment) -> DSExperiment: de = DockerEnv(conf=ds_docker_conf) # execute workflow - exp.experiment_workspace.execute(env=de, entry="python main.py") - submission_fp = exp.experiment_workspace.workspace_path / "submission.csv" + stdout = exp.experiment_workspace.execute(env=de, entry="python main.py") + score_fp = exp.experiment_workspace.workspace_path / "scores.csv" + if not score_fp.exists(): + logger.error("Metrics file (scores.csv) is not generated.") + raise RunnerError(f"Metrics file (scores.csv) is not generated, log is:\n{stdout}") + submission_fp = exp.experiment_workspace.workspace_path / "submission.csv" if not submission_fp.exists(): logger.error("Submission file (submission.csv) is not generated.") - raise CoderError("Submission file (submission.csv) is not generated.") + raise RunnerError(f"Submission file (submission.csv) is not generated, log is:\n{stdout}") - if not score_fp.exists(): - logger.error("Metrics file 
(scores.csv) is not generated.") - raise CoderError("Metrics file (scores.csv) is not generated.") exp.result = pd.read_csv(score_fp, index_col=0) return exp diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index d11355365..4f4a7ace4 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -17,7 +17,6 @@ from tqdm.auto import tqdm -from rdagent.core.exception import CoderError from rdagent.log import rdagent_logger as logger From 386fffac428f6626e27a0f1fc5885e2bca5a7f2f Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 15:05:33 +0000 Subject: [PATCH 163/304] ensemble scores change --- .../ensemble/eval_tests/ensemble_test.py | 6 ++--- .../data_science/raw_data_loader/prompts.yaml | 23 ++++++++----------- .../coder/data_science/workflow/__init__.py | 4 ++-- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index 63e8d1659..b6277a7b3 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -15,13 +15,13 @@ n_samples = 100 # Create synthetic predictions -test_pred_l = [np.random.rand(n_samples, 1) for _ in range(n_models)] -val_pred_l = [np.random.rand(n_samples, 1) for _ in range(n_models)] +test_preds_dict = {f"model_{i}": np.random.rand(n_samples, 1) for i in range(n_models)} +val_preds_dict = {f"model_{i}": np.random.rand(n_samples, 1) for i in range(n_models)} val_label = np.random.randint(0, 2, (n_samples, 1)) # Run ensemble try: - final_predictions = ens_and_decision(test_pred_l, val_pred_l, val_label) + final_predictions = ens_and_decision(test_preds_dict, val_preds_dict, val_label) # Check shape assert final_predictions.shape == (n_samples, 1), "Wrong output shape" diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 6cc5b59d1..52f314ef2 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -181,8 +181,8 @@ spec: 1. Function Interface: - Function Name: `ens_and_decision` - Parameters: - - `test_pred_l` (List[DT]): A list of predictions for the test data. - - `val_pred_l` (List[DT]): A list of predictions for the validation data. + - `test_preds_dict` (Dict[str, DT]): A dictionary of test predictions from different models. + - `val_preds_dict` (Dict[str, DT]): A dictionary of validation predictions from different models. - `val_label` (DT): A 1D array or series of true labels for the validation data. - Output: - `final_predictions` (DT): A 1D array or series containing the final predictions for the test data. @@ -193,23 +193,20 @@ spec: 2. Precautions: - Validation of Inputs: - - Ensure all predictions in `test_pred_l` and `val_pred_l` have consistent shapes and dimensions. - - Verify that `val_label` is provided and matches the length of `val_pred_l` predictions. + - Ensure all predictions in `test_preds_dict` and `val_preds_dict` have consistent shapes and dimensions. + - Verify that `val_label` is provided and matches the length of `val_preds_dict` predictions. - Handle empty or invalid inputs gracefully with appropriate error messages. 
- You should calculate the metric for each model and ensemble strategy, and save the results in a CSV file, e.g.: ```python - scores = [] - for id, val_pred in enumerate(val_pred_l): - scores.append(calculate_metric(val_label, val_pred)) + scores = {} + for model_name, val_pred in val_preds_dict.items(): + scores[model_name] = calculate_metric(val_label, val_pred) ... some code about ensemble strategy ... + ensemble_score = calculate_metric(val_label, ensemble_pred) + scores[] = ensemble_score - scores_df = pd.DataFrame( - { - "Model": list(range(len(val_pred_l))) + [""], - "": scores + [], - } - ) + scores_df = pd.DataFrame(scores.items(), columns=['Model', ]) scores_df.to_csv("scores.csv", index=False) ``` - Consensus Strategy: diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index cd697873f..e587997aa 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -54,13 +54,13 @@ def implement_one_task( latest_code=workspace.file_dict.get("main.py"), workflow_spec=workspace.file_dict["spec/workflow.md"], ) - data_loader_code = json.loads( + workflow_code = json.loads( APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) )["code"] - return {"main.py": data_loader_code} + return {"main.py": workflow_code} def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): """ From 749c6ca73a867b588f2b809993868fcc2c30c683 Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 15:44:14 +0000 Subject: [PATCH 164/304] fbworkspace.code -> .all_codes --- .../coder/CoSTEER/knowledge_management.py | 2 +- .../data_science/raw_data_loader/prompts.yaml | 4 ++-- .../components/coder/factor_coder/prompts.yaml | 16 ++++++++-------- .../components/coder/model_coder/eva_utils.py | 4 ++-- .../components/coder/model_coder/prompts.yaml | 6 +++--- rdagent/core/experiment.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/CoSTEER/knowledge_management.py index 7d05e32bb..d24bf53a9 100644 --- a/rdagent/components/coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/CoSTEER/knowledge_management.py @@ -48,7 +48,7 @@ def __init__( def get_implementation_and_feedback_str(self) -> str: return f"""------------------implementation code:------------------ -{self.implementation.code} +{self.implementation.all_codes} ------------------implementation feedback:------------------ {self.feedback!s} """ diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 52f314ef2..d54568ecd 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -284,7 +284,7 @@ data_loader_coder: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.all_codes }} {% endfor %} {% endif %} @@ -292,7 +292,7 @@ data_loader_coder: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} 
Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.all_codes }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/components/coder/factor_coder/prompts.yaml b/rdagent/components/coder/factor_coder/prompts.yaml index 94a0c02b3..a9836cffb 100644 --- a/rdagent/components/coder/factor_coder/prompts.yaml +++ b/rdagent/components/coder/factor_coder/prompts.yaml @@ -52,7 +52,7 @@ evolving_strategy_factor_implementation_v1_system: |- {% if queried_former_failed_knowledge|length != 0 %} --------------Your former latest attempt:--------------- =====Code to the former implementation===== - {{ queried_former_failed_knowledge[-1].implementation.code }} + {{ queried_former_failed_knowledge[-1].implementation.all_codes }} =====Feedback to the former implementation===== {{ queried_former_failed_knowledge[-1].feedback }} {% endif %} @@ -74,9 +74,9 @@ evolving_strategy_factor_implementation_v2_user: |- --------------Factor information to similar error ({{error_content}}):--------------- {{ similar_error_knowledge[0].target_task.get_task_information() }} =====Code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[0].implementation.code }} + {{ similar_error_knowledge[0].implementation.all_codes }} =====Success code to former code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[1].implementation.code }} + {{ similar_error_knowledge[1].implementation.all_codes }} {% endfor %} {% else %} Recall your last failure, your implementation met some errors. @@ -91,13 +91,13 @@ evolving_strategy_factor_implementation_v2_user: |- =====Factor {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.all_codes }} {% endfor %} {% endif %} {% if latest_attempt_to_latest_successful_execution is not none %} You have tried to correct your former failed code but still met some errors. 
Here is the latest attempt to the latest successful execution, try not to get the same error to your new code: =====Your latest attempt===== - {{ latest_attempt_to_latest_successful_execution.implementation.code }} + {{ latest_attempt_to_latest_successful_execution.implementation.all_codes }} =====Feedback to your latest attempt===== {{ latest_attempt_to_latest_successful_execution.feedback }} {% endif %} @@ -126,9 +126,9 @@ evolving_strategy_error_summary_v2_user: |- --------------Factor information to similar error ({{error_content}}):--------------- {{ similar_error_knowledge[0].target_task.get_task_information() }} =====Code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[0].implementation.code }} + {{ similar_error_knowledge[0].implementation.all_codes }} =====Success code to former code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[1].implementation.code }} + {{ similar_error_knowledge[1].implementation.all_codes }} {% endfor %} {% endif %} @@ -158,7 +158,7 @@ select_implementable_factor_user: |- --------------Your former attempt:--------------- {% for former_attempt in factor_info[2] %} =====Code to attempt {{ loop.index }}===== - {{ former_attempt.implementation.code }} + {{ former_attempt.implementation.all_codes }} =====Feedback to attempt {{ loop.index }}===== {{ former_attempt.feedback }} {% endfor %} diff --git a/rdagent/components/coder/model_coder/eva_utils.py b/rdagent/components/coder/model_coder/eva_utils.py index 366000f2e..1d2bb88f7 100644 --- a/rdagent/components/coder/model_coder/eva_utils.py +++ b/rdagent/components/coder/model_coder/eva_utils.py @@ -68,7 +68,7 @@ def evaluate( assert isinstance(gt_implementation, ModelFBWorkspace) model_task_information = target_task.get_task_information() - code = implementation.code + code = implementation.all_codes system_prompt = ( Environment(undefined=StrictUndefined) @@ -94,7 +94,7 @@ def evaluate( code=code, model_execution_feedback=execution_feedback_to_render, model_value_feedback=model_value_feedback, - gt_code=gt_implementation.code if gt_implementation else None, + gt_code=gt_implementation.all_codes if gt_implementation else None, ) ) if ( diff --git a/rdagent/components/coder/model_coder/prompts.yaml b/rdagent/components/coder/model_coder/prompts.yaml index 8742bd26b..126c986af 100644 --- a/rdagent/components/coder/model_coder/prompts.yaml +++ b/rdagent/components/coder/model_coder/prompts.yaml @@ -65,7 +65,7 @@ evolving_strategy_model_coder: {% if queried_former_failed_knowledge|length != 0 %} --------------Your former latest attempt:--------------- =====Code to the former implementation===== - {{ queried_former_failed_knowledge[-1].implementation.code }} + {{ queried_former_failed_knowledge[-1].implementation.all_codes }} =====Feedback to the former implementation===== {{ queried_former_failed_knowledge[-1].feedback }} {% endif %} @@ -85,7 +85,7 @@ evolving_strategy_model_coder: =====Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.all_codes }} {% endfor %} {% endif %} @@ -93,7 +93,7 @@ evolving_strategy_model_coder: --------------Former failed code:--------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} =====Code to implementation {{ loop.index }}===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.all_codes }} 
=====Feedback to implementation {{ loop.index }}===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index adc71a77d..3336b6578 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -115,7 +115,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex @property - def code(self) -> str: + def all_codes(self) -> str: code_string = "" for file_name, code in self.file_dict.items(): if file_name.endswith(".py") and "test" not in file_name: From f159b11b0c2d48bcb85d716920af7722c47dc8db Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 15:52:25 +0000 Subject: [PATCH 165/304] use all model codes in workflow coder --- .../components/coder/data_science/workflow/__init__.py | 2 +- .../components/coder/data_science/workflow/prompts.yaml | 5 ++--- rdagent/core/experiment.py | 8 ++++++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index e587997aa..92948a57d 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -49,7 +49,7 @@ def implement_one_task( user_prompt = T(".prompts:workflow_coder.user").r( load_data_code=workspace.file_dict["load_data.py"], feature_code=workspace.file_dict["feature.py"], - model_code=workspace.file_dict["model01.py"], + model_codes=workspace.get_codes(r'^model_.+\.py$'), ensemble_code=workspace.file_dict["ensemble.py"], latest_code=workspace.file_dict.get("main.py"), workflow_spec=workspace.file_dict["spec/workflow.md"], diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index cc880b4c5..67608b352 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -13,7 +13,7 @@ workflow_coder: The code you implement should align with the framework given in the specifications. After predicting the output, print the shape and other information of the output to stdout to help the evaluator assess the code. - + Please respond with the code in the following JSON format. Here is an example structure for the JSON output: { "code": "The Python code as a string." @@ -53,8 +53,7 @@ workflow_coder: ---------model training code--------- Attention: The input and output of the model function is flexible. Training dataset is necessary, but validation and test dateset might be optional. The hyperparameters can either be passed as arguments or be set as default values in the function. You need to use the function correctly. 
- file: model01.py - {{ model_code }} + {{ model_codes }} ---------ensemble code--------- file: ensemble.py diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 3336b6578..8eba11c2b 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import re import platform import shutil import typing @@ -122,6 +123,13 @@ def all_codes(self) -> str: code_string += f"File: {file_name}\n{code}\n" return code_string + def get_codes(self, pattern: str) -> str: + code_string = "" + for file_name, code in self.file_dict.items(): + if re.search(pattern, file_name) and file_name.endswith(".py") and "test" not in file_name: + code_string += f"File: {file_name}\n{code}\n" + return code_string + def prepare(self) -> None: """ Prepare the workspace except the injected code From 3b22e7cbf5134d7dc7a05a1b45525e9245f10dd0 Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 16:02:01 +0000 Subject: [PATCH 166/304] check scores.csv's keys(model_names) --- .../components/coder/data_science/workflow/eval.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index e809d39d8..16ee8219e 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -1,5 +1,7 @@ import json from pathlib import Path +import pandas as pd +import re from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( @@ -57,11 +59,18 @@ def evaluate( fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") - # Check if the submission file and score file are generated + # Check score file score_fp = implementation.workspace_path / "scores.csv" - submission_fp = implementation.workspace_path / "submission.csv" if not score_fp.exists(): stdout += "Metrics file (scores.csv) is not generated." + score_df = pd.read_csv(score_fp, index_col=0) + model_set_in_scores = set(score_df.index) + model_set_in_folder = set(f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f)) + if model_set_in_scores != model_set_in_folder: + stdout += "The models used by ensemble are not consistent with the models in the workspace." + + # Check submission file + submission_fp = implementation.workspace_path / "submission.csv" if not submission_fp.exists(): stdout += "Submission file (submission.csv) is not generated." 
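For reference, the scores.csv consistency check added above can be read as the following standalone sketch. The function name and on-disk layout are illustrative only (the evaluator itself inspects the workspace's file_dict rather than the filesystem), and filtering out non-model rows such as an ensemble entry is an assumption of the sketch.

```python
# Standalone sketch: confirm that the model names recorded in scores.csv
# match the model_*.py files present in a workspace directory.
# Rows that do not follow the model_ naming scheme (e.g. an ensemble row)
# are ignored here; that filtering is an assumption of this sketch.
import re
from pathlib import Path

import pandas as pd


def scores_match_models(workspace: Path) -> bool:
    score_fp = workspace / "scores.csv"
    if not score_fp.exists():
        return False  # metrics file was never produced
    score_df = pd.read_csv(score_fp, index_col=0)
    names_in_scores = {n for n in score_df.index if str(n).startswith("model_")}
    names_in_folder = {
        p.name[:-3] for p in workspace.iterdir() if re.match(r"^model_.+\.py$", p.name)
    }
    return names_in_scores == names_in_folder


# Hypothetical usage:
# print(scores_match_models(Path("./some_workspace")))
```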
From f4b1dd2349365ffb2765cd4a401622b7a6b01079 Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 16:17:08 +0000 Subject: [PATCH 167/304] model name changes --- .../coder/data_science/model/eval.py | 19 +++++-------------- .../{model_test.py => test_model.py} | 0 .../data_science/raw_data_loader/prompts.yaml | 2 +- .../data_science/proposal/prompts.yaml | 2 +- 4 files changed, 7 insertions(+), 16 deletions(-) rename rdagent/components/coder/data_science/model/eval_tests/{model_test.py => test_model.py} (100%) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 728d87516..487f2acd1 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -18,13 +18,9 @@ from rdagent.utils.env import DockerEnv, DSDockerConf DIRNAME = Path(__file__).absolute().resolve().parent - - ModelSingleFeedback = CoSTEERSingleFeedback - # Below are unit tests for testing the specification of the implemented model ------------------ -# class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): """ Motivation case: @@ -55,22 +51,19 @@ def evaluate( code="This task has failed too many times, skip implementation.", final_decision=False, ) - # assert isinstance(target_task, ModelTask) - batch_size = 8 - assert isinstance(implementation, FBWorkspace) - """model_execution_feedback, pred_list= implementation.execute( - batch_size=batch_size, - )""" ds_docker_conf = DSDockerConf() ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) - fname = "model_test.py" - test_code = (DIRNAME / "eval_tests" / fname).read_text() + + fname = "test_model.py" + test_code = (DIRNAME / "eval_tests" / fname).read_text().replace("model01", target_task.name) # only check the model changed this time implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") + if stdout is None: stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." + system_prompt = T(".prompts:model_eval.system").r( test_code=test_code, scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/model.md"] ) @@ -81,8 +74,6 @@ def evaluate( resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return ModelSingleFeedback(**json.loads(resp)) - """feedback""" - class XXX2SpecEval: """ diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.py b/rdagent/components/coder/data_science/model/eval_tests/test_model.py similarity index 100% rename from rdagent/components/coder/data_science/model/eval_tests/model_test.py rename to rdagent/components/coder/data_science/model/eval_tests/test_model.py diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index d54568ecd..ccb682201 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -238,7 +238,7 @@ spec: - Integrate the following components into the workflow: - Data loading (`load_data.py`). - Feature engineering (`feature.py`). - - Model workflow for training and testing (`model*.py`). + - Model workflow for training and testing (`model_*.py`). - Ensemble and decision-making (`ensemble.py`). 
- Treat each component as a modular and callable Python function. diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 142c2131b..23f3b90e0 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -160,7 +160,7 @@ output_format: According to the hypothesis, please help user design one model task. The output should follow JSON format. The schema is as follows: { - "model_name": "model_name", + "model_name": "model name, must start with 'model_' and only contain letters, numbers, and underscores", "description": "A detailed description of the model", "model_type": "The type of the model, e.g., neural network, tree-based model, etc.", "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", From ff710d611e1ae83fa4755c2aee241145a8573625 Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 16:18:21 +0000 Subject: [PATCH 168/304] add a todo in ensemble test --- .../coder/data_science/ensemble/eval_tests/ensemble_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index b6277a7b3..82385229a 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -14,6 +14,7 @@ n_models = 3 n_samples = 100 +# TODO: use real data. # Create synthetic predictions test_preds_dict = {f"model_{i}": np.random.rand(n_samples, 1) for i in range(n_models)} val_preds_dict = {f"model_{i}": np.random.rand(n_samples, 1) for i in range(n_models)} From aac53494b96d6a93dd94cd63ecc2bd756d7fc2fe Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 16:40:23 +0000 Subject: [PATCH 169/304] sota_exp changes --- .../data_science/proposal/exp_gen.py | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index b138d8321..b34f31562 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -57,12 +57,12 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = self.hist: list[tuple[DSExperiment, ExperimentFeedback]] = [] self.knowledge_base = knowledge_base - def get_sota_hypothesis_and_experiment(self) -> tuple[DSHypothesis | None, Experiment | None]: - """Access the last experiment result, sub-task, and the corresponding hypothesis.""" - for exp, hf in self.hist[::-1]: - if hf.decision: - return exp.hypothesis, exp - return None, None + def sota_experiment(self) -> Experiment | None: + """Access the last experiment result.""" + for exp, ef in self.hist[::-1]: + if ef.decision: + return exp + return None def get_models_information(self) -> tuple[str, int]: for exp, hf in self.hist[::-1]: @@ -117,6 +117,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: next_component = trace.hist[-1][0].next_component_required() scenario_desc = trace.scen.get_scenario_all_desc() + sota_exp = trace.sota_experiment() if next_component == "DataLoadSpec": resp_dict = self.llm_task_gen( targets="Data loader and specification generation", @@ -138,13 +139,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, 
task_output_format=T(".prompts:output_format.feature").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + ft = FeatureTask( name="Feature Engineering", description=resp_dict.get("description", "Factor description not provided"), ) exp = DSExperiment(sub_tasks=[ft], hypothesis=DSHypothesis("FeatureEng")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp elif next_component == "Model": resp_dict = self.llm_task_gen( @@ -152,7 +153,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.model").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), @@ -161,7 +162,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), ) exp = DSExperiment(sub_tasks=[mt], hypothesis=DSHypothesis("Model")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp elif next_component == "Ensemble": resp_dict = self.llm_task_gen( @@ -169,13 +170,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.ensemble").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), ) exp = DSExperiment(sub_tasks=[et], hypothesis=DSHypothesis("Ensemble")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp elif next_component == "Workflow": resp_dict = self.llm_task_gen( @@ -183,20 +184,18 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc=scenario_desc, task_output_format=T(".prompts:output_format.workflow").r(), ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), ) exp = DSExperiment(sub_tasks=[wt], hypothesis=DSHypothesis("Workflow")) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp else: # propose new component by LLM # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) - # Step 1: Generate component - # TODO: how to generate sota solution sota_solution = "" component_sys_prompt = T(".prompts:component_gen").r( targets="data science project", @@ -296,7 +295,9 @@ def gen(self, trace: DSTrace) -> DSExperiment: ), ) - return DSExperiment(sub_tasks=[dt], hypothesis=hypothesis) + exp = DSExperiment(sub_tasks=[dt], hypothesis=hypothesis) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + return exp elif hypothesis.component == "FeatureEng": # TODO: RAG resp_dict = self.llm_task_gen( @@ -307,14 +308,14 @@ def gen(self, trace: DSTrace) -> 
DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] + ft = FeatureTask( name="Feature Engineering", description=resp_dict.get("description", "Feature description not provided"), ) exp = DSExperiment(sub_tasks=[ft], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Model": resp_dict = self.llm_task_gen( @@ -324,7 +325,6 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] mt = ModelTask( name=resp_dict.get("model_name", "Model name not provided"), description=resp_dict.get("description", "Model description not provided"), @@ -335,7 +335,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) exp = DSExperiment(sub_tasks=[mt], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Ensemble": resp_dict = self.llm_task_gen( @@ -346,14 +346,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] et = EnsembleTask( name="Ensemble", description=resp_dict.get("description", "Ensemble description not provided"), ) exp = DSExperiment(sub_tasks=[et], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp elif hypothesis.component == "Workflow": resp_dict = self.llm_task_gen( @@ -364,14 +363,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - dependency_exp = trace.get_sota_hypothesis_and_experiment()[1] wt = WorkflowTask( name="Workflow", description=resp_dict.get("description", "Workflow description not provided"), ) exp = DSExperiment(sub_tasks=[wt], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(dependency_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp return super().gen(trace) From 07a3ef7ebc8938324708f5e88bea17477cd305dc Mon Sep 17 00:00:00 2001 From: bowen xian Date: Thu, 2 Jan 2025 16:53:08 +0000 Subject: [PATCH 170/304] give model info in exp gen --- .../data_science/proposal/exp_gen.py | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index b34f31562..b3dbc8e34 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -1,4 +1,5 @@ import json +import re from typing import Literal from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask @@ -63,20 +64,6 @@ def sota_experiment(self) -> Experiment | None: if ef.decision: return exp return None - - def get_models_information(self) -> tuple[str, int]: - for exp, hf in self.hist[::-1]: - if hf.decision: - wp = exp.experiment_workspace.workspace_path - score_df 
= pd.read_csv(f"{wp}/score.csv") - filtered_df = score_df.iloc[:-1] - models = filtered_df.to_dict(orient="records") - # TODO: fix name - model_code = exp.sub_workspace_list[0].file_dict.get('spec/model.py', '') - # TODO: 组合模型名,模型代码,模型表现 - models_info = "" - return models_info, len(models) - return "", 0 class DSExpGen(ExpGen): @@ -241,15 +228,23 @@ def gen(self, trace: DSTrace) -> DSExperiment: concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), ) else: - # - model_info, model_num = trace.get_models_information() + model_infos = [] + score_df = pd.read_csv(sota_exp.experiment_workspace.workspace_path / "score.csv", index_col=0) + metric_name = score_df.columns[0] + for fname in sota_exp.experiment_workspace.file_dict: + if re.match(r"^model_.+\.py", fname): + model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{sota_exp.experiment_workspace.file_dict[fname]}\n```\n" + model_infos.append(model_str) + + model_num = len(model_infos) + models_info_str = ("-"*20).join(model_infos) if model_num >= 3: hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( targets="data science project", scenario=scenario_desc, hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - model_info=model_info, + model_info=models_info_str, model_enough=True, ) else: @@ -258,7 +253,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: scenario=scenario_desc, hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - model_info=model_info, + model_info=models_info_str, model_enough=False, ) hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( From f6b55f6091ba95496bef43058b3e640c38060aef Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 3 Jan 2025 03:15:37 +0000 Subject: [PATCH 171/304] add runner time limit --- rdagent/scenarios/data_science/dev/runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py index dee050886..20083eb76 100644 --- a/rdagent/scenarios/data_science/dev/runner.py +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -12,6 +12,7 @@ class DSRunner(Developer[DSExperiment]): def develop(self, exp: DSExperiment) -> DSExperiment: ds_docker_conf = DSDockerConf() ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.running_timeout_period = 60 * 60 # 1 hours de = DockerEnv(conf=ds_docker_conf) From 9f4c84d411c5763cd3091a9c5f8a3bd2c9bbbc62 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 3 Jan 2025 03:18:02 +0000 Subject: [PATCH 172/304] config using debug data or not in evals --- rdagent/components/coder/data_science/feature/eval.py | 2 +- rdagent/components/coder/data_science/model/eval.py | 2 +- rdagent/components/coder/data_science/raw_data_loader/eval.py | 2 +- rdagent/components/coder/data_science/workflow/eval.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index 82408b608..2be1fc8eb 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -43,7 +43,7 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - 
ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated temporary content? diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 487f2acd1..35e093337 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -53,7 +53,7 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) fname = "test_model.py" diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 4d1031fed..b7ae94f98 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -48,7 +48,7 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated tempory content? diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 16ee8219e..7ba40a708 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -54,7 +54,7 @@ def evaluate( final_decision=False, ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") From f51b35e90a89bfe6d33ff5ae39a972e15e843e33 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 3 Jan 2025 03:31:10 +0000 Subject: [PATCH 173/304] exp to feedback base --- rdagent/scenarios/data_science/dev/feedback.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index 7defbecb2..cd32d8803 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -4,7 +4,7 @@ from rdagent.components.knowledge_management.graph import UndirectedNode from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts -from rdagent.core.proposal import Experiment2Feedback, HypothesisFeedback +from rdagent.core.proposal import Experiment2Feedback, HypothesisFeedback, ExperimentFeedback from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.data_science.experiment.experiment import DSExperiment @@ -14,7 +14,14 @@ class DSExperiment2Feedback(Experiment2Feedback): - def generate_feedback(self, exp: DSExperiment, trace: 
DSTrace) -> HypothesisFeedback: + def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeedback: + # 用哪些信息来生成feedback + # 1. sub_tasks[0] 任务的描述 + # 2. hypothesis 任务的假设 + # 3. 相对sota_exp的改动 + # 4. result 任务的结果 + # 5. sota_exp.result 之前最好的结果 + sota_exp = trace.sota_experiment() hypothesis = exp.hypothesis current_results = exp.result if hypothesis.component == "DataLoadSpec": @@ -29,7 +36,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> HypothesisFeed modified_file_name = "main.py" modified_code = exp.experiment_workspace.file_dict[modified_file_name] - sota_hypothesis, sota_exp = trace.get_sota_hypothesis_and_experiment() + sota_exp = trace.sota_experiment() if sota_exp: sota_codes = { From 3b2f15cb29f1da9bbf7ed0b1943fd43ab266b09c Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 3 Jan 2025 03:50:18 +0000 Subject: [PATCH 174/304] add feature code when writing model task --- rdagent/components/coder/data_science/model/__init__.py | 3 ++- rdagent/components/coder/data_science/model/prompts.yaml | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 3b217a49e..e31bff1d4 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -50,7 +50,8 @@ def implement_one_task( ) user_prompt = T(".prompts:model_coder.user").r( model_spec=workspace.file_dict["spec/model.md"], - latest_code=workspace.file_dict.get(f"{target_task.name}.py", ""), + feature_code=workspace.file_dict["feature.py"], + latest_code=workspace.file_dict.get(f"{target_task.name}.py", None), ) model_code = json.loads( diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index cc14a2739..1f0739280 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -51,9 +51,11 @@ model_coder: ---------Model Specification--------- {{ model_spec }} + ---------Feature Engineering Code--------- + {{ feature_code }} {% if latest_code %} - ---------Former Specification--------- + ---------Former Code--------- Former Code: {{ latest_code }} You should follow the former code to improve it. 
{% endif %} From 92408209b22f3b1013c18396d86591e412bfcbc2 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 3 Jan 2025 06:58:22 +0000 Subject: [PATCH 175/304] small problem --- rdagent/log/ui/llm_st.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 354197557..f77b1acf4 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -111,7 +111,7 @@ def extract_evoid(tag): if "debug_exp_gen" in tag: with st.expander( - f"Exp in:violet[**{obj.experiment_workspace.workspace_path}**]", expanded=expand_all, icon="🧩" + f"Exp in :violet[**{obj.experiment_workspace.workspace_path}**]", expanded=expand_all, icon="🧩" ): st.write(obj) elif "debug_tpl" in tag: From 82d0635d8ae83bbe907fd6f40c465f6ee5a62e2e Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 3 Jan 2025 07:09:18 +0000 Subject: [PATCH 176/304] copying during sampling --- rdagent/scenarios/data_science/debug/data.py | 5 +---- rdagent/scenarios/kaggle/kaggle_crawler.py | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 90ad40803..c3d1132d1 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -138,10 +138,7 @@ def create_debug_data( sampled_file_path.parent.mkdir(parents=True, exist_ok=True) if file_path.suffix not in included_extensions: - if platform.system() == "Linux": - os.symlink(file_path, sampled_file_path) - if platform.system() == "Windows": - os.link(file_path, sampled_file_path) + shutil.copy(file_path, sampled_file_path) continue # Initialize the generic data handler diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 0ee16362a..64767dccf 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -163,8 +163,9 @@ def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPL for sub_zip_file in Path(unzip_path).rglob("*.zip"): unzip_data(sub_zip_file, unzip_target_path=unzip_path) - # sample data - create_debug_data(competition, dataset_path=local_path) + # sample data + if not Path(f"{local_path}/sample/{competition}").exists(): + create_debug_data(competition, dataset_path=local_path) def unzip_data(unzip_file_path: str, unzip_target_path: str) -> None: From 19e9b4c3faef0b756baf5fa9648941d6dd5ecbed Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 3 Jan 2025 09:07:13 +0000 Subject: [PATCH 177/304] update --- .../coder/CoSTEER/knowledge_management.py | 2 +- .../data_science/raw_data_loader/__init__.py | 2 + .../data_science/raw_data_loader/eval.py | 2 +- .../data_science/raw_data_loader/prompts.yaml | 7 ++ rdagent/scenarios/data_science/debug/data.py | 9 +-- rdagent/scenarios/data_science/scen/scen.py | 66 +++++++++++++++++++ rdagent/scenarios/kaggle/kaggle_crawler.py | 17 +++-- rdagent/utils/agent/tpl.py | 5 +- 8 files changed, 94 insertions(+), 16 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/CoSTEER/knowledge_management.py index d24bf53a9..5f3bff064 100644 --- a/rdagent/components/coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/CoSTEER/knowledge_management.py @@ -718,7 +718,7 @@ def __init__(self, init_component_list=None, path: str | Path = None) -> None: Load knowledge, offer brief information of knowledge and common handle interfaces """ self.graph: 
UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl") - logger.info(f"Knowledge Graph loaded, size={self.graph.size()}") + logger.info(f"CoSTEER Knowledge Graph loaded, size={self.graph.size()}") if init_component_list: for component in init_component_list: diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index d11fbc691..79eca029f 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -54,6 +54,7 @@ def implement_one_task( # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. competition_info = self.scen.get_scenario_all_desc() + data_folder_info = self.scen.get_data_folder_description() data_loader_task_info = target_task.get_task_information() queried_similar_successful_knowledge = ( @@ -106,6 +107,7 @@ def implement_one_task( user_prompt = T(".prompts:data_loader_coder.user").r( competition_info=competition_info, data_loader_spec=data_loader_spec, + folder_spec=data_folder_info, latest_code=workspace.file_dict.get("load_data.py"), ) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index b7ae94f98..75223bfb5 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -51,7 +51,7 @@ def evaluate( ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) - # TODO: do we need to clean the generated tempory content? + # TODO: do we need to clean the generated temporary content? fname = "data_loader_test.py" test_code = (DIRNAME / "eval_tests" / "data_loader_test.py").read_text() implementation.inject_files(**{fname: test_code}) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index ccb682201..418386a4c 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -18,7 +18,9 @@ spec: -----------Competition Information----------- {{ competition_info }} + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} -----------Here is the relevant information for this task----------- + {% endif %} {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== @@ -278,7 +280,9 @@ data_loader_coder: "code": "The Python code as a string." 
} + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} -----------Here is the relevant information for this task----------- + {% endif %} {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== @@ -305,6 +309,9 @@ data_loader_coder: ---------Data Loader Specification--------- {{ data_loader_spec }} + ---------Data Folder Description---------(All path are relative to the data folder) + {{ folder_spec }} + You should strictly follow the function interface specifications provided by the specification to implement the function. diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 90ad40803..b947dbf17 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -138,10 +138,11 @@ def create_debug_data( sampled_file_path.parent.mkdir(parents=True, exist_ok=True) if file_path.suffix not in included_extensions: - if platform.system() == "Linux": - os.symlink(file_path, sampled_file_path) - if platform.system() == "Windows": - os.link(file_path, sampled_file_path) + shutil.copy(file_path.absolute(), sampled_file_path.absolute()) + # if platform.system() == "Linux": + # os.symlink(file_path.absolute(), sampled_file_path.absolute()) + # if platform.system() == "Windows": + # os.link(file_path.absolute(), sampled_file_path.absolute()) continue # Initialize the generic data handler diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 680df65bd..10a0fa626 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -1,6 +1,9 @@ import json +import os from pathlib import Path +import pandas as pd + from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.scenario import Scenario from rdagent.log import rdagent_logger as logger @@ -8,6 +11,66 @@ from rdagent.utils.agent.tpl import T +def read_csv_head(file_path, indent, lines=5): + try: + df = pd.read_csv(file_path, nrows=lines) + df_string_lines = df.to_string(index=False).split("\n") + for i in range(len(df_string_lines)): + df_string_lines[i] = " " * (indent) + df_string_lines[i] + return "\n".join(df_string_lines) + except Exception as e: + return f"Error reading CSV: {e}" + + +def describe_data_folder(folder_path, indent=0): + result = [] + files_count = {} + files_details = {} + + for root, dirs, files in os.walk(folder_path): + # Process files + for file in files: + file_path = os.path.join(root, file) + file_type = os.path.splitext(file)[1][1:] + file_size = os.path.getsize(file_path) + if file_type not in files_count: + files_count[file_type] = 0 + files_details[file_type] = [] + files_count[file_type] += 1 + if len(files_details[file_type]) < 3: + files_details[file_type].append((file, file_size, file_path)) + + # Process directories + for d in dirs: + result.append(" " * indent + f"- Folder: {d}") + result.append(describe_data_folder(os.path.join(root, d), indent + 2)) + + # Ensure we only process the current directory and not subdirectories in this loop + break + + # Print the folder and its contents + for file_type, count in files_count.items(): + if count > 3: + result.append(" " * indent + f"{count} {file_type}s:") + for file, size, path in files_details[file_type]: + 
result.append(" " * (indent + 2) + f"- {file} ({size} bytes)") + result.append(" " * (indent + 2) + "...") + else: + for file, size, path in files_details[file_type]: + if file_type == "zip": + continue + result.append(" " * indent + f"- {file} ({size} bytes)") + if file_type == "csv": + result.append(f" " * (indent + 2) + f"- Head of {file}:") + result.append(read_csv_head(path, indent + 2)) + if file_type == "md": + result.append(f" " * (indent + 2) + f"- Content of {file}:") + with open(path, "r") as f: + result.append(f.read()) + + return "\n".join(result) + + class DataScienceScen(Scenario): """Data Science Scenario""" @@ -88,3 +151,6 @@ def get_scenario_all_desc(self) -> str: submission_specifications=self.submission_specifications, metric_direction=self.metric_direction, ) + + def get_data_folder_description(self) -> str: + return describe_data_folder(Path(DS_RD_SETTING.local_data_path) / self.competition) diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 0ee16362a..1e5d0d7b8 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -108,21 +108,20 @@ def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPL if settings.if_using_mle_data: zipfile_path = f"{local_path}/zip_files" zip_competition_path = Path(zipfile_path) / competition - if ( - not zip_competition_path.exists() - or not (Path(local_path) / competition).exists() - or list((Path(local_path) / competition).iterdir()) == [] - ): - mleb_env = MLEBDockerEnv() - mleb_env.prepare() - (Path(local_path) / "zip_files").mkdir(parents=True, exist_ok=True) - (Path(local_path) / competition).mkdir(parents=True, exist_ok=True) + mleb_env = MLEBDockerEnv() + mleb_env.prepare() + if not zip_competition_path.exists(): + (Path(zipfile_path)).mkdir(parents=True, exist_ok=True) mleb_env.run( f"mlebench prepare -c {competition} --data-dir ./zip_files", local_path=local_path, running_extra_volume={str(Path("~/.kaggle").expanduser().absolute()): "/root/.kaggle"}, ) + + if not (Path(local_path) / competition).exists() or list((Path(local_path) / competition).iterdir()) == []: + (Path(local_path) / competition).mkdir(parents=True, exist_ok=True) + mleb_env.run( f"/bin/sh -c 'cp -r ./zip_files/{competition}/prepared/public/* ./{competition}'", local_path=local_path ) diff --git a/rdagent/utils/agent/tpl.py b/rdagent/utils/agent/tpl.py index 2323b1106..9b0af4376 100644 --- a/rdagent/utils/agent/tpl.py +++ b/rdagent/utils/agent/tpl.py @@ -69,7 +69,10 @@ def r(self, **context: Any): """ Render the template with the given context. 
""" - rendered = Environment(undefined=StrictUndefined).from_string(self.template).render(**context) + rendered = Environment(undefined=StrictUndefined).from_string(self.template).render(**context).strip("\n") + while "\n\n\n" in rendered: + rendered = rendered.replace("\n\n\n", "\n\n") + rendered = "\n".join(line for line in rendered.splitlines() if line.strip()) logger.log_object( obj={ "uri": self.uri, From 1fb000f34665b896b58d14406f72a5d4abcd6726 Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 3 Jan 2025 09:08:17 +0000 Subject: [PATCH 178/304] refactor: Simplify code handling and improve workspace management --- rdagent/components/coder/CoSTEER/task.py | 1 + .../coder/data_science/model/__init__.py | 27 +++++---- .../coder/data_science/model/prompts.yaml | 11 ++++ rdagent/core/experiment.py | 56 +++++++++++++------ rdagent/oai/llm_conf.py | 4 +- rdagent/scenarios/data_science/debug/data.py | 2 +- .../data_science/proposal/exp_gen.py | 40 ++++++------- .../data_science/proposal/prompts.yaml | 7 +++ rdagent/utils/agent/ret.py | 13 +++++ rdagent/utils/agent/tpl.yaml | 11 ++++ rdagent/utils/workflow.py | 2 +- 11 files changed, 121 insertions(+), 53 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/task.py b/rdagent/components/coder/CoSTEER/task.py index 92e90ed0e..5bc898994 100644 --- a/rdagent/components/coder/CoSTEER/task.py +++ b/rdagent/components/coder/CoSTEER/task.py @@ -5,4 +5,5 @@ class CoSTEERTask(Task): def __init__(self, base_code: str = None, *args, **kwargs) -> None: super().__init__(*args, **kwargs) # TODO: we may upgrade the base_code into a workspace-like thing to know previous. + # NOTE: (xiao) think we don't need the base_code anymore. The information should be retrieved from the workspace. self.base_code = base_code diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index e31bff1d4..657308f00 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -19,6 +19,7 @@ from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.ret import BatchEditOut from rdagent.utils.agent.tpl import T @@ -47,22 +48,26 @@ def implement_one_task( system_prompt = T(".prompts:model_coder.system").r( queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], + out_spec=BatchEditOut.get_spec(), ) - user_prompt = T(".prompts:model_coder.user").r( + # user_prompt = T(".prompts:model_coder.user").r( + # model_spec=workspace.file_dict["spec/model.md"], + # feature_code=workspace.file_dict["feature.py"], + # latest_code=workspace.file_dict.get(f"{target_task.name}.py", None), + # ) + # We want to use a simpler way to + user_prompt = T(".prompts:model_coder.user_general").r( model_spec=workspace.file_dict["spec/model.md"], - feature_code=workspace.file_dict["feature.py"], - latest_code=workspace.file_dict.get(f"{target_task.name}.py", None), + worksapce_code=workspace.all_codes, # TODO: If we have high failure rate here, we should clean this step with less information. 
) - model_code = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - )["code"] + batch_edit = BatchEditOut.extract_output(APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=BatchEditOut.json_mode, + )) - return { - f"{target_task.name}.py": model_code, - } + return batch_edit def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): """ diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 1f0739280..5c7180287 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -21,11 +21,15 @@ model_coder: If previous failed attempts and their feedback are available, learn from them. Understand what went wrong and avoid repeating similar mistakes in your new implementation. The failure knowledge may include the code unrelated to the model, such as data loading, preprocessing, or feature engineering. Focus only on the model implementation part. + {% if out_spec %} + {{out_spec}} + {% else %} Formatting Your Response: Return only the code in a JSON format as shown below. Do not include any explanations or extra text. Example: { "code": "Your corrected or newly implemented Python code as a single string" } + {% endif %} -----------Here is the relevant information for this task----------- {% if queried_similar_successful_knowledge|length != 0 %} @@ -60,6 +64,13 @@ model_coder: You should follow the former code to improve it. {% endif %} + user_general: |- + --------- Workspace code--------- + {{worksapce_code}} + ---------Model Specification--------- + When you are implementing the code, you should follow the spec + {{ model_spec }} + model_eval: system: |- diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 8eba11c2b..968e37e44 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -74,6 +74,14 @@ def copy(self) -> Workspace: error_message = "copy method is not implemented." raise NotImplementedError(error_message) + @property + @abstractmethod + def all_codes(self) -> str: + """ + Get all the code files in the workspace as a single string. + """ + pass + ASpecificWS = TypeVar("ASpecificWS", bound=Workspace) @@ -115,21 +123,30 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: ) # The code injected into the folder, store them in the variable to reproduce the former result self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex - @property - def all_codes(self) -> str: + @staticmethod + def _format_code_dict(code_dict: dict[str, str]) -> str: + """ + Helper function to format the code dictionary into a string. 
+ """ code_string = "" - for file_name, code in self.file_dict.items(): - if file_name.endswith(".py") and "test" not in file_name: - code_string += f"File: {file_name}\n{code}\n" + for file_name, code in code_dict.items(): + code_string += f"File Path: {file_name}\n```\n{code}\n```" return code_string - def get_codes(self, pattern: str) -> str: - code_string = "" - for file_name, code in self.file_dict.items(): - if re.search(pattern, file_name) and file_name.endswith(".py") and "test" not in file_name: - code_string += f"File: {file_name}\n{code}\n" - return code_string + @property + def all_codes(self) -> str: + """ + Get all the code files in the workspace as a single string, excluding test files. + """ + filtered_dict = {k: v for k, v in self.file_dict.items() if k.endswith(".py") and "test" not in k} + return self._format_code_dict(filtered_dict) + def get_codes(self, pattern: str) -> str: + """ + Get code files matching a specific pattern as a single string, excluding test files. + """ + filtered_dict = {k: v for k, v in self.file_dict.items() if re.search(pattern, k) and k.endswith(".py") and "test" not in k} + return self._format_code_dict(filtered_dict) def prepare(self) -> None: """ Prepare the workspace except the injected code @@ -153,19 +170,26 @@ def link_all_files_in_folder_to_workspace(data_path: Path, workspace_path: Path) if platform.system() == "Windows": os.link(data_file_path, workspace_data_file_path) + DEL_KEY = "__DEL__" def inject_files(self, **files: str) -> None: """ Inject the code into the folder. { - : + : , // indicate writing into (create new file or replace existing file) + : "__DEL__" // indicate removing file name2. When we want to replace a file to a new one, we usually use this } """ self.prepare() for k, v in files.items(): - self.file_dict[k] = v - target_file_path = self.workspace_path / k - target_file_path.parent.mkdir(parents=True, exist_ok=True) - target_file_path.write_text(v) + target_file_path = self.workspace_path / k # Define target_file_path before using it + if v == self.DEL_KEY: # Use self.DEL_KEY to access the class variable + if target_file_path.exists(): + target_file_path.unlink() # Unlink the file if it exists + self.file_dict.pop(k, None) # Safely remove the key from file_dict + else: + self.file_dict[k] = v + target_file_path.parent.mkdir(parents=True, exist_ok=True) + target_file_path.write_text(v) def get_files(self) -> list[Path]: """ diff --git a/rdagent/oai/llm_conf.py b/rdagent/oai/llm_conf.py index 75acec4c7..5f7777ade 100644 --- a/rdagent/oai/llm_conf.py +++ b/rdagent/oai/llm_conf.py @@ -38,8 +38,8 @@ class LLMSettings(ExtendedBaseSettings): # Chat configs openai_api_key: str = "" # TODO: simplify the key design. 
- chat_openai_api_key: str = "" - chat_openai_base_url: str = "" + chat_openai_api_key: str | None = None + chat_openai_base_url: str | None = None # chat_azure_api_base: str = "" chat_azure_api_version: str = "" chat_model: str = "gpt-4-turbo" diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index c3d1132d1..e9074757e 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -118,7 +118,7 @@ def create_debug_data( dr_cls_kwargs = {} if dataset_path is None: - dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path + dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path # FIXME: don't hardcode this KAGGLE_IMPLEMENT_SETTING if sample_path is None: # Create a sample folder under the dataset folder, which should be available in docker container diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index b3dbc8e34..09153281e 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -7,7 +7,7 @@ from rdagent.components.coder.data_science.model.exp import ModelTask from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask from rdagent.components.coder.data_science.workflow.exp import WorkflowTask -from rdagent.core.experiment import Experiment +from rdagent.core.experiment import Experiment, Workspace from rdagent.core.knowledge_base import KnowledgeBase from rdagent.core.proposal import ( ExperimentFeedback, @@ -74,6 +74,7 @@ def llm_task_gen( targets: str, scenario_desc: str, task_output_format: str, + workspace_code: str | None = None, hypothesis: Hypothesis | None = None, hypothesis_and_feedback: str | None = None, ) -> dict: @@ -86,6 +87,7 @@ def llm_task_gen( user_prompt = T(".prompts:task_gen.user").r( targets=targets, hypothesis=hypothesis, + workspace_code=workspace_code, hypothesis_and_feedback=hypothesis_and_feedback, ) @@ -180,6 +182,8 @@ def gen(self, trace: DSTrace) -> DSExperiment: exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp else: # propose new component by LLM + assert sota_exp is not None, "SOTA experiment is not provided." 
+ # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) # Step 1: Generate component @@ -201,8 +205,8 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) component = resp_dict_component.get("component", "Component not provided") + # Step 2: Generate the rest of the hypothesis if component != "Model": - # Step 2: Generate the rest of the hypothesis hypothesis_sys_prompt = T(".prompts:hypothesis_gen.system").r( targets="data science project", scenario=scenario_desc, @@ -235,27 +239,19 @@ def gen(self, trace: DSTrace) -> DSExperiment: if re.match(r"^model_.+\.py", fname): model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{sota_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) - + model_num = len(model_infos) models_info_str = ("-"*20).join(model_infos) - if model_num >= 3: - hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( - targets="data science project", - scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), - hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - model_info=models_info_str, - model_enough=True, - ) - else: - hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( - targets="data science project", - scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), - hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - model_info=models_info_str, - model_enough=False, - ) + + hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( + targets="data science project", + scenario=scenario_desc, + hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), + hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), + model_info=models_info_str, + model_enough=model_num >= 3, # NOTE: Assumption: limited model number is usually enough for good results. + ) + hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( targets="data science project", hypothesis_and_feedback=hypothesis_and_feedback, @@ -303,7 +299,6 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) - ft = FeatureTask( name="Feature Engineering", description=resp_dict.get("description", "Feature description not provided"), @@ -316,6 +311,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( scenario_desc=scenario_desc, hypothesis=hypothesis, + workspace_code=sota_exp.experiment_workspace.all_codes, task_output_format=T(".prompts:output_format.model").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 23f3b90e0..df32dabb5 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -66,10 +66,16 @@ task_gen: 3. Former proposed {{targets}} on similar hypothesis. 4. Some additional information to help you generate new {{targets}}. {% endif %} + Please generate the output following the format below: {{ task_output_format }} user: |- + {% if workspace_code %} + Here is a list of all the filenames and their corresponding content in the workspace: + {{workspace_code}} + {% endif %} + {% if hypothesis is not none %} The user has made several hypothesis on this scenario and did several evaluation on them. 
The target hypothesis you are targeting to generate {{targets}} for is as follows: @@ -169,6 +175,7 @@ output_format: "hyperparameter_name_2": "value of hyperparameter 2", "hyperparameter_name_3": "value of hyperparameter 3" }, + "edit_strategy": } Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. ensemble: |- diff --git a/rdagent/utils/agent/ret.py b/rdagent/utils/agent/ret.py index 3c68354ab..f84f1a8db 100644 --- a/rdagent/utils/agent/ret.py +++ b/rdagent/utils/agent/ret.py @@ -5,6 +5,7 @@ """ import re +import json from abc import abstractclassmethod from typing import Any @@ -12,6 +13,7 @@ class AgentOut: + json_mode: bool = False # To get the output, is json_mode required. @abstractclassmethod def get_spec(cls, **context: Any) -> str: raise NotImplementedError(f"Please implement the `get_spec` method") @@ -32,3 +34,14 @@ def extract_output(cls, resp: str): if match: code = match.group(1) return code + +class BatchEditOut(AgentOut): + json_mode: bool = True + + @classmethod + def get_spec(cls): + return T(".tpl:BatchEditOut").r() + + @classmethod + def extract_output(cls, resp: str): + return json.loads(resp) diff --git a/rdagent/utils/agent/tpl.yaml b/rdagent/utils/agent/tpl.yaml index 2b41013a0..afd390f95 100644 --- a/rdagent/utils/agent/tpl.yaml +++ b/rdagent/utils/agent/tpl.yaml @@ -4,3 +4,14 @@ PythonAgentOut: |- ``` + +BatchEditOut: |- + You should return a edition that applies to multiple files in a workspace in JSON. + + For example: + + Inject the code into the folder. + { + : "", // indicate writing into (create new file or replace existing file) + : "__DEL__" // indicate removing file name2. When we want to replace a file to a new one, we usually use this + } diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 4f4a7ace4..28cf1375f 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -118,7 +118,7 @@ def run(self, step_n: int | None = None): self.loop_prev_out[name] = func(self.loop_prev_out) # TODO: Fix the error logger.exception(f"Skip loop {li} due to {e}") except self.skip_loop_error as e: - # FIXME: This does not support previous instance + # FIXME: This does not support previous demo (due to their last step is not for recording) logger.warning(f"Skip loop {li} due to {e}") # NOTE: strong assumption! The last step is responsible for recording information self.step_idx = len(self.steps) - 1 # directly jump to the last step. 
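Note on the mechanism above: the batch-edit contract introduced in this patch (a JSON object that maps each file name either to its new content or to the "__DEL__" sentinel) behaves roughly like the sketch below. The helper name `apply_batch_edit` and the example file names are assumptions made purely for illustration; in the patch itself the mapping is applied through `FBWorkspace.inject_files`.

```python
from pathlib import Path

DEL_KEY = "__DEL__"  # same sentinel as FBWorkspace.DEL_KEY above


def apply_batch_edit(workspace_dir: str, batch_edit: dict[str, str]) -> None:
    """Write or replace each file in the mapping, or delete it when the value
    is the deletion sentinel (hypothetical standalone helper)."""
    root = Path(workspace_dir)
    for name, content in batch_edit.items():
        target = root / name
        if content == DEL_KEY:
            target.unlink(missing_ok=True)  # drop the obsolete file if it exists
        else:
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(content)


# Renaming model01.py to model_lgbm.py is expressed as one write plus one delete:
apply_batch_edit("ws", {"model_lgbm.py": "# new model code\n", "model01.py": DEL_KEY})
```

This mirrors the note in the template that replacing a file with a new one is expressed as writing the new file and marking the old name with "__DEL__".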
From 615d3b58396a22db359d3235b8c5cad3a4c69245 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 3 Jan 2025 10:02:49 +0000 Subject: [PATCH 179/304] model part output fix --- .../components/coder/data_science/model/__init__.py | 1 + rdagent/components/coder/data_science/model/eval.py | 2 +- .../components/coder/data_science/model/prompts.yaml | 9 ++++++--- .../components/coder/data_science/workflow/eval.py | 11 ++++++----- rdagent/utils/agent/tpl.yaml | 3 ++- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 657308f00..d912f030c 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -49,6 +49,7 @@ def implement_one_task( queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], out_spec=BatchEditOut.get_spec(), + task_info=model_information_str, ) # user_prompt = T(".prompts:model_coder.user").r( # model_spec=workspace.file_dict["spec/model.md"], diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 35e093337..c51dcf0f8 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -69,7 +69,7 @@ def evaluate( ) user_prompt = T(".prompts:model_eval.user").r( stdout=stdout, - code=implementation.file_dict["model01.py"], + code=implementation.file_dict[f"{target_task.name}.py"], ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return ModelSingleFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 5c7180287..c59cc97b1 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -20,9 +20,13 @@ model_coder: Error Learning: If previous failed attempts and their feedback are available, learn from them. Understand what went wrong and avoid repeating similar mistakes in your new implementation. The failure knowledge may include the code unrelated to the model, such as data loading, preprocessing, or feature engineering. Focus only on the model implementation part. + + ---------Model Task Description--------- + {{ task_info }} {% if out_spec %} {{out_spec}} + The file name should be the model name described in the model task {% else %} Formatting Your Response: Return only the code in a JSON format as shown below. Do not include any explanations or extra text. Example: @@ -30,7 +34,6 @@ model_coder: "code": "Your corrected or newly implemented Python code as a single string" } {% endif %} - -----------Here is the relevant information for this task----------- {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- @@ -62,11 +65,11 @@ model_coder: ---------Former Code--------- Former Code: {{ latest_code }} You should follow the former code to improve it. 
- {% endif %} + {% endif %} user_general: |- --------- Workspace code--------- - {{worksapce_code}} + {{ worksapce_code }} ---------Model Specification--------- When you are implementing the code, you should follow the spec {{ model_spec }} diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 7ba40a708..9567f19fb 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -63,11 +63,12 @@ def evaluate( score_fp = implementation.workspace_path / "scores.csv" if not score_fp.exists(): stdout += "Metrics file (scores.csv) is not generated." - score_df = pd.read_csv(score_fp, index_col=0) - model_set_in_scores = set(score_df.index) - model_set_in_folder = set(f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f)) - if model_set_in_scores != model_set_in_folder: - stdout += "The models used by ensemble are not consistent with the models in the workspace." + else: + score_df = pd.read_csv(score_fp, index_col=0) + model_set_in_scores = set(score_df.index) + model_set_in_folder = set(f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f)) + if model_set_in_scores != model_set_in_folder: + stdout += "The models used by ensemble are not consistent with the models in the workspace." # Check submission file submission_fp = implementation.workspace_path / "submission.csv" diff --git a/rdagent/utils/agent/tpl.yaml b/rdagent/utils/agent/tpl.yaml index afd390f95..d69425e75 100644 --- a/rdagent/utils/agent/tpl.yaml +++ b/rdagent/utils/agent/tpl.yaml @@ -7,7 +7,8 @@ PythonAgentOut: |- BatchEditOut: |- You should return a edition that applies to multiple files in a workspace in JSON. - + Except for the model file, other files should not be renamed. + Files that do not need to be modified do not need to be included in the returned dict. For example: Inject the code into the folder. 
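For reference, the guarded check added to `workflow/eval.py` in this patch amounts to the standalone logic below. It is a rough sketch only: the function name `check_scores_consistency` is hypothetical, and it assumes one `model_*.py` file per trained model and one matching row per model in `scores.csv`, as the surrounding code does.

```python
import re
from pathlib import Path

import pandas as pd


def check_scores_consistency(workspace: Path, file_dict: dict[str, str]) -> str:
    """Parse scores.csv only when it exists, then compare its index with the
    model_*.py files registered in the workspace (illustrative sketch)."""
    feedback = ""
    score_fp = workspace / "scores.csv"
    if not score_fp.exists():
        feedback += "Metrics file (scores.csv) is not generated."
    else:
        score_df = pd.read_csv(score_fp, index_col=0)
        models_in_scores = set(score_df.index)
        models_in_folder = {f[:-3] for f in file_dict if re.match(r"^model_.+\.py$", f)}
        if models_in_scores != models_in_folder:
            feedback += "The models used by ensemble are not consistent with the models in the workspace."
    return feedback
```

Wrapping the parse in the `else` branch is the actual fix here: before this patch, a missing `scores.csv` made `pd.read_csv` raise instead of producing readable feedback.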
From 86180bfd8e87f60489942e744230baa8af351dd2 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 3 Jan 2025 10:16:47 +0000 Subject: [PATCH 180/304] print model's execution time --- .../model/eval_tests/test_model.py | 81 +++++++------------ rdagent/core/utils.py | 6 +- 2 files changed, 31 insertions(+), 56 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eval_tests/test_model.py b/rdagent/components/coder/data_science/model/eval_tests/test_model.py index b9db686c2..cf0f97935 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/test_model.py +++ b/rdagent/components/coder/data_science/model/eval_tests/test_model.py @@ -1,31 +1,28 @@ -""" -adapt for cv models -""" - -import os -import pickle -import traceback - -import numpy as np -from feature import feat_eng +import time +from sklearn.model_selection import train_test_split from load_data import load_data +from feature import feat_eng from model01 import model_workflow -from sklearn.model_selection import train_test_split -X, y, test_X, test_ids = load_data() -X, y, test_X = feat_eng(X, y, test_X) -train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) +def log_execution_results(start_time, val_pred, test_pred, hypers, execution_label): + """Log the results of a single model execution.""" + feedback_str = f"{execution_label} successful.\n" + feedback_str += f"Validation predictions shape: {val_pred.shape if val_pred is not None else 'None'}\n" + feedback_str += f"Test predictions shape: {test_pred.shape if test_pred is not None else 'None'}\n" + feedback_str += f"Hyperparameters: {hypers if hypers is not None else 'None'}\n" + feedback_str += f"Execution time: {time.time() - start_time:.2f} seconds.\n" + print(feedback_str) -"""train_X = np.random.rand(8, 64, 64, 3) -train_y = np.random.rand(8, 1) -val_X = np.random.rand(8, 64, 64, 3) -val_y = np.random.rand(8, 1) -test_X = np.random.rand(8, 64, 64, 3)""" +# Load and preprocess data +X, y, test_X, test_ids = load_data() +X, y, test_X = feat_eng(X, y, test_X) +train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) +# First execution print("The first execution begins.\n") -# Call model_workflow +start_time = time.time() val_pred, test_pred, hypers = model_workflow( X=train_X, y=train_y, @@ -33,41 +30,19 @@ val_y=val_y, test_X=None, ) -# val_pred = np.random.rand(8, 1) -# test_pred = np.random.rand(8, 1) - -execution_feedback_str = "The first Execution successful.\n" -if val_pred is not None: - execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" -else: - execution_feedback_str += "Validation predictions are None.\n" -if test_pred is not None: - execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" -else: - execution_feedback_str += "Test predictions are None.\n" -if hypers is not None: - execution_feedback_str += f"Hyperparameters:{hypers}\n" -else: - execution_feedback_str += "Hyperparameters are None.\n" -print(execution_feedback_str) +log_execution_results(start_time, val_pred, test_pred, hypers, "The first execution") +# Second execution print("The second execution begins.\n") -val_pred, test_pred, finalhypers = model_workflow( - X=train_X, y=train_y, val_X=None, val_y=None, test_X=test_X, hyper_params=hypers +start_time = time.time() +val_pred, test_pred, final_hypers = model_workflow( + X=train_X, + y=train_y, + val_X=None, + val_y=None, + test_X=test_X, + hyper_params=hypers, ) -execution_feedback_str = "The second Execution successful.\n" -if 
val_pred is not None: - execution_feedback_str += f"Validation predictions shape: {val_pred.shape}\n" -else: - execution_feedback_str += "Validation predictions are None.\n" -if test_pred is not None: - execution_feedback_str += f"Test predictions shape: {test_pred.shape}\n" -else: - execution_feedback_str += "Test predictions are None.\n" -if hypers is not None: - execution_feedback_str += f"Hyperparameters:{finalhypers}\n" -else: - execution_feedback_str += "Hyperparameters are None.\n" -print(execution_feedback_str) +log_execution_results(start_time, val_pred, test_pred, final_hypers, "The second execution") print("Model code test passed successfully.") diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py index be0a15cf8..45fff015d 100644 --- a/rdagent/core/utils.py +++ b/rdagent/core/utils.py @@ -49,7 +49,7 @@ def __reduce__(self) -> NoReturn: NOTE: When loading an object from a pickle, the __new__ method does not receive the `kwargs` it was initialized with. This makes it difficult to retrieve the correct singleton object. - Therefore, we have made it unpickable. + Therefore, we have made it unpicklable. """ msg = f"Instances of {self.__class__.__name__} cannot be pickled" raise pickle.PicklingError(msg) @@ -69,7 +69,7 @@ def similarity(text1: str, text2: str) -> int: text2 = text2 if isinstance(text2, str) else "" # Maybe we can use other similarity algorithm such as tfidf - return cast(int, fuzz.ratio(text1, text2)) # mypy does not reguard it as int + return cast(int, fuzz.ratio(text1, text2)) # mypy does not regard it as int def import_class(class_path: str) -> Any: @@ -127,7 +127,7 @@ def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) -> It will not call multiprocessing if `n=1` NOTE: - We coooperate with chat_cache_seed feature + We cooperate with chat_cache_seed feature We ensure get the same seed trace even we have multiple number of seed Parameters From cfda303b8d727e8dcb918bfc12093b9fb64bebce Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 6 Jan 2025 03:01:15 +0000 Subject: [PATCH 181/304] bug fix --- rdagent/components/coder/data_science/model/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 35e093337..c51dcf0f8 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -69,7 +69,7 @@ def evaluate( ) user_prompt = T(".prompts:model_eval.user").r( stdout=stdout, - code=implementation.file_dict["model01.py"], + code=implementation.file_dict[f"{target_task.name}.py"], ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return ModelSingleFeedback(**json.loads(resp)) From 2ddcb241c7b1839a9e71f99e27ec95d77d9c0db3 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 6 Jan 2025 03:57:48 +0000 Subject: [PATCH 182/304] ensemble test fix --- .../coder/data_science/ensemble/eval.py | 10 +++++- .../ensemble/eval_tests/ensemble_test.py | 32 ++++++++++++------- .../coder/data_science/model/prompts.yaml | 4 +-- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index cef32bf42..c7ed9b88f 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -1,6 +1,8 @@ import json from dataclasses import dataclass 
from pathlib import Path +from jinja2 import Environment, StrictUndefined +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, @@ -41,10 +43,16 @@ def evaluate( final_decision=False, ) - de = DockerEnv(conf=DSDockerConf()) + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} + de = DockerEnv(conf=ds_docker_conf) fname = "ensemble_test.py" test_code = (DIRNAME / "eval_tests" / "ensemble_test.py").read_text() + test_code = Environment(undefined=StrictUndefined).from_string(test_code).render( + model_names=[fn[:-3] for fn in implementation.file_dict.keys() if fn.startswith("model_")] + ) + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index 82385229a..8dde58742 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -7,25 +7,35 @@ """ import numpy as np -from ensemble import ens_and_decision from pathlib import Path +from sklearn.model_selection import train_test_split +from load_data import load_data +from feature import feat_eng +from ensemble import ens_and_decision -# Create test data -n_models = 3 -n_samples = 100 +X, y, test_X, test_ids = load_data() +X, y, test_X = feat_eng(X, y, test_X) +train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) -# TODO: use real data. -# Create synthetic predictions -test_preds_dict = {f"model_{i}": np.random.rand(n_samples, 1) for i in range(n_models)} -val_preds_dict = {f"model_{i}": np.random.rand(n_samples, 1) for i in range(n_models)} -val_label = np.random.randint(0, 2, (n_samples, 1)) +test_preds_dict = {} +val_preds_dict = {} +{% for mn in model_names %} +from {{mn}} import model_workflow as {{mn}}_workflow +test_preds_dict["{{mn}}"], val_preds_dict["{{mn}}"], _ = {{mn}}_workflow( + X=train_X, + y=train_y, + val_X=val_X, + val_y=val_y, + test_X=test_X +) +{% endfor %} # Run ensemble try: - final_predictions = ens_and_decision(test_preds_dict, val_preds_dict, val_label) + final_predictions = ens_and_decision(test_preds_dict, val_preds_dict, val_y) # Check shape - assert final_predictions.shape == (n_samples, 1), "Wrong output shape" + assert final_predictions.shape == val_y.shape, "Wrong output shape" # check if scores.csv is generated if not Path("scores.csv").exists(): diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 1f0739280..4098f380f 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -33,7 +33,7 @@ model_coder: ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.file_dict["model01.py"] }} + {{ similar_successful_knowledge.implementation.file_dict[similar_successful_knowledge.target_task.name ~ '.py'] }} {% endfor %} {% endif %} @@ -41,7 +41,7 @@ model_coder: --------------Previous Failed Attempts:-------------- {% for 
former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.file_dict["model01.py"] }} + {{ former_failed_knowledge.implementation.file_dict[former_failed_knowledge.target_task.name ~ '.py'] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} From 28576dfbcd2a8f6049e36d72082a95f0922dd960 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 6 Jan 2025 04:07:10 +0000 Subject: [PATCH 183/304] ens small change --- .../data_science/ensemble/eval_tests/ensemble_test.py | 8 ++++---- .../coder/data_science/raw_data_loader/prompts.yaml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index 8dde58742..ff1b55dfb 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -32,18 +32,18 @@ # Run ensemble try: - final_predictions = ens_and_decision(test_preds_dict, val_preds_dict, val_y) + final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) # Check shape - assert final_predictions.shape == val_y.shape, "Wrong output shape" + assert final_pred.shape == val_y.shape, "Wrong output shape" # check if scores.csv is generated if not Path("scores.csv").exists(): raise Exception("scores.csv is not generated") print("Ensemble test passed successfully.") - print(f"Output shape: {final_predictions.shape}") - print(f"Unique values in predictions: {np.unique(final_predictions)}") + print(f"Output shape: {final_pred.shape}") + print(f"Unique values in predictions: {np.unique(final_pred)}") except Exception as e: print(f"Test failed: {str(e)}") diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 418386a4c..153958e5b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -185,9 +185,9 @@ spec: - Parameters: - `test_preds_dict` (Dict[str, DT]): A dictionary of test predictions from different models. - `val_preds_dict` (Dict[str, DT]): A dictionary of validation predictions from different models. - - `val_label` (DT): A 1D array or series of true labels for the validation data. + - `val_label` (DT): Validation label. - Output: - - `final_predictions` (DT): A 1D array or series containing the final predictions for the test data. + - `final_pred` (DT): Ensemble prediction for the test data. - Docstring Requirements: - Describe the purpose of the function. - Clarify the input parameters and their data types. 
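The `ens_and_decision` interface specified above can be pictured with the sketch below: a plain mean ensemble with a placeholder accuracy metric. The metric, the decision rule, and the exact `scores.csv` layout are competition-specific, so treat this only as an illustration of the expected signature and side effect, not as the implementation the prompts ask the agent to produce.

```python
import numpy as np
import pandas as pd


def ens_and_decision(test_preds_dict, val_preds_dict, val_label):
    """Average the model predictions, record one validation score per model in
    scores.csv, and return the ensemble prediction for the test data."""
    # Placeholder metric: share of rounded validation predictions matching the label.
    scores = {name: float((np.round(pred) == val_label).mean()) for name, pred in val_preds_dict.items()}
    pd.DataFrame({"score": scores}).to_csv("scores.csv")  # one row per model, as the workflow check expects
    final_pred = np.mean(list(test_preds_dict.values()), axis=0)
    return final_pred
```

The generated `ensemble_test.py` above then asserts the shape of `final_pred`, verifies that `scores.csv` was written, and prints the unique prediction values.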
From 271e5f187a64f4881a831377f6310c4057e9d0e5 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 6 Jan 2025 04:29:55 +0000 Subject: [PATCH 184/304] ens_test bug fix --- .../coder/data_science/ensemble/eval_tests/ensemble_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index ff1b55dfb..ab135559a 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -21,7 +21,7 @@ val_preds_dict = {} {% for mn in model_names %} from {{mn}} import model_workflow as {{mn}}_workflow -test_preds_dict["{{mn}}"], val_preds_dict["{{mn}}"], _ = {{mn}}_workflow( +val_preds_dict["{{mn}}"], test_preds_dict["{{mn}}"], _ = {{mn}}_workflow( X=train_X, y=train_y, val_X=val_X, From 82388028f70c2e1fbe090d48c61e5f252d459c20 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 6 Jan 2025 07:10:43 +0000 Subject: [PATCH 185/304] Refine partial expansion logic to display only a few subfolders when their structure is uniform, improving readability in nested directories. --- rdagent/scenarios/data_science/scen/scen.py | 107 +++++++++++++++++--- 1 file changed, 92 insertions(+), 15 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 10a0fa626..aeec8d2f5 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -22,53 +22,130 @@ def read_csv_head(file_path, indent, lines=5): return f"Error reading CSV: {e}" -def describe_data_folder(folder_path, indent=0): +def get_dir_snapshot(folder_path): + """ + [note] + - Returns a set of file extensions within the subfolder (excluding subfolder names) + - Compares only the types of files contained, not specific file names or quantities + """ + exts = set() + try: + with os.scandir(folder_path) as it: + for entry in it: + if entry.is_file(): + file_ext = os.path.splitext(entry.name)[1] + exts.add(file_ext) + except Exception as e: + logger.error(f"Error scanning directory: {e}") + + return frozenset(exts) + + +def describe_data_folder(folder_path, indent=0, max_files=3, partial_expand_subfolders=3): + """ + folder_path : Current directory path + indent : Current indentation + max_files : Maximum number of files of the same type to display + partial_expand_subfolders: When all subfolders have the same internal file types, only expand this many subfolders, the rest are omitted + """ result = [] files_count = {} files_details = {} for root, dirs, files in os.walk(folder_path): - # Process files + dirs.sort() + + if not dirs: + for file in files: + file_path = os.path.join(root, file) + file_type = os.path.splitext(file)[1][1:] + file_size = os.path.getsize(file_path) + + if file_type not in files_count: + files_count[file_type] = 0 + files_details[file_type] = [] + files_count[file_type] += 1 + if len(files_details[file_type]) < max_files: + files_details[file_type].append((file, file_size, file_path)) + break + + # Collect "type snapshots" of subfolders + snapshots = [] + for d in dirs: + subfolder_path = os.path.join(root, d) + snapshot = get_dir_snapshot(subfolder_path) + snapshots.append(snapshot) + + # Determine if all subfolders have the same file type distribution + first_snapshot = snapshots[0] + all_same_structure = all(s == first_snapshot for s in snapshots) 
+ + if all_same_structure: + for i, d in enumerate(dirs): + if i < partial_expand_subfolders: + result.append(" " * indent + f"- Folder: {d}") + subfolder_path = os.path.join(root, d) + result.append( + describe_data_folder( + folder_path=subfolder_path, + indent=indent + 2, + max_files=max_files, + partial_expand_subfolders=partial_expand_subfolders + ) + ) + else: + remaining = len(dirs) - i + result.append(" " * indent + f"... ({remaining} more subfolders)") + break + else: + for d in dirs: + result.append(" " * indent + f"- Folder: {d}") + subfolder_path = os.path.join(root, d) + result.append( + describe_data_folder( + folder_path=subfolder_path, + indent=indent + 2, + max_files=max_files, + partial_expand_subfolders=partial_expand_subfolders + ) + ) + for file in files: file_path = os.path.join(root, file) file_type = os.path.splitext(file)[1][1:] file_size = os.path.getsize(file_path) + if file_type not in files_count: files_count[file_type] = 0 files_details[file_type] = [] files_count[file_type] += 1 - if len(files_details[file_type]) < 3: - files_details[file_type].append((file, file_size, file_path)) - # Process directories - for d in dirs: - result.append(" " * indent + f"- Folder: {d}") - result.append(describe_data_folder(os.path.join(root, d), indent + 2)) + if len(files_details[file_type]) < max_files: + files_details[file_type].append((file, file_size, file_path)) - # Ensure we only process the current directory and not subdirectories in this loop break # Print the folder and its contents for file_type, count in files_count.items(): - if count > 3: + if count > max_files: result.append(" " * indent + f"{count} {file_type}s:") for file, size, path in files_details[file_type]: result.append(" " * (indent + 2) + f"- {file} ({size} bytes)") - result.append(" " * (indent + 2) + "...") + result.append(" " * (indent + 2) + "... 
(file limit reached)") else: for file, size, path in files_details[file_type]: if file_type == "zip": continue result.append(" " * indent + f"- {file} ({size} bytes)") if file_type == "csv": - result.append(f" " * (indent + 2) + f"- Head of {file}:") + result.append(" " * (indent + 2) + f"- Head of {file}:") result.append(read_csv_head(path, indent + 2)) if file_type == "md": - result.append(f" " * (indent + 2) + f"- Content of {file}:") - with open(path, "r") as f: + result.append(" " * (indent + 2) + f"- Content of {file}:") + with open(path, "r", encoding="utf-8") as f: result.append(f.read()) - return "\n".join(result) + return "\n".join(result) + "\n" class DataScienceScen(Scenario): From 43f8c1fd4df9e3bbdfcf1e555b9429b218096cb8 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 6 Jan 2025 08:27:28 +0000 Subject: [PATCH 186/304] several update on prompts --- .../coder/data_science/ensemble/prompts.yaml | 2 + .../coder/data_science/feature/__init__.py | 2 + .../coder/data_science/feature/prompts.yaml | 13 +++- .../coder/data_science/model/__init__.py | 4 +- .../coder/data_science/model/prompts.yaml | 18 +++-- .../data_science/raw_data_loader/__init__.py | 71 +++++++++++-------- .../data_science/raw_data_loader/prompts.yaml | 35 +++------ .../coder/data_science/workflow/prompts.yaml | 2 + rdagent/core/experiment.py | 9 ++- rdagent/utils/env.py | 10 +++ 10 files changed, 105 insertions(+), 61 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index 23923980e..2045ddfda 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -11,7 +11,9 @@ ensemble_coder: "code": "The Python code as a string." } + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} -----------Here is the relevant information for this task----------- + {% endif %} {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 6b30263d5..a01135b5c 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -42,6 +42,8 @@ def implement_one_task( # 2. code system_prompt = T(".prompts:feature.system").r( + task_desc=feature_information_str, + data_loader_code=workspace.file_dict.get("load_data.py"), queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], ) diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index b0831e20c..0e6e41ebc 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -2,18 +2,29 @@ feature: system: |- You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. 
+ + Your task is as follows: + {{task_desc}} This project involves implementing feature engineering techniques to prepare data for machine learning models, and this project code will be written by GPT. Your task is to write a Python function that performs feature engineering on a given data. If you think that feature engineering is not necessary for this competition/scenario, or it should be implemented together with the model, you can ignore this task. You should follow the provided specifications to complete this task. + Your function input is the output of a data loading function, the data loader function code is as follows: + ```python + {{data_loader_code}} + ``` + Please understand the code and try to implement the feature engineering function based on the data loader output. + Please response the code in the following json format. Here is an example structure for the JSON output: { "code": "The Python code as a string." } + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} -----------Here is the relevant information for this task----------- + {% endif %} {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== @@ -72,4 +83,4 @@ feature_eval: user: |- ``` {{stdout}} - ``` \ No newline at end of file + ``` diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index e31bff1d4..0323a2ae1 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -45,12 +45,14 @@ def implement_one_task( # 2. code system_prompt = T(".prompts:model_coder.system").r( + task_desc=model_information_str, + data_loader_code=workspace.file_dict.get("load_data.py"), + feature_code=workspace.file_dict["feature.py"], queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], ) user_prompt = T(".prompts:model_coder.user").r( model_spec=workspace.file_dict["spec/model.md"], - feature_code=workspace.file_dict["feature.py"], latest_code=workspace.file_dict.get(f"{target_task.name}.py", None), ) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 1f0739280..d6e1d5d91 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -3,8 +3,17 @@ model_coder: You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + Your task is as follows: + {{task_desc}} + The user's ultimate goal is to obtain accurate predictions from the model on input data. Follow the instructions below to ensure your response is correct and aligned with the user's expectations. + Your function's input is from the output of a feature engineering function whose input is the output of a data loading function. 
The raw data loader function and feature engineer function code is as follows: + --------- Raw Data Loader Code: --------- + {{data_loader_code}} + --------- Feature Engineering Code: --------- + {{feature_code}} + Instructions for Code Generation: Leveraging User Inputs: The user may provide various forms of additional information to guide you: @@ -27,13 +36,15 @@ model_coder: "code": "Your corrected or newly implemented Python code as a single string" } + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} -----------Here is the relevant information for this task----------- + {% endif %} {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.file_dict["model01.py"] }} + {{ similar_successful_knowledge.implementation.file_dict[similar_successful_knowledge.target_task.name + ".py"] }} {% endfor %} {% endif %} @@ -41,7 +52,7 @@ model_coder: --------------Previous Failed Attempts:-------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: =====Code:===== - {{ former_failed_knowledge.implementation.file_dict["model01.py"] }} + {{ former_failed_knowledge.implementation.file_dict[former_failed_knowledge.target_task.name + ".py"] }} =====Feedback:===== {{ former_failed_knowledge.feedback }} {% endfor %} @@ -51,9 +62,6 @@ model_coder: ---------Model Specification--------- {{ model_spec }} - ---------Feature Engineering Code--------- - {{ feature_code }} - {% if latest_code %} ---------Former Code--------- Former Code: {{ latest_code }} diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 79eca029f..e67799a29 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -69,38 +69,51 @@ def implement_one_task( ) # 1. specifications - # TODO: Why is queried_former_failed_knowledge[0] used here? 
- system_prompt = T(".prompts:spec.system").r( - competition_info=competition_info, - queried_similar_successful_knowledge=queried_similar_successful_knowledge, - queried_former_failed_knowledge=queried_former_failed_knowledge[0], - ) - data_loader_prompt = T(".prompts:spec.user.data_loader").r( - latest_spec=workspace.file_dict.get("spec/data_loader.md") - ) - feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.file_dict.get("spec/feature.md")) - model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.file_dict.get("spec/model.md")) - ensemble_prompt = T(".prompts:spec.user.ensemble").r(latest_spec=workspace.file_dict.get("spec/ensemble.md")) - workflow_prompt = T(".prompts:spec.user.workflow").r(latest_spec=workspace.file_dict.get("spec/workflow.md")) - - spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) - - data_loader_spec = json.loads( - spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True) - )["spec"] - feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, json_mode=True))[ - "spec" - ] - model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt, json_mode=True))["spec"] - ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))[ - "spec" - ] - workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))[ - "spec" - ] + # TODO: We may move spec into a separated COSTEER task + if "spec/data_loader.md" not in workspace.file_dict: # Only generate the spec once + system_prompt = T(".prompts:spec.system").r( + task_desc=data_loader_task_info, + competition_info=competition_info, + ) + data_loader_prompt = T(".prompts:spec.user.data_loader").r( + latest_spec=workspace.file_dict.get("spec/data_loader.md") + ) + feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.file_dict.get("spec/feature.md")) + model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.file_dict.get("spec/model.md")) + ensemble_prompt = T(".prompts:spec.user.ensemble").r( + latest_spec=workspace.file_dict.get("spec/ensemble.md") + ) + workflow_prompt = T(".prompts:spec.user.workflow").r( + latest_spec=workspace.file_dict.get("spec/workflow.md") + ) + + spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) + + data_loader_spec = json.loads( + spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True) + )["spec"] + feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, json_mode=True))[ + "spec" + ] + model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt, json_mode=True))[ + "spec" + ] + ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))[ + "spec" + ] + workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))[ + "spec" + ] + else: + data_loader_spec = workspace.file_dict["spec/data_loader.md"] + feature_spec = workspace.file_dict["spec/feature.md"] + model_spec = workspace.file_dict["spec/model.md"] + ensemble_spec = workspace.file_dict["spec/ensemble.md"] + workflow_spec = workspace.file_dict["spec/workflow.md"] # 2. 
code
        system_prompt = T(".prompts:data_loader_coder.system").r(
+            task_desc=data_loader_task_info,
             queried_similar_successful_knowledge=queried_similar_successful_knowledge,
             queried_former_failed_knowledge=queried_former_failed_knowledge[0],
         )
diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml
index 418386a4c..07c8779de 100644
--- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml
+++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml
@@ -5,6 +5,9 @@ spec:
     Currently, you are working on a Kaggle competition project. 
     This project involves analyzing data and building models to beat other competitors, with the code being generated by large language models.
+
+    Your overall task is provided below:
+    {{ task_desc }}

     Your task is to write five specification texts (in markdown format) for the following tasks, based on the competition information provided
     - Data loading (and preprocessing)
@@ -13,33 +16,16 @@
     - Ensemble
     - The overall workflow

-    The specifications for each step should refer to the specifications in the previous sections and be tailored to the competition information provided.
+    The specifications for each step should be tailored to the competition information provided.
+
+    Your specification should consist of two parts:
+    1. The function definition in code format with detailed annotations for each parameter and return value.
+    2. A detailed docstring to the function that explains the purpose of the function, the input parameters, and the output.
+    Your specifications should not include any code implementation, only the function definition and docstring.

     -----------Competition Information-----------
     {{ competition_info }}

-    {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %}
-    -----------Here is the relevant information for this task-----------
-    {% endif %}
-    {% if queried_similar_successful_knowledge|length != 0 %}
-    --------------Successful Implementations for Similar Models:--------------
-    ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:=====
-    {{ similar_successful_knowledge.target_task.get_task_information() }}
-    =====Code:=====
-    {{ similar_successful_knowledge.implementation.file_dict["load_data.py"] }}
-    {% endfor %}
-    {% endif %}
-
-    {% if queried_former_failed_knowledge|length != 0 %}
-    --------------Previous Failed Attempts:--------------
-    {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}:
-    =====Code:=====
-    {{ former_failed_knowledge.implementation.file_dict["load_data.py"] }}
-    =====Feedback:=====
-    {{ former_failed_knowledge.feedback }}
-    {% endfor %}
-
   user:
     data_loader: |-
       Data loader specification text should follow these detailed requirements:
@@ -272,7 +258,8 @@
 data_loader_coder:
   system: |-
     You are a Python data scientist working on a new project. This project will be used to analyze data and build models to predict future outcomes, and this project codes will be written by GPT.
-    Your task is to write a Python function that loads and preprocesses data. The function should take a file path as input and return a pandas DataFrame with the data loaded and preprocessed.
+    Your task is described below:
+    {{ task_desc }}
     You should follow the provided specifications to complete this task.

     Please response the code in the following json format.
Here is an example structure for the JSON output: diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 67608b352..8be9216b2 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -19,7 +19,9 @@ workflow_coder: "code": "The Python code as a string." } + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} -----------Here is the relevant information for this task----------- + {% endif %} {% if queried_similar_successful_knowledge|length != 0 %} --------------Successful Implementations for Similar Models:-------------- ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 8eba11c2b..46b58d486 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -1,8 +1,9 @@ from __future__ import annotations +import json import os -import re import platform +import re import shutil import typing import uuid @@ -13,6 +14,8 @@ from typing import Any, Generic, Optional, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.utils import cache_with_pickle +from rdagent.oai.llm_utils import md5_hash from rdagent.utils.env import Env if typing.TYPE_CHECKING: @@ -198,6 +201,10 @@ def clear(self) -> None: shutil.rmtree(self.workspace_path, ignore_errors=True) self.file_dict = {} + def hash_func(self, env: Env | None = None, entry: str | None = None) -> str: + return md5_hash(json.dumps(tuple(sorted(self.file_dict.items()))) + entry) + + @cache_with_pickle(hash_func) def execute(self, env: Env | None = None, entry: str | None = None) -> object | None: """ Before each execution, make sure to prepare and inject code diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index b26955937..bfbc0e553 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -10,6 +10,7 @@ import json import os import pickle +import re import subprocess import uuid from abc import abstractmethod @@ -302,12 +303,20 @@ def _gpu_kwargs(self, client): return {} return gpu_kwargs + def replace_time_info(self, input_string): + """To remove any time related information from the logs since it will destroy the cache mechanism""" + """We currently set this function as default, but it can be changed in the future""" + datetime_pattern = r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d+)?\b" + output_string = re.sub(datetime_pattern, "[DATETIME]", input_string) + return output_string + def __run( self, entry: str | None = None, local_path: str | None = None, env: dict | None = None, running_extra_volume: dict | None = None, + remove_timestamp: bool = True, ) -> str: if env is None: env = {} @@ -355,6 +364,7 @@ def __run( print(table) for log in logs: decoded_log = log.strip().decode() + decoded_log = self.replace_time_info(decoded_log) if remove_timestamp else decoded_log Console().print(decoded_log, markup=False) log_output += decoded_log + "\n" print(Rule("[bold green]Docker Logs End[/bold green]", style="dark_orange")) From 1f5ce9a1d1f45e160f3de7440236afbfc9e086d0 Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 6 Jan 2025 09:45:52 +0000 Subject: [PATCH 187/304] sample subfolders --- rdagent/scenarios/data_science/debug/data.py | 87 +++++++++++++++++--- 1 file changed, 77 insertions(+), 10 deletions(-) diff --git 
a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index e9074757e..fa87eba8d 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -102,6 +102,13 @@ def reduce(self, df: pd.DataFrame) -> pd.DataFrame: return df.iloc[:ten_percent] +def count_files_in_folder(folder: Path) -> int: + """ + Count the total number of files in a folder, including files in subfolders. + """ + return sum(1 for _ in folder.rglob("*") if _.is_file()) + + def create_debug_data( competition: str, dr_cls: type[DataReducer] = RandDataReducer, @@ -121,32 +128,34 @@ def create_debug_data( dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path # FIXME: don't hardcode this KAGGLE_IMPLEMENT_SETTING if sample_path is None: - # Create a sample folder under the dataset folder, which should be available in docker container sample_path = Path(dataset_path) / "sample" data_folder = Path(dataset_path) / competition sample_folder = Path(sample_path) / competition + total_files_count = count_files_in_folder(data_folder) + print(f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders).") # Traverse the folder and exclude specific file types included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5"} files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] + # This set will store filenames or paths that appear in the sampled data + sample_used_file_names = set() + + # Prepare data handler and reducer + data_handler = GenericDataHandler() + data_reducer = dr_cls(**dr_cls_kwargs) + for file_path in files_to_process: sampled_file_path = sample_folder / file_path.relative_to(data_folder) if sampled_file_path.exists(): continue - sampled_file_path.parent.mkdir(parents=True, exist_ok=True) - if file_path.suffix not in included_extensions: - shutil.copy(file_path, sampled_file_path) + if file_path.suffix.lower() not in included_extensions: continue - # Initialize the generic data handler - data_handler = GenericDataHandler() - - # Initialize the data reducer (e.g., RandDataReducer or ColumnReducer) - data_reducer = dr_cls(**dr_cls_kwargs) - + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) + # Load the original data df = data_handler.load(file_path) @@ -156,6 +165,64 @@ def create_debug_data( # Dump the sampled data try: data_handler.dump(df_sampled, sampled_file_path) + # Extract possible file references from the sampled data + for col in df_sampled.columns: + unique_vals = df_sampled[col].astype(str).unique() + for val in unique_vals: + # Add the entire string to the set; + # in real usage, might want to parse or extract basename, etc. 
+ sample_used_file_names.add(val) except Exception as e: print(f"Error processing {file_path}: {e}") continue + + # Process non-data files + subfolder_dict = {} + for file_path in files_to_process: + if file_path.suffix.lower() in included_extensions: + continue # Already handled above + + rel_dir = file_path.relative_to(data_folder).parent + subfolder_dict.setdefault(rel_dir, []).append(file_path) + + # For each subfolder, decide which files to copy + for rel_dir, file_list in subfolder_dict.items(): + used_files = [] + not_used_files = [] + + # Check if each file is in the "used" list + for fp in file_list: + # If your logic is only about the file's name: + # if fp.name in sample_used_file_names: + if str(fp.name) in sample_used_file_names or str(fp) in sample_used_file_names: + used_files.append(fp) + else: + not_used_files.append(fp) + + # Directly copy used files + for uf in used_files: + sampled_file_path = sample_folder / uf.relative_to(data_folder) + if sampled_file_path.exists(): + continue + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(uf, sampled_file_path) + + # If no files are used, randomly sample files to keep the folder from being empty + if len(used_files) == 0: + if len(file_list) <= 100: + num_to_keep = len(file_list) + else: + num_to_keep = int(len(file_list) * 0.05) + if num_to_keep <= 100: + num_to_keep = 100 # Keep at least one file if fraction is too small + + sampled_not_used = pd.Series(not_used_files).sample(n=num_to_keep, random_state=1) + for nf in sampled_not_used: + sampled_file_path = sample_folder / nf.relative_to(data_folder) + if sampled_file_path.exists(): + continue + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(nf, sampled_file_path) + + final_files_count = count_files_in_folder(sample_folder) + print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.") From 990049535e047c6d09b1c59003a4fb0df5bf14cb Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 6 Jan 2025 10:12:16 +0000 Subject: [PATCH 188/304] Filter the stdout after code execution to remove irrelevant information e.g. progress bars, whitespace characters, excessive line breaks. 
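The core idea of this change is to strip ANSI control codes first and then delete progress-bar fragments with a regex before handing stdout to the LLM. A minimal sketch of that idea follows; the sample stdout and the single simplified pattern are illustrative only, while the ANSI regex matches the one added in rdagent/utils/__init__.py below:

```python
import re

def remove_ansi_codes(s: str) -> str:
    # Strip ANSI control sequences (e.g. colored text) so later regexes see plain text.
    ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
    return ansi_escape.sub("", s)

# Made-up stdout mixing a colored header, a Keras-style progress bar, and a metric line.
raw = "\x1b[32mEpoch 1/2\x1b[0m\n100/100 ━━━━━━━━ 3s 30ms/step\nval_loss: 0.4213\n"

cleaned = remove_ansi_codes(raw)
# Simplified progress-bar pattern: "<n>/<m> ━━━ <t>s <k>ms/step ..."; the real helper
# combines several such alternatives and adds an LLM-assisted fallback.
cleaned = re.sub(r"\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step.*", "", cleaned)
# Collapse the blank line left behind, as the helper does after each substitution.
cleaned = re.sub(r"\s*\n\s*", "\n", cleaned)
print(cleaned)  # "Epoch 1/2" and "val_loss: 0.4213" survive; the bar is gone.
```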
--- .../coder/data_science/model/eval.py | 20 ++++-- rdagent/scenarios/data_science/scen/scen.py | 4 +- rdagent/utils/__init__.py | 70 +++++++++++++++++++ rdagent/utils/agent/tpl.py | 2 +- rdagent/utils/prompts.yaml | 33 +++++++++ 5 files changed, 122 insertions(+), 7 deletions(-) create mode 100644 rdagent/utils/prompts.yaml diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index c51dcf0f8..ff3ce7368 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -4,6 +4,7 @@ """ import json +import re from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING @@ -14,12 +15,14 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend +from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf DIRNAME = Path(__file__).absolute().resolve().parent ModelSingleFeedback = CoSTEERSingleFeedback + # Below are unit tests for testing the specification of the implemented model ------------------ class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): """ @@ -53,22 +56,31 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } de = DockerEnv(conf=ds_docker_conf) fname = "test_model.py" - test_code = (DIRNAME / "eval_tests" / fname).read_text().replace("model01", target_task.name) # only check the model changed this time + test_code = ( + (DIRNAME / "eval_tests" / fname).read_text().replace("model01", target_task.name) + ) # only check the model changed this time implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") if stdout is None: stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." 
+ # Filter out progress bars from stdout using regex + filtered_stdout = filter_progress_bar(stdout) + system_prompt = T(".prompts:model_eval.system").r( - test_code=test_code, scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/model.md"] + test_code=test_code, + scenario=self.scen.get_scenario_all_desc(), + spec=implementation.file_dict["spec/model.md"], ) user_prompt = T(".prompts:model_eval.user").r( - stdout=stdout, + stdout=filtered_stdout, code=implementation.file_dict[f"{target_task.name}.py"], ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index aeec8d2f5..8366dc242 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -90,7 +90,7 @@ def describe_data_folder(folder_path, indent=0, max_files=3, partial_expand_subf folder_path=subfolder_path, indent=indent + 2, max_files=max_files, - partial_expand_subfolders=partial_expand_subfolders + partial_expand_subfolders=partial_expand_subfolders, ) ) else: @@ -106,7 +106,7 @@ def describe_data_folder(folder_path, indent=0, max_files=3, partial_expand_subf folder_path=subfolder_path, indent=indent + 2, max_files=max_files, - partial_expand_subfolders=partial_expand_subfolders + partial_expand_subfolders=partial_expand_subfolders, ) ) diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py index 41a096ace..e0b150e82 100644 --- a/rdagent/utils/__init__.py +++ b/rdagent/utils/__init__.py @@ -7,11 +7,15 @@ # TODO: split the utils in this module into different modules in the future. import importlib +import json import re import sys from types import ModuleType from typing import Union +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + def get_module_by_module_path(module_path: Union[str, ModuleType]): """Load module from path like a/b/c/d.py or a.b.c.d @@ -53,3 +57,69 @@ def convert2bool(value: Union[str, bool]) -> bool: return value else: raise ValueError(f"Unknown value type {value} to bool") + +def remove_ansi_codes(s: str) -> str: + """ + It is for removing ansi ctrl characters in the string(e.g. colored text) + """ + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + return ansi_escape.sub("", s) + + +def filter_progress_bar(stdout: str) -> str: + """ + Filter out progress bars from stdout using regex. 
+ """ + # Initial progress bar regex pattern + progress_bar_re = ( + r"(\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step.*?\u0008+|" + r"\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step|" + r"\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step.*|" + r"\d+/\d+\s+[━]+.*?\u0008+|" + r"\d+/\d+\s+[━]+.*|[ ]*\u0008+)" + ) + + filtered_stdout = remove_ansi_codes(stdout) + filtered_stdout = re.sub(progress_bar_re, "", filtered_stdout) + filtered_stdout = re.sub(r'\s*\n\s*', '\n', filtered_stdout) + + # Check if progress bars are already filtered + system_prompt = T(".prompts:if_filtered.system").r() + user_prompt = T(".prompts:if_filtered.user").r( + filtered_stdout=filtered_stdout, + ) + if_filtered_stdout = json.loads( + APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + ).get("progress bar filtered", False) + + if convert2bool(if_filtered_stdout): + return filtered_stdout + + # Attempt further filtering up to 5 times + for _ in range(5): + system_prompt = T(".prompts:filter_progress_bar.system").r() + user_prompt = T(".prompts:filter_progress_bar.user").r( + stdout=filtered_stdout, + ) + + new_pattern = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["regex pattern"] + + filtered_stdout = re.sub(new_pattern, "", filtered_stdout) + filtered_stdout = re.sub(r'\s*\n\s*', '\n', filtered_stdout) + + system_prompt = T(".prompts:if_filtered.system").r() + user_prompt = T(".prompts:if_filtered.user").r( + filtered_stdout=filtered_stdout, + ) + if_filtered_stdout = json.loads( + APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + ).get("progress bar filtered", False) + + if convert2bool(if_filtered_stdout): + break + + return filtered_stdout \ No newline at end of file diff --git a/rdagent/utils/agent/tpl.py b/rdagent/utils/agent/tpl.py index 9b0af4376..e87559efe 100644 --- a/rdagent/utils/agent/tpl.py +++ b/rdagent/utils/agent/tpl.py @@ -51,7 +51,7 @@ def __init__(self, uri: str): if path_part.startswith("."): yaml_file_path = caller_dir / f"{path_part[1:].replace('.', '/')}.yaml" - self.uri = f"{str(caller_dir.relative_to(PROJ_PATH)).replace('/', '.')}{uri}" + self.uri = f"{str(caller_dir.resolve().relative_to(PROJ_PATH)).replace('/', '.')}{uri}" else: yaml_file_path = (PROJ_PATH / path_part.replace(".", "/")).with_suffix(".yaml") diff --git a/rdagent/utils/prompts.yaml b/rdagent/utils/prompts.yaml new file mode 100644 index 000000000..fd3786586 --- /dev/null +++ b/rdagent/utils/prompts.yaml @@ -0,0 +1,33 @@ +filter_progress_bar: + system: | + You are an assistant helping to filter progress bars from a given text. Generate a regex pattern that matches typical progress bar formats, including those with percentages, loading bars (e.g., `====>`), or dynamic text such as elapsed time or remaining time. Be flexible to handle variations. + user: | + The following text contains stdout with progress bars: + + {{ stdout }} + + Generate a regex pattern to filter out progress bars from this stdout text. Focus on identifying and removing lines or segments that represent progress bars while preserving the rest of the content. + + Please respond with the regex pattern in the following JSON format and order: + ```json + { + "regex pattern": "The regex pattern to filter out progress bars." + } + ``` + +if_filtered: + system: | + You are an assistant helping to verify if progress bars have been successfully filtered from a given text. 
Analyze the filtered text to determine if any progress bar-like patterns remain. + user: | + The following is the filtered stdout text: + + {{ filtered_stdout }} + + Check if the text still contains any progress bar patterns such as percentages, loading bars, or similar elements. Return true if no progress bar remains; otherwise, return false. + + Please respond with your answer in the following JSON format and order: + ```json + { + "progress bar filtered": + } + ``` From 6c23e7d0d3a7098c53d1a0053b489db464c1ab4c Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 6 Jan 2025 10:36:54 +0000 Subject: [PATCH 189/304] Add some more prompts and comments --- .../data_science/proposal/exp_gen.py | 4 +++ .../data_science/proposal/prompts.yaml | 32 +++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 09153281e..9ea05424a 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -205,6 +205,10 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) component = resp_dict_component.get("component", "Component not provided") + + # Why we should split component selection and hpothesis generation + # - after we know the selected component, we can use RAG. + # Step 2: Generate the rest of the hypothesis if component != "Model": hypothesis_sys_prompt = T(".prompts:hypothesis_gen.system").r( diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index df32dabb5..9c217b3f3 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -119,10 +119,38 @@ task_gen_model: Please generate the new {{targets}} task. {% endif %} -hypothesis_and_feedback: |- +component_gen: + system: |- + You are a Kaggle Grander Master. You are going to provide a solution for a kaggle competition. + + Here is the description of the competion scenario + ``` + {{scenario}} + ``` + + Here is the latest version of implementation. + ``` + {{}} + + ``` + + You will be proivded the feedback for the latest implementation. + + Please select the component you are going to improve the latest implementation. + + user: |- + {{feedback}} + + + +exp_and_feedback: |- {% for experiment, feedback in trace.hist[-10:] %} - Hypothesis {{ loop.index }}: {{ experiment.hypothesis }} + ## Experiment {{ loop.index }} + Experiment are focusing on task: {{experiment.sub_tasks[0]}} + {% if experiment.hypothesis %} + The experiment is design driven by hypothesis : {{ experiment.hypothesis }} Observation on the result with the hypothesis: {{ feedback.observations }} + {% endif %} Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} Did changing to this hypothesis work? 
(focus on the change): {{ feedback.decision }} {% endfor %} From 929509428378431e4c7f6b1cc80fe457cd7cf2ed Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 6 Jan 2025 11:23:59 +0000 Subject: [PATCH 190/304] several update on the first init rounds --- .../coder/data_science/ensemble/__init__.py | 1 + .../coder/data_science/ensemble/eval.py | 19 +++++++++--- .../ensemble/eval_tests/ensemble_test.py | 20 +++++------- .../coder/data_science/ensemble/prompts.yaml | 4 +++ .../coder/data_science/model/eval.py | 15 ++++++--- .../{test_model.py => model_test.py} | 0 .../data_science/raw_data_loader/prompts.yaml | 10 +++++- .../data_science/proposal/exp_gen.py | 31 ++++++++++++++----- .../data_science/proposal/prompts.yaml | 6 ++++ 9 files changed, 76 insertions(+), 30 deletions(-) rename rdagent/components/coder/data_science/model/eval_tests/{test_model.py => model_test.py} (100%) diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index 5f225be17..b6ee34d4c 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -55,6 +55,7 @@ def implement_one_task( # Generate code with knowledge integration competition_info = self.scen.get_scenario_all_desc() system_prompt = T(".prompts:ensemble_coder.system").r( + task_desc=ensemble_information_str, competition_info=competition_info, queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=( diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index c7ed9b88f..de5b2bfc3 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -1,9 +1,10 @@ import json from dataclasses import dataclass from pathlib import Path + from jinja2 import Environment, StrictUndefined -from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERSingleFeedback, @@ -44,15 +45,23 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } de = DockerEnv(conf=ds_docker_conf) fname = "ensemble_test.py" test_code = (DIRNAME / "eval_tests" / "ensemble_test.py").read_text() - test_code = Environment(undefined=StrictUndefined).from_string(test_code).render( - model_names=[fn[:-3] for fn in implementation.file_dict.keys() if fn.startswith("model_")] + test_code = ( + Environment(undefined=StrictUndefined) + .from_string(test_code) + .render( + model_names=[ + fn[:-3] for fn in implementation.file_dict.keys() if fn.startswith("model_") and "test" not in fn + ] + ) ) - + implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index ab135559a..6a40b2d8e 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -31,19 +31,13 @@ {% endfor %} # Run 
ensemble -try: - final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) +final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) - # Check shape - assert final_pred.shape == val_y.shape, "Wrong output shape" +# Check shape +assert final_pred.shape[0] == test_X.shape[0], "Wrong output sample size" - # check if scores.csv is generated - if not Path("scores.csv").exists(): - raise Exception("scores.csv is not generated") - - print("Ensemble test passed successfully.") - print(f"Output shape: {final_pred.shape}") - print(f"Unique values in predictions: {np.unique(final_pred)}") +# check if scores.csv is generated +assert Path("scores.csv").exists(), "scores.csv is not generated" -except Exception as e: - print(f"Test failed: {str(e)}") +print("Ensemble test passed successfully.") +print(f"Output shape: {final_pred.shape}") diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index 2045ddfda..99047ab0d 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -1,6 +1,10 @@ ensemble_coder: system: |- You are a Python data scientist working on model ensemble implementation. Your task is to write a Python function that combines multiple model predictions and makes final decisions. + + Your specific task as follows: + {{task_desc}} + You should follow the provided specifications to complete this task. -----------Competition Information----------- diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index c51dcf0f8..90c9c89e5 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -20,6 +20,7 @@ DIRNAME = Path(__file__).absolute().resolve().parent ModelSingleFeedback = CoSTEERSingleFeedback + # Below are unit tests for testing the specification of the implemented model ------------------ class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): """ @@ -53,11 +54,15 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } de = DockerEnv(conf=ds_docker_conf) - fname = "test_model.py" - test_code = (DIRNAME / "eval_tests" / fname).read_text().replace("model01", target_task.name) # only check the model changed this time + fname = "model_test.py" + test_code = ( + (DIRNAME / "eval_tests" / fname).read_text().replace("model01", target_task.name) + ) # only check the model changed this time implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") @@ -65,7 +70,9 @@ def evaluate( stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." 
system_prompt = T(".prompts:model_eval.system").r( - test_code=test_code, scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/model.md"] + test_code=test_code, + scenario=self.scen.get_scenario_all_desc(), + spec=implementation.file_dict["spec/model.md"], ) user_prompt = T(".prompts:model_eval.user").r( stdout=stdout, diff --git a/rdagent/components/coder/data_science/model/eval_tests/test_model.py b/rdagent/components/coder/data_science/model/eval_tests/model_test.py similarity index 100% rename from rdagent/components/coder/data_science/model/eval_tests/test_model.py rename to rdagent/components/coder/data_science/model/eval_tests/model_test.py diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 51ec9a49a..39195129b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -41,6 +41,7 @@ spec: - Describe the purpose of the function. - Specify the data source location (`/kaggle/input/`). - Clearly define the structure and type of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. 2. Precautions for Data Loading and Preprocessing: - File Handling: @@ -87,8 +88,13 @@ spec: - Describe the purpose of the function. - Clarify the input parameters and their data types. - Define the structure and format of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. 2. Precautions for Feature Engineering: + - Well handle the shape of the data + - The sample size of the train data and the test data should be the same in all scenarios. + - To most of the scenario, the input shape and the output shape should be exactly the same. + - To some tabular data, you may add or remove some columns so your inferred column number may be unsure. - Integration with Model Pipeline - If feature engineering is strictly part of the model pipeline, state explicitly that it will be handled at the model stage. - If integrated here, ensure this function applies all required transformations while avoiding data leakage. @@ -138,6 +144,7 @@ spec: - Describe the purpose of the function. - Clarify the input parameters and their data types. - Define the structure and format of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. 2. Code Standards: - Avoid using progress bars (e.g., `tqdm`) in the implementation. @@ -178,6 +185,7 @@ spec: - Describe the purpose of the function. - Clarify the input parameters and their data types. - Define the structure and format of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. 2. Precautions: - Validation of Inputs: @@ -241,7 +249,7 @@ spec: 4. Code Standards: - Use consistent naming conventions and type annotations. - - Document the workflow with clear comments and docstrings. + - Document the workflow with clear comments and docstring. - Do not use progress bars (e.g., tqdm) in the code. 
{% if latest_spec %} diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index b3dbc8e34..d6447193e 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -2,6 +2,8 @@ import re from typing import Literal +import pandas as pd + from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask @@ -20,7 +22,6 @@ from rdagent.scenarios.data_science.experiment.experiment import COMPONENT, DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen from rdagent.utils.agent.tpl import T -import pandas as pd class DSHypothesis(Hypothesis): @@ -74,12 +75,14 @@ def llm_task_gen( targets: str, scenario_desc: str, task_output_format: str, + spec: str = None, hypothesis: Hypothesis | None = None, hypothesis_and_feedback: str | None = None, ) -> dict: system_prompt = T(".prompts:task_gen.system").r( targets=targets, scenario=scenario_desc, + task_specification=spec, hypothesis=hypothesis, task_output_format=task_output_format, ) @@ -124,6 +127,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Feature Engineering", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/feature.md"], task_output_format=T(".prompts:output_format.feature").r(), ) @@ -138,6 +142,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Models", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/model.md"], task_output_format=T(".prompts:output_format.model").r(), ) @@ -155,6 +160,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Ensemble", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/ensemble.md"], task_output_format=T(".prompts:output_format.ensemble").r(), ) @@ -169,6 +175,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Workflow", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/workflow.md"], task_output_format=T(".prompts:output_format.workflow").r(), ) @@ -197,7 +204,9 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) resp_dict_component: dict = json.loads( - APIBackend().build_messages_and_create_chat_completion(component_user_prompt, component_sys_prompt, json_mode=True) + APIBackend().build_messages_and_create_chat_completion( + component_user_prompt, component_sys_prompt, json_mode=True + ) ) component = resp_dict_component.get("component", "Component not provided") @@ -216,7 +225,9 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) resp_dict: dict = json.loads( - APIBackend().build_messages_and_create_chat_completion(hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True) + APIBackend().build_messages_and_create_chat_completion( + hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True + ) ) hypothesis = DSHypothesis( component=resp_dict.get("component", "Component not provided"), @@ -235,9 +246,9 @@ def gen(self, trace: DSTrace) -> DSExperiment: if re.match(r"^model_.+\.py", fname): model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{sota_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) - + model_num = len(model_infos) - models_info_str = 
("-"*20).join(model_infos) + models_info_str = ("-" * 20).join(model_infos) if model_num >= 3: hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( targets="data science project", @@ -261,7 +272,9 @@ def gen(self, trace: DSTrace) -> DSExperiment: hypothesis_and_feedback=hypothesis_and_feedback, ) resp_dict: dict = json.loads( - APIBackend().build_messages_and_create_chat_completion(hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True) + APIBackend().build_messages_and_create_chat_completion( + hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True + ) ) hypothesis = DSHypothesis( component=resp_dict.get("component", "Component not provided"), @@ -278,6 +291,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Data loader and specification generation", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/data_loader.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.data_loader").r(), hypothesis_and_feedback=hypothesis_and_feedback, @@ -298,12 +312,12 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Feature Engineering", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/feature.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.feature").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) - ft = FeatureTask( name="Feature Engineering", description=resp_dict.get("description", "Feature description not provided"), @@ -315,6 +329,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: elif hypothesis.component == "Model": resp_dict = self.llm_task_gen( scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/model.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.model").r(), hypothesis_and_feedback=hypothesis_and_feedback, @@ -336,6 +351,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Ensemble", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/ensemble.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.ensemble").r(), hypothesis_and_feedback=hypothesis_and_feedback, @@ -353,6 +369,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Workflow", scenario_desc=scenario_desc, + spec=sota_exp.experiment_workspace.file_dict["spec/workflow.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.workflow").r(), hypothesis_and_feedback=hypothesis_and_feedback, diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 23f3b90e0..3a6fbc155 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -59,6 +59,12 @@ task_gen: The {{targets}} are used in certain scenario, the scenario is as follows: {{ scenario }} + {% if task_specification is not none %} + The user has wrote some specification for the {{targets}}. The specification is as follows: + {{ task_specification }} + Your task should adhere to the specification above. + {% endif %} + {% if hypothesis is not none %} The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: 1. The target hypothesis you are targeting to generate {{targets}} for. 
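Before the next patch, a short sketch of how the reworked ensemble evaluator's Jinja rendering behaves may help; the workspace file names and the toy template below are invented for illustration, and only the selection rule (take `model_*.py`, skip test helpers, strip the `.py` suffix) mirrors the patched evaluator:

```python
from jinja2 import Environment, StrictUndefined

# Hypothetical workspace contents; only the keys matter for this example.
file_dict = {
    "model_nn.py": "...",
    "model_gbdt.py": "...",
    "model_test.py": "...",  # unit-test helper, must not be ensembled
    "ensemble.py": "...",
}

# Same filtering rule as the evaluator above: model files only, test files excluded.
model_names = [fn[:-3] for fn in file_dict if fn.startswith("model_") and "test" not in fn]

# Toy stand-in for eval_tests/ensemble_test.py.
template = "{% for mn in model_names %}collect predictions from {{ mn }}\n{% endfor %}"
rendered = Environment(undefined=StrictUndefined).from_string(template).render(model_names=model_names)
print(rendered)
# collect predictions from model_nn
# collect predictions from model_gbdt
```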
From edeb3373096a4a52175f38dc9276f12b40f03e3d Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 7 Jan 2025 03:05:11 +0000 Subject: [PATCH 191/304] model timeout as error --- rdagent/components/coder/data_science/model/prompts.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 72f178f0b..55a0a1e12 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -108,6 +108,7 @@ model_eval: When conducting evaluation, please refer to the requirements provided in spec.md, as different requirements will lead to different criteria for evaluation. Only if there is "Model code test passed successfully." in the stdout, then the model is considered successful, or else there must be some issues with the model. + If no stdout is provided, the model is considered to have failed due to a timeout. Please check if there are any ways to improve the model's execution speed. Please respond with your feedback in the following JSON format and order: ```json From 5e4f5442fbb248e42b4304aa2bfbca5cf404cbc1 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 7 Jan 2025 03:24:44 +0000 Subject: [PATCH 192/304] fix pattern of getting model codes in workspace --- rdagent/components/coder/data_science/workflow/__init__.py | 2 +- rdagent/core/experiment.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 92948a57d..e7da0ed38 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -49,7 +49,7 @@ def implement_one_task( user_prompt = T(".prompts:workflow_coder.user").r( load_data_code=workspace.file_dict["load_data.py"], feature_code=workspace.file_dict["feature.py"], - model_codes=workspace.get_codes(r'^model_.+\.py$'), + model_codes=workspace.get_codes(r'^model_(?!test)\w+\.py$'), ensemble_code=workspace.file_dict["ensemble.py"], latest_code=workspace.file_dict.get("main.py"), workflow_spec=workspace.file_dict["spec/workflow.md"], diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index fe247af93..c842bb08d 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -150,6 +150,7 @@ def get_codes(self, pattern: str) -> str: """ filtered_dict = {k: v for k, v in self.file_dict.items() if re.search(pattern, k) and k.endswith(".py") and "test" not in k} return self._format_code_dict(filtered_dict) + def prepare(self) -> None: """ Prepare the workspace except the injected code From d657834fcaa2dee30220e79537c11c7a0892fcb7 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 7 Jan 2025 03:42:01 +0000 Subject: [PATCH 193/304] small bux fix on model prompts --- .../coder/data_science/model/__init__.py | 16 ++++++++++------ .../coder/data_science/model/prompts.yaml | 2 +- rdagent/core/experiment.py | 13 ++++++++++++- rdagent/utils/agent/tpl.yaml | 2 +- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index c3bb5b90d..9307c46c7 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -62,14 +62,18 @@ def implement_one_task( # We want to use a simpler way to user_prompt = 
T(".prompts:model_coder.user_general").r( model_spec=workspace.file_dict["spec/model.md"], - worksapce_code=workspace.all_codes, # TODO: If we have high failure rate here, we should clean this step with less information. + workspace_code=workspace.get_code_with_key( + "model" + ), # TODO: If we have high failure rate here, we should clean this step with less information. ) - batch_edit = BatchEditOut.extract_output(APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=BatchEditOut.json_mode, - )) + batch_edit = BatchEditOut.extract_output( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=BatchEditOut.json_mode, + ) + ) return batch_edit diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 72f178f0b..2024b1843 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -78,7 +78,7 @@ model_coder: user_general: |- --------- Workspace code--------- - {{ worksapce_code }} + {{ workspace_code }} ---------Model Specification--------- When you are implementing the code, you should follow the spec {{ model_spec }} diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index fe247af93..27c66c282 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -144,12 +144,22 @@ def all_codes(self) -> str: filtered_dict = {k: v for k, v in self.file_dict.items() if k.endswith(".py") and "test" not in k} return self._format_code_dict(filtered_dict) + def get_code_with_key(self, key: str) -> str: + """ + Get the code if the file name contains the key. + """ + filtered_dict = {k: v for k, v in self.file_dict.items() if k.endswith(".py") and "test" not in k and key in k} + return self._format_code_dict(filtered_dict) + def get_codes(self, pattern: str) -> str: """ Get code files matching a specific pattern as a single string, excluding test files. """ - filtered_dict = {k: v for k, v in self.file_dict.items() if re.search(pattern, k) and k.endswith(".py") and "test" not in k} + filtered_dict = { + k: v for k, v in self.file_dict.items() if re.search(pattern, k) and k.endswith(".py") and "test" not in k + } return self._format_code_dict(filtered_dict) + def prepare(self) -> None: """ Prepare the workspace except the injected code @@ -174,6 +184,7 @@ def link_all_files_in_folder_to_workspace(data_path: Path, workspace_path: Path) os.link(data_file_path, workspace_data_file_path) DEL_KEY = "__DEL__" + def inject_files(self, **files: str) -> None: """ Inject the code into the folder. diff --git a/rdagent/utils/agent/tpl.yaml b/rdagent/utils/agent/tpl.yaml index d69425e75..4ba6b07de 100644 --- a/rdagent/utils/agent/tpl.yaml +++ b/rdagent/utils/agent/tpl.yaml @@ -11,7 +11,7 @@ BatchEditOut: |- Files that do not need to be modified do not need to be included in the returned dict. For example: - Inject the code into the folder. + Inject the code into the folder. Your file name should always contain the suffix. { : "", // indicate writing into (create new file or replace existing file) : "__DEL__" // indicate removing file name2. 
When we want to replace a file to a new one, we usually use this From eb891535ae295afe8302f0cdaa81c5b688487ca2 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 7 Jan 2025 03:49:31 +0000 Subject: [PATCH 194/304] remove get_code_with_key since we have regex pattern --- rdagent/components/coder/data_science/model/__init__.py | 4 ++-- rdagent/core/experiment.py | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 9307c46c7..42dfff3e0 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -62,8 +62,8 @@ def implement_one_task( # We want to use a simpler way to user_prompt = T(".prompts:model_coder.user_general").r( model_spec=workspace.file_dict["spec/model.md"], - workspace_code=workspace.get_code_with_key( - "model" + workspace_code=workspace.get_codes( + r"^model_(?!test)\w+\.py$" ), # TODO: If we have high failure rate here, we should clean this step with less information. ) diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 27c66c282..564cc0646 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -144,13 +144,6 @@ def all_codes(self) -> str: filtered_dict = {k: v for k, v in self.file_dict.items() if k.endswith(".py") and "test" not in k} return self._format_code_dict(filtered_dict) - def get_code_with_key(self, key: str) -> str: - """ - Get the code if the file name contains the key. - """ - filtered_dict = {k: v for k, v in self.file_dict.items() if k.endswith(".py") and "test" not in k and key in k} - return self._format_code_dict(filtered_dict) - def get_codes(self, pattern: str) -> str: """ Get code files matching a specific pattern as a single string, excluding test files. 
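Since the helper is now driven purely by a regex, a quick example of how that negative-lookahead pattern selects workspace files may be useful; the file list here is made up:

```python
import re

# Pattern passed to get_codes above: files starting with "model_" whose remainder
# does not begin with "test".
pattern = r"^model_(?!test)\w+\.py$"

files = ["model_nn.py", "model_gbdt.py", "model_test.py", "ensemble.py", "load_data.py"]
print([f for f in files if re.search(pattern, f)])
# ['model_nn.py', 'model_gbdt.py']
```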
From 9d27fe7bc69afd3e3c27a1803c9956b95825ebcc Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 7 Jan 2025 02:07:05 +0000 Subject: [PATCH 195/304] fix: Correct tqdm progress bar update logic in LoopBase class --- rdagent/utils/workflow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 28cf1375f..6648f1e0e 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -130,9 +130,8 @@ def run(self, step_n: int | None = None): self.loop_trace[li].append(LoopTrace(start, end, step_idx=si)) # Update tqdm progress bar directly to step_idx - pbar.n = len(self.steps) - self.step_idx # FIXME: check it's correctness + pbar.n = si + 1 pbar.set_postfix(loop_index=li, step_index=si, step_name=name) - pbar.update(0) # Refresh the display # index increase and save session self.step_idx = (self.step_idx + 1) % len(self.steps) From 0e671ab1c69bd58687ab5b3ebdcadbcc05f1a662 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 7 Jan 2025 06:54:51 +0000 Subject: [PATCH 196/304] feat: Add diff generation and enhance feedback mechanism in data science loop --- rdagent/app/data_science/loop.py | 12 +- rdagent/core/experiment.py | 2 +- .../scenarios/data_science/dev/feedback.py | 107 ++++++++++++++---- .../scenarios/data_science/dev/prompts.yaml | 57 +++++++--- .../data_science/proposal/exp_gen.py | 25 +++- 5 files changed, 157 insertions(+), 46 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index b7b2faae4..0c1ea22a5 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -82,7 +82,7 @@ def coding(self, prev_out: dict[str, Any]): def running(self, prev_out: dict[str, Any]): exp: DSExperiment = prev_out["coding"] if exp.next_component_required() is None: - return self.runner.run(exp) + return self.runner.develop(exp) else: return exp @@ -112,6 +112,16 @@ def record(self, prev_out: dict[str, Any]): def main(path=None, step_n=None, competition="bms-molecular-translation"): """ + + Parameters + ---------- + path : + path like `$LOG_PATH/__session__/1/0_propose`. It indicates that we restore the state that after finish the step 0 in loop1 + step_n : + How many steps to run; if None, it will run forever until error or KeyboardInterrupt + competition : + + Auto R&D Evolving loop for models in a kaggle{} scenario. You can continue running session by .. code-block:: bash diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 564cc0646..06ce05a1c 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -133,7 +133,7 @@ def _format_code_dict(code_dict: dict[str, str]) -> str: """ code_string = "" for file_name, code in code_dict.items(): - code_string += f"File Path: {file_name}\n```\n{code}\n```" + code_string += f"\nFile Path: {file_name}\n```\n{code}\n```" return code_string @property diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index cd32d8803..9e39ad5da 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -13,6 +13,67 @@ from rdagent.utils.agent.tpl import T +from typing import List +import difflib +from pathlib import Path + +# TODO: find a better place. +def generate_diff(dir1: str, dir2: str) -> List[str]: + """ + Generate a diff between two directories, considering only .py files. + It is mocking `diff -durN dir1 dir2` in linux. + + Args: + dir1 (str): Path to the first directory. 
+ dir2 (str): Path to the second directory. + + Returns: + List[str]: A list of diffs for .py files that are different between the two directories. + """ + + diff_files = [] + + dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob('*.py') if f.is_file()} + dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob('*.py') if f.is_file()} + + all_files = dir1_files.union(dir2_files) + + for file in all_files: + file1 = Path(dir1) / file + file2 = Path(dir2) / file + + if file1.exists() and file2.exists(): + with file1.open() as f1, file2.open() as f2: + diff = list(difflib.unified_diff( + f1.readlines(), + f2.readlines(), + fromfile=str(file1), + tofile=str(file2) + )) + if diff: + diff_files.extend(diff) + else: + if file1.exists(): + with file1.open() as f1: + diff = list(difflib.unified_diff( + f1.readlines(), + [], + fromfile=str(file1), + tofile=str(file2) + " (empty file)" + )) + diff_files.extend(diff) + elif file2.exists(): + with file2.open() as f2: + diff = list(difflib.unified_diff( + [], + f2.readlines(), + fromfile=str(file1) + " (empty file)", + tofile=str(file2) + )) + diff_files.extend(diff) + + return diff_files + class DSExperiment2Feedback(Experiment2Feedback): def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeedback: # 用哪些信息来生成feedback @@ -22,47 +83,45 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed # 4. result 任务的结果 # 5. sota_exp.result 之前最好的结果 sota_exp = trace.sota_experiment() - hypothesis = exp.hypothesis current_results = exp.result - if hypothesis.component == "DataLoadSpec": - modified_file_name = "load_data.py" - elif hypothesis.component == "FeatureEng": - modified_file_name = "feature.py" - elif hypothesis.component == "Model": - modified_file_name = "model01.py" - elif hypothesis.component == "Ensemble": - modified_file_name = "ensemble.py" - elif hypothesis.component == "Workflow": - modified_file_name = "main.py" - modified_code = exp.experiment_workspace.file_dict[modified_file_name] + + # TODO: + # - Should we choose between the diff from last experiment or last sota ? + + # Retrieve the last experiment from the history + last_exp = trace.hist[-1][0] if trace.hist else None + if last_exp: + last_workspace_path = last_exp.experiment_workspace.workspace_path + current_workspace_path = exp.experiment_workspace.workspace_path + # Generate a diff between the two workspaces + diff_edition = generate_diff(last_workspace_path, current_workspace_path) + else: + diff_edition = [] sota_exp = trace.sota_experiment() + # assumption: + # The feedback should focus on experiment **improving**. 
+ # Assume that all the the sota exp is based on the previous sota experiment if sota_exp: - sota_codes = { - "load_data.py": (sota_exp.experiment_workspace.workspace_path / "load_data.py").read_text(), - "feature.py": (sota_exp.experiment_workspace.workspace_path / "feature.py").read_text(), - "model.py": (sota_exp.experiment_workspace.workspace_path / "model.py").read_text(), - "ensemble.py": (sota_exp.experiment_workspace.workspace_path / "ensemble.py").read_text(), - "main.py": (sota_exp.experiment_workspace.workspace_path / "main.py").read_text(), - } + sota_codes = sota_exp.experiment_workspace.all_codes sota_results = sota_exp.result else: sota_codes = None sota_results = None - last_hypothesis_and_feedback = None + last_exp_and_feedback = None if trace.hist and len(trace.hist) > 0: - last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) + last_exp_and_feedback = (trace.hist[-1][0], trace.hist[-1][1]) system_prompt = T(".prompts:exp_feedback.system").r(scenario=self.scen.get_scenario_all_desc()) user_prompt = T(".prompts:exp_feedback.user").r( sota_codes=sota_codes, sota_results=sota_results, - hypothesis=str(hypothesis), - modified_code=modified_code, + cur_exp=exp, + diff_edition=diff_edition, current_results=current_results, - last_hypothesis_and_feedback=last_hypothesis_and_feedback, + last_exp_and_feedback=last_exp_and_feedback, ) resp_dict = json.loads( diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml index dffaf5a76..fada2d1fb 100644 --- a/rdagent/scenarios/data_science/dev/prompts.yaml +++ b/rdagent/scenarios/data_science/dev/prompts.yaml @@ -9,6 +9,7 @@ exp_feedback: 1. Confirm if the current result supports or refutes the hypothesis. 2. Compare with previous best results. 3. Suggest improvements or new directions. Stay innovative and adapative. + 4. SOTA results are the best outcomes we have achieved in this scenario. If we do not have complete results (i.e., results that are runnable and can generate evaluation outcomes), please replace it as the best result. Please provide detailed and constructive feedback. Example JSON Structure for Result Analysis: @@ -23,32 +24,56 @@ exp_feedback: user: |- We are in a process of finding and validating hypotheses to build powerful codes. Each round aims to confirm or reject hypotheses based on results. + ## SOTA of previous exploration of the scenario {% if sota_codes %} - {% for name, code in sota_codes.items() %} - The SOTA code for {{ name }} is: - ```python - {{ code }} - ``` - {% endfor %} + ### Code + The improvement of sota results is + {{sota_codes}} + + ### According results + {% if sota_results %} The SOTA results is: {{ sota_results }} + {% else %} + There are no previous results + {% endif %} + {% else %} + We have no previous complete complete results. 
{% endif %} + ## Current solution Current solution to be evaluated: - Hypothesis: {{ hypothesis }} + + ### Task of Current solution + {{cur_exp.sub_tasks[0].get_task_information()}} + + {% if cur_exp.hypothesis %} + the experiment is designed based on hypothesis: {{ cur_exp.hypothesis }} Modified code according to hypothesis: - ```python - {{ modified_code }} - ``` + {% else %} + Modified code: + {% endif %} + + {% for de in diff_edition %} + {{ de }} + {% endfor %} + Final results of the current solution: {{ current_results }} + ### Code of current solution + {{cur_exp.experiment_workspace.all_codes}} - {% if last_hypothesis_and_feedback %} - The user has made some hypothesis and conducted experiments to validate them, and the results are as follows: - hypothesis: {{ last_hypothesis_and_feedback[0].hypothesis }} - feedback decision: {{ last_hypothesis_and_feedback[1].decision }} - reason: {{ last_hypothesis_and_feedback[1].reason }} + {% if last_exp_and_feedback and last_exp_and_feedback|length > 1 %} + ## Previous trial + Before current trial, previous recent trial is listed below. + {% if last_exp_and_feedback[0].hypothesis %} + the experiment is designed based on hypothesis: {{ last_exp_and_feedback[0].hypothesis }} + {% endif %} + ### Task of previous trial + {{ last_exp_and_feedback[0].sub_tasks[0].get_task_information() }} + feedback decision: {{ last_exp_and_feedback[1].decision }} + reason: {{ last_exp_and_feedback[1].reason }} {% endif %} - Please refer to these hypothesis and feedback to help you recommend new hypothesis + Please refer to these hypotheses and feedback to help you recommend new experiment and hypothesis Consider Changing Direction for Significant Gaps with the Best Result and the last round: - If the new results significantly differ from SOTA, consider a new direction. diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index b8d7bd905..775f79b35 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -59,11 +59,28 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = self.hist: list[tuple[DSExperiment, ExperimentFeedback]] = [] self.knowledge_base = knowledge_base - def sota_experiment(self) -> Experiment | None: - """Access the last experiment result.""" + def sota_experiment(self, last_n: int = -1) -> Experiment | None: + """ + Access the last experiment result. + + Parameters + ---------- + last_n : int + The index from the last experiment result to access. + Use -1 for the most recent experiment, -2 for the second most recent, and so on. + + Returns + ------- + Experiment or None + The experiment result if found, otherwise None. + """ + assert last_n < 0 for exp, ef in self.hist[::-1]: - if ef.decision: - return exp + # the sota exp should be accepted decision and all required components are completed. 
+ if ef.decision and exp.next_component_required() is None: + last_n += 1 + if last_n == 0: + return exp return None From a300ae4243792b699dddc152b7bb1bb9eaae0750 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 7 Jan 2025 07:33:37 +0000 Subject: [PATCH 197/304] update some fix to model and workflow prompts --- .../coder/CoSTEER/evolving_agent.py | 2 +- .../coder/data_science/model/__init__.py | 6 +++ .../coder/data_science/model/eval.py | 45 ------------------- .../coder/data_science/model/prompts.yaml | 6 ++- .../data_science/raw_data_loader/prompts.yaml | 4 ++ .../coder/data_science/workflow/__init__.py | 4 +- .../coder/data_science/workflow/prompts.yaml | 6 +++ .../data_science/proposal/exp_gen.py | 7 +-- 8 files changed, 29 insertions(+), 51 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index 50b22a5f7..c5c95d980 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -14,7 +14,7 @@ def filter_evolvable_subjects_by_feedback( assert len(evo.sub_workspace_list) == len(feedback) for index in range(len(evo.sub_workspace_list)): - if evo.sub_workspace_list[index] is not None and feedback[index] and not feedback[index].final_decision: + if evo.sub_workspace_list[index] is not None and feedback[index] is not None and not feedback[index]: evo.sub_workspace_list[index].clear() if all(not f.final_decision for f in feedback if f): diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 42dfff3e0..b5de472a4 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -75,6 +75,12 @@ def implement_one_task( ) ) + # 3. post process to align file name to the task name + for key, value in batch_edit.items(): + if value != "__DEL__" and key != f"{target_task.name}.py": + batch_edit[f"{target_task.name}.py"] = value + del batch_edit[key] + return batch_edit def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 17a12e10f..ab967836b 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -85,48 +85,3 @@ def evaluate( ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return ModelSingleFeedback(**json.loads(resp)) - - -class XXX2SpecEval: - """ - Based on XXX1SpecEval, but considering the following case: - - Motivation case: - - Sometimes we don't need validation (e.g., simple models not prone to overfitting, or data is too scarce to split). - - Test workflow: - - Build train and test data to run it, and test the output (e.g., shape, etc.) - - valid_data == None - """ - - -class XXX3SpecEval: - """ - Motivation case: - - We need to tune hyperparameters. - - Test workflow: - - Input: - - Build train and valid data - - test == None - - Hyperparameters are not blank - - Output: - - The early stop hyperparameters must be returned - """ - - -class XXX4SpecEval: - """ - Motivation case: - - After obtaining good hyperparameters, we retrain the model. - - Test workflow: - - Test1: Since we have already tested it in XXX2SpecEval, we'll focus on another aspect. 
- - Input: - - Build train and test data - - valid == None - - Previous good hyperparameters (a parameter representing early stop) - - Test2: Ensure the hyperparameters are 1) being used, and 2) the model remains stable. - - Different hyperparameters will yield different results - - Same hyperparameters will yield the same results - """ diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 08c8d025d..382cbb580 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -35,7 +35,7 @@ model_coder: {% if out_spec %} {{out_spec}} - The file name should be the model name described in the model task + The file name should be the model name described in the model task in the format "{task_name}.py". You should always follow this name format. {% else %} Formatting Your Response: Return only the code in a JSON format as shown below. Do not include any explanations or extra text. Example: @@ -78,7 +78,11 @@ model_coder: user_general: |- --------- Workspace code--------- + {% if workspace_code|length == 0 %} + So far the workspace is empty. No model code has been implemented yet. + {% else %} {{ workspace_code }} + {% endif %} ---------Model Specification--------- When you are implementing the code, you should follow the spec {{ model_spec }} diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 39195129b..48b249eee 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -21,6 +21,7 @@ spec: Your specification should consists two parts: 1. The function definition in code format with detailed annotation to each parameter and return value. 2. A detailed docstring to the function that explains the purpose of the function, the input parameters, and the output. + 3. Additional information or notes that the coder should consider while implementing the function. Your specifications should not include any code implementation, only the function definition and docstring. -----------Competition Information----------- @@ -112,6 +113,8 @@ spec: 4. Notes: - Align `DT` (data type) definitions with those in the Data Loader specification. - Extend or adjust domain-specific transformations based on competition requirements. + - The device has GPU support, so you can use it for feature engineering if necessary to accelerate the process. + - Multi processing or parallel processing can be used to speed up the feature engineering process. {% if latest_spec %} 5. Former Specification: @@ -159,6 +162,7 @@ spec: 4. Notes: - Align `DT` (data type) with the definitions used in Feature Engineering specifications. + - The device has GPU support, so you can use it for training if necessary to accelerate the process. {% if latest_spec %} 5. Former Specification: diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index e7da0ed38..855c545dc 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -43,13 +43,15 @@ def implement_one_task( # 2. 
code system_prompt = T(".prompts:workflow_coder.system").r( + task_desc=workflow_information_str, + competition_info=self.scen.get_competition_full_desc(), queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], ) user_prompt = T(".prompts:workflow_coder.user").r( load_data_code=workspace.file_dict["load_data.py"], feature_code=workspace.file_dict["feature.py"], - model_codes=workspace.get_codes(r'^model_(?!test)\w+\.py$'), + model_codes=workspace.get_codes(r"^model_(?!test)\w+\.py$"), ensemble_code=workspace.file_dict["ensemble.py"], latest_code=workspace.file_dict.get("main.py"), workflow_spec=workspace.file_dict["spec/workflow.md"], diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 8be9216b2..f31a9b4e4 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -2,6 +2,12 @@ workflow_coder: system: |- You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + + Your specific task is as follows: + {{task_desc}} + + Your current competition information is as follows: + {{ competition_info }} The user has written different Python functions that can load and preprocess data, execute feature engineering, train models, and ensemble them. diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index b8d7bd905..73240025c 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -103,13 +103,14 @@ def llm_task_gen( return resp_dict def gen(self, trace: DSTrace) -> DSExperiment: + scenario_desc = trace.scen.get_scenario_all_desc() + sota_exp = trace.sota_experiment() + if len(trace.hist) == 0: next_component = "DataLoadSpec" else: - next_component = trace.hist[-1][0].next_component_required() + next_component = sota_exp.next_component_required() - scenario_desc = trace.scen.get_scenario_all_desc() - sota_exp = trace.sota_experiment() if next_component == "DataLoadSpec": resp_dict = self.llm_task_gen( targets="Data loader and specification generation", From 84d4891c6cec2a493b378738f322a7d3d673b10a Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Tue, 7 Jan 2025 07:41:10 +0000 Subject: [PATCH 198/304] refine the logic of progress bar filter --- .../coder/data_science/model/eval.py | 4 ++ rdagent/utils/__init__.py | 67 +++++++++++++------ rdagent/utils/prompts.yaml | 24 ++++--- 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index ab967836b..2440e19c7 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -18,6 +18,7 @@ from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.core.exception import CoderError DIRNAME = Path(__file__).absolute().resolve().parent ModelSingleFeedback = CoSTEERSingleFeedback @@ -74,6 +75,9 @@ def evaluate( # Filter out progress bars from stdout 
using regex filtered_stdout = filter_progress_bar(stdout) + if filtered_stdout is None: + raise CoderError("The execution output contains too many progress bars and results in the LLM's token size exceeding the limit.") + system_prompt = T(".prompts:model_eval.system").r( test_code=test_code, scenario=self.scen.get_scenario_all_desc(), diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py index e0b150e82..408b4c4bf 100644 --- a/rdagent/utils/__init__.py +++ b/rdagent/utils/__init__.py @@ -13,6 +13,7 @@ from types import ModuleType from typing import Union +from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T @@ -76,50 +77,72 @@ def filter_progress_bar(stdout: str) -> str: r"\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step|" r"\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step.*|" r"\d+/\d+\s+[━]+.*?\u0008+|" - r"\d+/\d+\s+[━]+.*|[ ]*\u0008+)" + r"\d+/\d+\s+[━]+.*|[ ]*\u0008+|" + r"\d+%\|[█▏▎▍▌▋▊▉]+\s+\|\s+\d+/\d+\s+\[\d{2}:\d{2}<\d{2}:\d{2},\s+\d+\.\d+it/s\]|" + r"\d+%\|[█]+\|\s+\d+/\d+\s+\[\d{2}:\d{2}<\d{2}:\d{2},\s*\d+\.\d+it/s\])" ) filtered_stdout = remove_ansi_codes(stdout) filtered_stdout = re.sub(progress_bar_re, "", filtered_stdout) - filtered_stdout = re.sub(r'\s*\n\s*', '\n', filtered_stdout) + filtered_stdout = re.sub(r"\s*\n\s*", "\n", filtered_stdout) # Check if progress bars are already filtered system_prompt = T(".prompts:if_filtered.system").r() user_prompt = T(".prompts:if_filtered.user").r( filtered_stdout=filtered_stdout, ) - if_filtered_stdout = json.loads( - APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) - ).get("progress bar filtered", False) - - if convert2bool(if_filtered_stdout): + stdout_token_size = APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + if stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.1: return filtered_stdout + elif stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.8: + if_filtered_stdout = json.loads( + APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + ).get("progress bar filtered", False) + if convert2bool(if_filtered_stdout): + return filtered_stdout + + filtered_stdout_shortened = filtered_stdout + needs_sub = True # Attempt further filtering up to 5 times for _ in range(5): system_prompt = T(".prompts:filter_progress_bar.system").r() user_prompt = T(".prompts:filter_progress_bar.user").r( - stdout=filtered_stdout, + stdout=filtered_stdout_shortened, + ) + + stdout_token_size = APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, ) + if stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.1: + return filtered_stdout_shortened + elif stdout_token_size > LLM_SETTINGS.chat_token_limit * 0.8: + filtered_stdout_shortened = filtered_stdout[len(filtered_stdout) // 4 : len(filtered_stdout) * 3 // 4] - new_pattern = json.loads( + response = json.loads( APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) - )["regex pattern"] - - filtered_stdout = re.sub(new_pattern, "", filtered_stdout) - filtered_stdout = re.sub(r'\s*\n\s*', '\n', filtered_stdout) - - system_prompt = T(".prompts:if_filtered.system").r() - user_prompt = T(".prompts:if_filtered.user").r( - filtered_stdout=filtered_stdout, ) - if_filtered_stdout = json.loads( - APIBackend().build_messages_and_create_chat_completion(user_prompt, 
system_prompt, json_mode=True) - ).get("progress bar filtered", False) + needs_sub = response.get("needs_sub", True) + regex_patterns = response.get("regex patterns", []) + if isinstance(regex_patterns, list): + for pattern in regex_patterns: + filtered_stdout = re.sub(pattern, "", filtered_stdout) + else: + filtered_stdout = re.sub(regex_patterns, "", filtered_stdout) - if convert2bool(if_filtered_stdout): + if not needs_sub: break + filtered_stdout = re.sub(regex_patterns, "", filtered_stdout) + filtered_stdout = re.sub(r"\s*\n\s*", "\n", filtered_stdout) + + filtered_stdout_shortened = filtered_stdout - return filtered_stdout \ No newline at end of file + if needs_sub: + return None + return filtered_stdout diff --git a/rdagent/utils/prompts.yaml b/rdagent/utils/prompts.yaml index fd3786586..77341ffaa 100644 --- a/rdagent/utils/prompts.yaml +++ b/rdagent/utils/prompts.yaml @@ -1,19 +1,21 @@ filter_progress_bar: system: | - You are an assistant helping to filter progress bars from a given text. Generate a regex pattern that matches typical progress bar formats, including those with percentages, loading bars (e.g., `====>`), or dynamic text such as elapsed time or remaining time. Be flexible to handle variations. - user: | - The following text contains stdout with progress bars: - - {{ stdout }} - - Generate a regex pattern to filter out progress bars from this stdout text. Focus on identifying and removing lines or segments that represent progress bars while preserving the rest of the content. + You are an assistant helping to analyze and filter progress bars from a given text. Evaluate the text to determine if progress bar patterns are present and, if so, generate a list of regex patterns to remove them. + Additionally, indicate whether substitution is needed. If the input exceeds a token limit, the system will provide only a shortened portion of the text. + Note: You can keep metrics or logs, such as `val_accuracy`, `val_loss`, and similar entries, are retained and not filtered. - Please respond with the regex pattern in the following JSON format and order: + Respond in the following JSON format and order: ```json { - "regex pattern": "The regex pattern to filter out progress bars." + "needs_sub": , + "regex patterns": ["regex pattern 1", "regex pattern 2", ...] } - ``` + user: | + The following text contains stdout: + + {{ stdout }} + + Check if the text contains progress bar patterns. If patterns are found, provide a list of regex patterns to filter them. Otherwise, indicate that substitution is not needed. if_filtered: system: | @@ -30,4 +32,4 @@ if_filtered: { "progress bar filtered": } - ``` + ``` \ No newline at end of file From 3e58cb5d0b02400ec39938fb5d87a03d6a0f19d0 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 7 Jan 2025 07:52:09 +0000 Subject: [PATCH 199/304] add last_successful_exp in exp_gen --- .../data_science/proposal/exp_gen.py | 73 ++++++++++++------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 1ff12c94b..89a70651f 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -66,7 +66,7 @@ def sota_experiment(self, last_n: int = -1) -> Experiment | None: Parameters ---------- last_n : int - The index from the last experiment result to access. + The index from the last experiment result to access. Use -1 for the most recent experiment, -2 for the second most recent, and so on. 
Returns @@ -83,6 +83,15 @@ def sota_experiment(self, last_n: int = -1) -> Experiment | None: return exp return None + def last_successful_exp(self) -> Experiment | None: + """ + Access the last successful experiment even part of the components are not completed. + """ + for exp, ef in self.hist[::-1]: + if ef.decision: + return exp + return None + class DSExpGen(ExpGen): """Data Science Task Generator.""" @@ -121,12 +130,12 @@ def llm_task_gen( def gen(self, trace: DSTrace) -> DSExperiment: scenario_desc = trace.scen.get_scenario_all_desc() - sota_exp = trace.sota_experiment() + last_successful_exp = trace.last_successful_exp() - if len(trace.hist) == 0: + if len(trace.hist) == 0 or last_successful_exp is None: next_component = "DataLoadSpec" else: - next_component = sota_exp.next_component_required() + next_component = last_successful_exp.next_component_required() if next_component == "DataLoadSpec": resp_dict = self.llm_task_gen( @@ -147,7 +156,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Feature Engineering", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/feature.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/feature.md"], task_output_format=T(".prompts:output_format.feature").r(), ) @@ -156,13 +165,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: description=resp_dict.get("description", "Factor description not provided"), ) exp = DSExperiment(sub_tasks=[ft], hypothesis=DSHypothesis("FeatureEng")) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) return exp elif next_component == "Model": resp_dict = self.llm_task_gen( targets="Models", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/model.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/model.md"], task_output_format=T(".prompts:output_format.model").r(), ) @@ -174,13 +183,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), ) exp = DSExperiment(sub_tasks=[mt], hypothesis=DSHypothesis("Model")) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) return exp elif next_component == "Ensemble": resp_dict = self.llm_task_gen( targets="Ensemble", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/ensemble.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/ensemble.md"], task_output_format=T(".prompts:output_format.ensemble").r(), ) @@ -195,7 +204,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Workflow", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/workflow.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/workflow.md"], task_output_format=T(".prompts:output_format.workflow").r(), ) @@ -204,10 +213,10 @@ def gen(self, trace: DSTrace) -> DSExperiment: description=resp_dict.get("description", "Workflow description not provided"), ) exp = DSExperiment(sub_tasks=[wt], hypothesis=DSHypothesis("Workflow")) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + 
exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) return exp else: # propose new component by LLM - assert sota_exp is not None, "SOTA experiment is not provided." + assert last_successful_exp is not None, "SOTA experiment is not provided." # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) @@ -266,11 +275,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) else: model_infos = [] - score_df = pd.read_csv(sota_exp.experiment_workspace.workspace_path / "score.csv", index_col=0) + score_df = pd.read_csv( + last_successful_exp.experiment_workspace.workspace_path / "score.csv", index_col=0 + ) metric_name = score_df.columns[0] - for fname in sota_exp.experiment_workspace.file_dict: + for fname in last_successful_exp.experiment_workspace.file_dict: if re.match(r"^model_.+\.py", fname): - model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{sota_exp.experiment_workspace.file_dict[fname]}\n```\n" + model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) model_num = len(model_infos) @@ -317,7 +328,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: resp_dict = self.llm_task_gen( targets="Data loader and specification generation", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/data_loader.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/data_loader.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.data_loader").r(), hypothesis_and_feedback=hypothesis_and_feedback, @@ -331,14 +342,16 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) exp = DSExperiment(sub_tasks=[dt], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder( + last_successful_exp.experiment_workspace.workspace_path + ) return exp elif hypothesis.component == "FeatureEng": # TODO: RAG resp_dict = self.llm_task_gen( targets="Feature Engineering", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/feature.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/feature.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.feature").r(), hypothesis_and_feedback=hypothesis_and_feedback, @@ -350,14 +363,16 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) exp = DSExperiment(sub_tasks=[ft], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder( + sota_last_successful_expexp.experiment_workspace.workspace_path + ) return exp elif hypothesis.component == "Model": resp_dict = self.llm_task_gen( scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/model.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/model.md"], hypothesis=hypothesis, - workspace_code=sota_exp.experiment_workspace.all_codes, + workspace_code=last_successful_exp.experiment_workspace.all_codes, task_output_format=T(".prompts:output_format.model").r(), hypothesis_and_feedback=hypothesis_and_feedback, ) @@ -372,13 +387,15 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) exp = DSExperiment(sub_tasks=[mt], hypothesis=hypothesis) - 
exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder( + last_successful_exp.experiment_workspace.workspace_path + ) return exp elif hypothesis.component == "Ensemble": resp_dict = self.llm_task_gen( targets="Ensemble", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/ensemble.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/ensemble.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.ensemble").r(), hypothesis_and_feedback=hypothesis_and_feedback, @@ -390,13 +407,15 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) exp = DSExperiment(sub_tasks=[et], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder( + last_successful_exp.experiment_workspace.workspace_path + ) return exp elif hypothesis.component == "Workflow": resp_dict = self.llm_task_gen( targets="Workflow", scenario_desc=scenario_desc, - spec=sota_exp.experiment_workspace.file_dict["spec/workflow.md"], + spec=last_successful_exp.experiment_workspace.file_dict["spec/workflow.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.workflow").r(), hypothesis_and_feedback=hypothesis_and_feedback, @@ -408,7 +427,9 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) exp = DSExperiment(sub_tasks=[wt], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder( + last_successful_exp.experiment_workspace.workspace_path + ) return exp return super().gen(trace) From 301b0c025b0c0236815d449d65dbd97fe1344c22 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 7 Jan 2025 07:58:15 +0000 Subject: [PATCH 200/304] fix a one line bug --- rdagent/scenarios/data_science/proposal/exp_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 89a70651f..7a819134b 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -198,7 +198,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: description=resp_dict.get("description", "Ensemble description not provided"), ) exp = DSExperiment(sub_tasks=[et], hypothesis=DSHypothesis("Ensemble")) - exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) return exp elif next_component == "Workflow": resp_dict = self.llm_task_gen( From 29e71496d25c563ae7f84893dbad3a848c0a71e6 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 7 Jan 2025 08:25:12 +0000 Subject: [PATCH 201/304] add a hint in prompt --- .../coder/data_science/workflow/eval.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 9567f19fb..253188fa0 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -1,7 +1,8 @@ import json +import re from pathlib import Path + import pandas as pd -import re from rdagent.app.data_science.conf import DS_RD_SETTING from 
rdagent.components.coder.CoSTEER.evaluators import ( @@ -54,7 +55,9 @@ def evaluate( final_decision=False, ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } de = DockerEnv(conf=ds_docker_conf) fname = "main.py" stdout = implementation.execute(env=de, entry=f"python {fname}") @@ -62,25 +65,26 @@ def evaluate( # Check score file score_fp = implementation.workspace_path / "scores.csv" if not score_fp.exists(): - stdout += "Metrics file (scores.csv) is not generated." + stdout += "\nMetrics file (scores.csv) is not generated." else: score_df = pd.read_csv(score_fp, index_col=0) model_set_in_scores = set(score_df.index) - model_set_in_folder = set(f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f)) + model_set_in_folder = set( + f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f) and "test" not in f + ) if model_set_in_scores != model_set_in_folder: - stdout += "The models used by ensemble are not consistent with the models in the workspace." + stdout += f"\nThe models used by ensemble are not consistent with the models in the workspace.\nThe model names in the score.csv are {model_set_in_scores}, while the model names in the workspace are {model_set_in_folder}." # Check submission file submission_fp = implementation.workspace_path / "submission.csv" if not submission_fp.exists(): - stdout += "Submission file (submission.csv) is not generated." + stdout += "\nSubmission file (submission.csv) is not generated." if stdout is None: - stdout = "The execution exceeded the time limit." + stdout = "\nThe execution exceeded the time limit." 
system_prompt = T(".prompts:workflow_eval.system").r( - scenario=self.scen.get_scenario_all_desc(), - spec=implementation.file_dict["spec/workflow.md"] + scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/workflow.md"] ) user_prompt = T(".prompts:workflow_eval.user").r( stdout=stdout, From 7e3d7741e6489505d5b7928584ed910535963e74 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 7 Jan 2025 09:17:02 +0000 Subject: [PATCH 202/304] fix data sample for bms --- rdagent/scenarios/data_science/debug/data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index fa87eba8d..c1be5d14e 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -128,7 +128,7 @@ def create_debug_data( dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path # FIXME: don't hardcode this KAGGLE_IMPLEMENT_SETTING if sample_path is None: - sample_path = Path(dataset_path) / "sample" + sample_path = Path(dataset_path) / "sample1" data_folder = Path(dataset_path) / competition sample_folder = Path(sample_path) / competition @@ -186,16 +186,17 @@ def create_debug_data( subfolder_dict.setdefault(rel_dir, []).append(file_path) # For each subfolder, decide which files to copy + cnt =0 for rel_dir, file_list in subfolder_dict.items(): used_files = [] not_used_files = [] # Check if each file is in the "used" list for fp in file_list: - # If your logic is only about the file's name: - # if fp.name in sample_used_file_names: - if str(fp.name) in sample_used_file_names or str(fp) in sample_used_file_names: + if str(fp.name) in sample_used_file_names or str(fp.stem) in sample_used_file_names: used_files.append(fp) + print(f"{cnt} Copying {fp} to used_files") + cnt += 1 else: not_used_files.append(fp) From 67fdcebe3dcfdc4ca078974df0601301ce777f69 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 7 Jan 2025 09:31:38 +0000 Subject: [PATCH 203/304] fix data sample for bms --- rdagent/scenarios/data_science/debug/data.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index c1be5d14e..3625a57ef 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -128,7 +128,7 @@ def create_debug_data( dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path # FIXME: don't hardcode this KAGGLE_IMPLEMENT_SETTING if sample_path is None: - sample_path = Path(dataset_path) / "sample1" + sample_path = Path(dataset_path) / "sample" data_folder = Path(dataset_path) / competition sample_folder = Path(sample_path) / competition @@ -186,7 +186,6 @@ def create_debug_data( subfolder_dict.setdefault(rel_dir, []).append(file_path) # For each subfolder, decide which files to copy - cnt =0 for rel_dir, file_list in subfolder_dict.items(): used_files = [] not_used_files = [] @@ -195,8 +194,6 @@ def create_debug_data( for fp in file_list: if str(fp.name) in sample_used_file_names or str(fp.stem) in sample_used_file_names: used_files.append(fp) - print(f"{cnt} Copying {fp} to used_files") - cnt += 1 else: not_used_files.append(fp) From b35d7fba6159e66dddea4aeac663bc55784283b2 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 7 Jan 2025 09:34:23 +0000 Subject: [PATCH 204/304] hypothesis small fix --- rdagent/scenarios/data_science/proposal/exp_gen.py | 7 +++---- .../scenarios/data_science/proposal/prompts.yaml | 13 
++++++++++--- rdagent/scenarios/kaggle/prompts.yaml | 4 ++-- rdagent/scenarios/kaggle/proposal/proposal.py | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 7a819134b..c7fa88d9e 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -219,7 +219,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: assert last_successful_exp is not None, "SOTA experiment is not provided." # base info - hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(trace=trace) + hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(hist=[i for i in trace.hist[-10:] if isinstance(i[1], HypothesisFeedback)]) # Step 1: Generate component sota_solution = "" component_sys_prompt = T(".prompts:component_gen").r( @@ -230,8 +230,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) component_user_prompt = T(".prompts:hypothesis_gen.user").r( - targets="data science project", - hypothesis_and_feedback=hypothesis_and_feedback, + feedback=hypothesis_and_feedback, ) resp_dict_component: dict = json.loads( @@ -242,7 +241,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: component = resp_dict_component.get("component", "Component not provided") - # Why we should split component selection and hpothesis generation + # Why we should split component selection and hypothesis generation # - after we know the selected component, we can use RAG. # Step 2: Generate the rest of the hypothesis diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 63f1a62e8..edd4b62c9 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -48,6 +48,13 @@ hypothesis_model: Please generate the output using the following format and specifications: {{ hypothesis_output_format }} +hypothesis_and_feedback: |- + {% for experiment, feedback in hist %} + Hypothesis {{ loop.index }} + Observation on the result with the hypothesis: {{ feedback.observations }} + Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} + Did changing to this hypothesis work? (focus on the change): {{ feedback.decision }} + {% endfor %} task_gen: system: |- @@ -129,7 +136,7 @@ component_gen: system: |- You are a Kaggle Grander Master. You are going to provide a solution for a kaggle competition. - Here is the description of the competion scenario + Here is the description of the competition scenario ``` {{scenario}} ``` @@ -140,7 +147,7 @@ component_gen: ``` - You will be proivded the feedback for the latest implementation. + You will be provided the feedback for the latest implementation. Please select the component you are going to improve the latest implementation. @@ -178,7 +185,7 @@ output_format: "hypothesis": "The new hypothesis generated based on the information provided.", "reason": "The reason why you generate this hypothesis. It should be comprehensive and logical. It should cover the other keys below and extend them.", "concise_reason": "Two-line summary. First line focuses on a concise justification for the change. Second line generalizes a knowledge statement.", - "concise_observation": "One line summary. It focuses on the observation of the given scenario, data characteristics, or previous experiences (failures & succeses).", + "concise_observation": "One line summary. 
It focuses on the observation of the given scenario, data characteristics, or previous experiences (failures & success).", "concise_justification": "One line summary. Justify the hypothesis based on theoretical principles or initial assumptions.", "concise_knowledge": "One line summary. Transferable knowledge based on theoretical principles. Use conditional grammar. eg. "If...., ..; When..., .; and etc" Make sure that you state things clearly without ambiguity. Eg. avoid saying "previous hypothesis", because one wouldn't know what that is." } diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index c73ae099f..79ee84b31 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -225,7 +225,7 @@ model_tuning_feedback_generation: Your feedback should: 1. Confirm if the current result supports or refutes the hypothesis. 2. Compare with previous best results. - 3. Suggest improvements or new directions. Stay innovative and adapative. + 3. Suggest improvements or new directions. Stay innovative and adaptive. Please provide detailed and constructive feedback. Note that as hypothesis evolve, a general trend should be that the model grows larger. Example JSON Structure for Result Analysis: @@ -343,7 +343,7 @@ gen_knowledge_from_code_mini_case: You were a proficient data scientist. user: |- The following notebook (contain markdown part and code part) is a high-performing solution for a kaggle competition. - Please answer the following questions one by one and **as detailedly as possible**. + Please answer the following questions one by one and **as detailed as possible**. Make sure that another data scientist can exactly reproduce this copy of code based on your answer. Focus on the training process. diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 888c57af2..d1072d6dc 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -204,7 +204,7 @@ class KGHypothesisGen(FactorAndModelHypothesisGen): .. code-block:: python class KGHypothesisGen(ModelHypothesisGen): - prompts: Prompts = a_specifc_prompt_dict + prompts: Prompts = a_specific_prompt_dict """ def __init__(self, scen: Scenario) -> Tuple[dict, bool]: From c86bbd908f2f25c94d99b9c25cb00f628a02235b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 8 Jan 2025 02:26:02 +0000 Subject: [PATCH 205/304] crawler readme update --- rdagent/scenarios/kaggle/README.md | 12 +++++++++++- rdagent/scenarios/kaggle/kaggle_crawler.py | 17 +++-------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/rdagent/scenarios/kaggle/README.md b/rdagent/scenarios/kaggle/README.md index 940ac92dc..2ba601401 100644 --- a/rdagent/scenarios/kaggle/README.md +++ b/rdagent/scenarios/kaggle/README.md @@ -17,4 +17,14 @@ sudo mv chromedriver /usr/local/bin sudo chmod +x /usr/local/bin/chromedriver chromedriver --version -``` \ No newline at end of file +``` + +## config + +1. authentication: `~/.kaggle/kaggle.json` +2. Accept Rules in competition website. (Join Competition) + +## notebook crawler + +1. `download_notebooks()` +2. 
`convert_notebooks_to_text()` \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 3a6a62937..a4d63addc 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -23,7 +23,7 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.data_science.debug.data import create_debug_data from rdagent.utils.env import MLEBDockerEnv - +from rdagent.utils.agent.tpl import T # %% options = webdriver.ChromeOptions() options.add_argument("--no-sandbox") @@ -227,19 +227,8 @@ def download_notebooks( def notebook_to_knowledge(notebook_text: str) -> str: - prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - - sys_prompt = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["gen_knowledge_from_code_mini_case"]["system"]) - .render() - ) - - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["gen_knowledge_from_code_mini_case"]["user"]) - .render(notebook=notebook_text) - ) + sys_prompt = T(".prompts:gen_knowledge_from_code_mini_case.system").r() + user_prompt = T(".prompts:gen_knowledge_from_code_mini_case.user").r(notebook=notebook_text) response = APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, From 2018bc82d898d51f9ab78d90a67241213d553b38 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 8 Jan 2025 03:02:35 +0000 Subject: [PATCH 206/304] fix component gen --- .../data_science/proposal/exp_gen.py | 20 +++++++++---------- .../data_science/proposal/prompts.yaml | 5 ++++- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index c7fa88d9e..2c376ed17 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -221,17 +221,16 @@ def gen(self, trace: DSTrace) -> DSExperiment: # base info hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(hist=[i for i in trace.hist[-10:] if isinstance(i[1], HypothesisFeedback)]) # Step 1: Generate component - sota_solution = "" - component_sys_prompt = T(".prompts:component_gen").r( - targets="data science project", - scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.component").r(), - hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - ) + sota_solution = trace.sota_experiment() + component_sys_prompt = T(".prompts:component_gen.system").r( + scenario=scenario_desc, + implementation=last_successful_exp.experiment_workspace.all_codes, + component_output_format=T(".prompts:output_format.component").r(), + ) - component_user_prompt = T(".prompts:hypothesis_gen.user").r( - feedback=hypothesis_and_feedback, - ) + component_user_prompt = T(".prompts:component_gen.user").r( + feedback=hypothesis_and_feedback, + ) resp_dict_component: dict = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -240,7 +239,6 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) component = resp_dict_component.get("component", "Component not provided") - # Why we should split component selection and hypothesis generation # - after we know the selected component, we can use RAG. 
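The exp_gen change above splits proposal into two LLM calls: the first only decides which component (data loading, feature engineering, model, ensemble, or workflow) to improve, and the second writes the hypothesis for that component, which is what makes component-specific retrieval possible in between. Below is a minimal, self-contained sketch of that selection step; `ask_llm`, its hard-coded reply, and the fallback default are hypothetical stand-ins, not the repository's actual `APIBackend` interface.

```python
import json

# Components a proposal can target; mirrors the set used by the data-science loop above.
COMPONENTS = ("DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow")


def ask_llm(system_prompt: str, user_prompt: str) -> str:
    """Hypothetical stand-in for the chat-completion backend; returns a JSON string."""
    return json.dumps({"component": "Model"})


def select_component(scenario_desc: str, sota_code: str, feedback: str) -> str:
    """Step 1: ask only for the component to improve, so step 2 (hypothesis writing)
    can first retrieve component-specific knowledge."""
    system_prompt = f"Scenario:\n{scenario_desc}\n\nCurrent best implementation:\n{sota_code}"
    user_prompt = f"Feedback on the latest run:\n{feedback}\nAnswer as JSON with a 'component' key."
    resp = json.loads(ask_llm(system_prompt, user_prompt))
    component = resp.get("component", "Component not provided")
    # Guard against free-form answers: fall back to a safe default when the reply
    # is not one of the known components.
    return component if component in COMPONENTS else "Model"


if __name__ == "__main__":
    print(select_component("Tabular binary classification", "# main.py ...", "Validation AUC dropped."))
```

Validating the reply against a closed set keeps the downstream dispatch (`if hypothesis.component == ...`) from ever seeing an unknown label.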
diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index edd4b62c9..02f82ff75 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -143,7 +143,7 @@ component_gen: Here is the latest version of implementation. ``` - {{}} + {{implementation}} ``` @@ -151,6 +151,9 @@ component_gen: Please select the component you are going to improve the latest implementation. + Please generate the output following the format below: + {{ component_output_format }} + user: |- {{feedback}} From 40faf5aabf28be2487c45dd3cad33a58395a1a28 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 8 Jan 2025 03:04:47 +0000 Subject: [PATCH 207/304] fix bug --- rdagent/scenarios/data_science/proposal/exp_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index c7fa88d9e..7484939fd 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -363,7 +363,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: exp = DSExperiment(sub_tasks=[ft], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder( - sota_last_successful_expexp.experiment_workspace.workspace_path + last_successful_exp.experiment_workspace.workspace_path ) return exp elif hypothesis.component == "Model": From 7cbf7a3b91ff44962f64bf2392414004eb90ec6e Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 8 Jan 2025 03:13:39 +0000 Subject: [PATCH 208/304] annotation change --- rdagent/scenarios/data_science/proposal/exp_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index d1014cd03..6bbb96c28 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -59,7 +59,7 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = self.hist: list[tuple[DSExperiment, ExperimentFeedback]] = [] self.knowledge_base = knowledge_base - def sota_experiment(self, last_n: int = -1) -> Experiment | None: + def sota_experiment(self, last_n: int = -1) -> DSExperiment | None: """ Access the last experiment result. @@ -83,7 +83,7 @@ def sota_experiment(self, last_n: int = -1) -> Experiment | None: return exp return None - def last_successful_exp(self) -> Experiment | None: + def last_successful_exp(self) -> DSExperiment | None: """ Access the last successful experiment even part of the components are not completed. 
""" From 3ce82421aaac4d1c5658e959b879d51438c06d84 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Wed, 8 Jan 2025 04:06:51 +0000 Subject: [PATCH 209/304] load description.md if it exists --- rdagent/scenarios/kaggle/kaggle_crawler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index a4d63addc..64aa8fbb9 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -22,8 +22,9 @@ from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.data_science.debug.data import create_debug_data -from rdagent.utils.env import MLEBDockerEnv from rdagent.utils.agent.tpl import T +from rdagent.utils.env import MLEBDockerEnv + # %% options = webdriver.ChromeOptions() options.add_argument("--no-sandbox") @@ -36,6 +37,10 @@ def crawl_descriptions( competition: str, local_data_path: str, wait: float = 3.0, force: bool = False ) -> dict[str, str]: + if (fp := Path(f"{local_data_path}/{competition}/description.md")).exists() and not force: + logger.info(f"Found {competition}/description.md, loading from it.") + return fp.read_text() + if (fp := Path(f"{local_data_path}/{competition}.json")).exists() and not force: logger.info(f"Found {competition}.json, loading from local file.") with fp.open("r") as f: From ae29d1ddc8c9fe8dd6f927e892376c07c72c7f8c Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 8 Jan 2025 06:04:12 +0000 Subject: [PATCH 210/304] refactor: Simplify SOTA description handling in feedback and prompts --- .../scenarios/data_science/dev/feedback.py | 13 ++---------- .../scenarios/data_science/dev/prompts.yaml | 21 +++---------------- .../data_science/proposal/exp_gen.py | 8 +++++++ .../data_science/proposal/prompts.yaml | 2 -- 4 files changed, 13 insertions(+), 31 deletions(-) diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index 9e39ad5da..a50e73028 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -83,7 +83,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed # 4. result 任务的结果 # 5. sota_exp.result 之前最好的结果 sota_exp = trace.sota_experiment() - current_results = exp.result + sota_desc = T("scenarios.data_science.share:describe.exp").r(exp=sota_exp, heading="SOTA of previous exploration of the scenario") # TODO: # - Should we choose between the diff from last experiment or last sota ? @@ -98,17 +98,10 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed else: diff_edition = [] - sota_exp = trace.sota_experiment() # assumption: # The feedback should focus on experiment **improving**. 
# Assume that all the the sota exp is based on the previous sota experiment - if sota_exp: - sota_codes = sota_exp.experiment_workspace.all_codes - sota_results = sota_exp.result - else: - sota_codes = None - sota_results = None last_exp_and_feedback = None if trace.hist and len(trace.hist) > 0: @@ -116,11 +109,9 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed system_prompt = T(".prompts:exp_feedback.system").r(scenario=self.scen.get_scenario_all_desc()) user_prompt = T(".prompts:exp_feedback.user").r( - sota_codes=sota_codes, - sota_results=sota_results, + sota_desc=sota_desc, cur_exp=exp, diff_edition=diff_edition, - current_results=current_results, last_exp_and_feedback=last_exp_and_feedback, ) diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml index fada2d1fb..54b71aa76 100644 --- a/rdagent/scenarios/data_science/dev/prompts.yaml +++ b/rdagent/scenarios/data_science/dev/prompts.yaml @@ -24,22 +24,7 @@ exp_feedback: user: |- We are in a process of finding and validating hypotheses to build powerful codes. Each round aims to confirm or reject hypotheses based on results. - ## SOTA of previous exploration of the scenario - {% if sota_codes %} - ### Code - The improvement of sota results is - {{sota_codes}} - - ### According results - {% if sota_results %} - The SOTA results is: - {{ sota_results }} - {% else %} - There are no previous results - {% endif %} - {% else %} - We have no previous complete complete results. - {% endif %} + {{ sota_desc }} ## Current solution Current solution to be evaluated: @@ -58,8 +43,8 @@ exp_feedback: {{ de }} {% endfor %} - Final results of the current solution: {{ current_results }} - ### Code of current solution + Final results of the current solution: {{ cur_exp.result }} + ### Complete Code of current solution {{cur_exp.experiment_workspace.all_codes}} {% if last_exp_and_feedback and last_exp_and_feedback|length > 1 %} diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 6bbb96c28..59d871dc7 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -216,6 +216,14 @@ def gen(self, trace: DSTrace) -> DSExperiment: exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) return exp else: # propose new component by LLM + # Guidelines: + # System prompts: Shared condition you are facing + # - scenario description: `scenario_desc` + # - expected output format + # User prompts: Task Specific information + # - Previous Feedback + # - Current sota implementation (encourage change based on it) + # - Extra RAG assert last_successful_exp is not None, "SOTA experiment is not provided." # base info diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 02f82ff75..61ac036aa 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -144,7 +144,6 @@ component_gen: Here is the latest version of implementation. ``` {{implementation}} - ``` You will be provided the feedback for the latest implementation. 
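The feedback refactor in this patch routes both the SOTA description and the previous-trial feedback through shared Jinja templates (`describe.exp`, `describe.feedback`) instead of hand-building the strings in Python. A trimmed-down sketch of how such a template behaves, assuming plain `jinja2` rather than the repository's `T(...).r(...)` helper, and with a made-up experiment dict standing in for `DSExperiment`:

```python
from jinja2 import Template

# Cut-down analogue of the `describe.exp` template: a default heading plus a
# graceful branch when no complete experiment exists yet.
EXP_TEMPLATE = Template(
    "## {{ heading | default('Best solution of previous exploration of the scenario') }}\n"
    "{% if exp %}"
    "### Code\n{{ exp['code'] }}\n"
    "### Results\n{{ exp['result'] if exp['result'] is not none else 'There are no according evaluation results' }}\n"
    "{% else %}"
    "No previous complete experiment available.\n"
    "{% endif %}"
)

# With an experiment: the heading is overridden and results are included.
print(EXP_TEMPLATE.render(exp={"code": "print('model')", "result": 0.83},
                          heading="SOTA of previous exploration of the scenario"))

# Without one: callers no longer need their own `if sota_exp is None` branches.
print(EXP_TEMPLATE.render(exp=None))
```

Centralising the wording in one template keeps the proposal and feedback prompts consistent whenever the description format changes.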
@@ -158,7 +157,6 @@ component_gen: {{feedback}} - exp_and_feedback: |- {% for experiment, feedback in trace.hist[-10:] %} ## Experiment {{ loop.index }} From 8492cd410abe6d87d44e6f0029c6be09d506abd1 Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 8 Jan 2025 06:07:53 +0000 Subject: [PATCH 211/304] refactor: Use shared templates for feedback and experiment descriptions --- .../scenarios/data_science/dev/feedback.py | 13 ++++--- .../scenarios/data_science/dev/prompts.yaml | 15 ++------ .../data_science/proposal/exp_gen.py | 26 ++++++++----- .../data_science/proposal/prompts.yaml | 18 +++++---- rdagent/scenarios/data_science/share.yaml | 38 +++++++++++++++++++ 5 files changed, 76 insertions(+), 34 deletions(-) create mode 100644 rdagent/scenarios/data_science/share.yaml diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index a50e73028..86e602943 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -85,6 +85,12 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed sota_exp = trace.sota_experiment() sota_desc = T("scenarios.data_science.share:describe.exp").r(exp=sota_exp, heading="SOTA of previous exploration of the scenario") + # Get feedback description using shared template + feedback_desc = T("scenarios.data_science.share:describe.feedback").r( + exp_and_feedback=(trace.hist[-1] if trace.hist else None), + heading="Previous Trial Feedback" + ) + # TODO: # - Should we choose between the diff from last experiment or last sota ? @@ -102,17 +108,12 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed # The feedback should focus on experiment **improving**. # Assume that all the the sota exp is based on the previous sota experiment - - last_exp_and_feedback = None - if trace.hist and len(trace.hist) > 0: - last_exp_and_feedback = (trace.hist[-1][0], trace.hist[-1][1]) - system_prompt = T(".prompts:exp_feedback.system").r(scenario=self.scen.get_scenario_all_desc()) user_prompt = T(".prompts:exp_feedback.user").r( sota_desc=sota_desc, cur_exp=exp, diff_edition=diff_edition, - last_exp_and_feedback=last_exp_and_feedback, + feedback_desc=feedback_desc, ) resp_dict = json.loads( diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml index 54b71aa76..37ea21d4c 100644 --- a/rdagent/scenarios/data_science/dev/prompts.yaml +++ b/rdagent/scenarios/data_science/dev/prompts.yaml @@ -9,7 +9,7 @@ exp_feedback: 1. Confirm if the current result supports or refutes the hypothesis. 2. Compare with previous best results. 3. Suggest improvements or new directions. Stay innovative and adapative. - 4. SOTA results are the best outcomes we have achieved in this scenario. If we do not have complete results (i.e., results that are runnable and can generate evaluation outcomes), please replace it as the best result. + 4. SOTA results are the best outcomes we have achieved in this scenario. If we do not have complete experiment available (i.e., results that are runnable and can generate evaluation outcomes), **please replace it as the best result/SOTA**. Please provide detailed and constructive feedback. 
Example JSON Structure for Result Analysis: @@ -47,17 +47,8 @@ exp_feedback: ### Complete Code of current solution {{cur_exp.experiment_workspace.all_codes}} - {% if last_exp_and_feedback and last_exp_and_feedback|length > 1 %} - ## Previous trial - Before current trial, previous recent trial is listed below. - {% if last_exp_and_feedback[0].hypothesis %} - the experiment is designed based on hypothesis: {{ last_exp_and_feedback[0].hypothesis }} - {% endif %} - ### Task of previous trial - {{ last_exp_and_feedback[0].sub_tasks[0].get_task_information() }} - feedback decision: {{ last_exp_and_feedback[1].decision }} - reason: {{ last_exp_and_feedback[1].reason }} - {% endif %} + {{feedback_desc}} + Please refer to these hypotheses and feedback to help you recommend new experiment and hypothesis Consider Changing Direction for Significant Gaps with the Best Result and the last round: diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 59d871dc7..2f717740c 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -225,20 +225,27 @@ def gen(self, trace: DSTrace) -> DSExperiment: # - Current sota implementation (encourage change based on it) # - Extra RAG assert last_successful_exp is not None, "SOTA experiment is not provided." + exp_and_feedback = trace.hist[-1] + last_exp = exp_and_feedback[0] - # base info - hypothesis_and_feedback = T(".prompts:hypothesis_and_feedback").r(hist=[i for i in trace.hist[-10:] if isinstance(i[1], HypothesisFeedback)]) # Step 1: Generate component + # Describe current best solution using shared template sota_solution = trace.sota_experiment() + sota_exp_desc = T("scenarios.data_science.share:describe.exp").r(exp=last_successful_exp, heading="Best of previous exploration of the scenario") + current_exp_desc = T("scenarios.data_science.share:describe.exp").r(exp=last_exp, heading="Current exploration of the scenario") + exp_and_feedback_desc = T("scenarios.data_science.share:describe.feedback").r(exp_and_feedback=exp_and_feedback) + + # Generate component using template with proper context component_sys_prompt = T(".prompts:component_gen.system").r( - scenario=scenario_desc, - implementation=last_successful_exp.experiment_workspace.all_codes, - component_output_format=T(".prompts:output_format.component").r(), - ) + scenario=scenario_desc, + sota_exp_desc=sota_exp_desc, + current_exp_desc=current_exp_desc, + component_output_format=T(".prompts:output_format.component").r(), + ) component_user_prompt = T(".prompts:component_gen.user").r( - feedback=hypothesis_and_feedback, - ) + exp_and_feedback_desc=exp_and_feedback_desc, + ) resp_dict_component: dict = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -247,6 +254,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) component = resp_dict_component.get("component", "Component not provided") + # Why we should split component selection and hypothesis generation # - after we know the selected component, we can use RAG. 
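The generation step above now distinguishes three views of the trace: `trace.hist[-1]` (the immediately previous trial and its feedback), `last_successful_exp()` (the latest accepted trial, even if components are still missing), and `sota_experiment()` (the latest accepted and fully assembled trial). A toy sketch of those lookups over a list of (experiment, feedback) pairs, with dataclasses standing in for the real experiment and feedback objects:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Exp:
    name: str
    complete: bool  # True once no further component is required


@dataclass
class Feedback:
    decision: bool  # True when the trial was accepted


# Stand-in for DSTrace.hist: (experiment, feedback) pairs in run order.
hist = [
    (Exp("exp-1", complete=True), Feedback(True)),    # accepted, all components present
    (Exp("exp-2", complete=False), Feedback(True)),   # accepted, but a component is still missing
    (Exp("exp-3", complete=False), Feedback(False)),  # rejected
]


def last_successful_exp() -> Optional[Exp]:
    """Latest accepted trial, regardless of completeness."""
    for exp, fb in reversed(hist):
        if fb.decision:
            return exp
    return None


def sota_experiment() -> Optional[Exp]:
    """Latest accepted trial whose pipeline is fully assembled."""
    for exp, fb in reversed(hist):
        if fb.decision and exp.complete:
            return exp
    return None


print(last_successful_exp().name)  # exp-2 -> base for injecting existing code
print(sota_experiment().name)      # exp-1 -> baseline that new hypotheses try to beat
```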
@@ -281,7 +289,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: else: model_infos = [] score_df = pd.read_csv( - last_successful_exp.experiment_workspace.workspace_path / "score.csv", index_col=0 + last_successful_exp.experiment_workspace.workspace_path / "scores.csv", index_col=0 ) metric_name = score_df.columns[0] for fname in last_successful_exp.experiment_workspace.file_dict: diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 61ac036aa..eb35ee5b5 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -140,21 +140,25 @@ component_gen: ``` {{scenario}} ``` - - Here is the latest version of implementation. - ``` - {{implementation}} - ``` + + # Here is the current best version of implementation. + {{sota_exp_desc}} + + # Here is the latest version of implementation + {{current_exp_desc}} You will be provided the feedback for the latest implementation. - Please select the component you are going to improve the latest implementation. + # The expected output format + {{component_output_format}} + + Please select the component you are going to improve the latest implementation or sota implementation. Please generate the output following the format below: {{ component_output_format }} user: |- - {{feedback}} + {{exp_and_feedback_desc}} exp_and_feedback: |- diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml new file mode 100644 index 000000000..c1435e5cd --- /dev/null +++ b/rdagent/scenarios/data_science/share.yaml @@ -0,0 +1,38 @@ +describe: # some template to describe some object + # exp is a template used fo + exp: |- + ## {{ heading | default('Best solution of previous exploration of the scenario') }} + {% if exp %} + ### Code + Here is the complete code of the solution. + {{ exp.experiment_workspace.all_codes }} + + {% if exp.hypothesis is not none %} + ### Hypothesis for the experiment + the experiment is designed based on hypothesis: {{exp.hypothesis}} + {% endif %} + + ### Results + {% if exp.result is none %} + There are no according evaluation results + {% else %} + Evaluated results is: + {{ exp.result }} + {% endif %} + + {% else %} + No previous complete experiment available. + {% endif %} + + feedback: |- + {% if exp_and_feedback and exp_and_feedback|length > 1 %} + ## {{heading | default('Previous trial and feedback')}} + Before current trial, previous recent trial is listed below. 
+ {% if exp_and_feedback[0].hypothesis %} + the experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }} + {% endif %} + ### Task of previous trial + {{ exp_and_feedback[0].sub_tasks[0].get_task_information() }} + feedback decision: {{ exp_and_feedback[1].decision }} + reason: {{ exp_and_feedback[1].reason }} + {% endif %} From 06515e8f9fcb6ec971031ecac7e5aa9e6bb44a89 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 8 Jan 2025 06:43:12 +0000 Subject: [PATCH 212/304] change webapp for model codes changes --- rdagent/log/ui/llm_st.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index f77b1acf4..054220278 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -155,6 +155,16 @@ def extract_evoid(tag): st.markdown(":red[**Spec in response dict:**]") st.markdown(spec) rdict.pop("spec") + else: + # show model codes + showed_keys = [] + for k,v in rdict.items(): + if k.startswith("model_") and k.endswith(".py"): + st.markdown(f":red[**{k}**]") + st.code(v, language="python", wrap_lines=True, line_numbers=True) + showed_keys.append(k) + for k in showed_keys: + rdict.pop(k) st.write(":red[**Other parts (except for the code or spec) in response dict:**]") st.json(rdict) except: From 71506af31362c4e7ff15c11eee969c41ed0ad485 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 8 Jan 2025 08:21:05 +0000 Subject: [PATCH 213/304] update proposal --- .../data_science/proposal/exp_gen.py | 21 ++++++++++--------- .../data_science/proposal/prompts.yaml | 11 ++++------ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 2f717740c..98bad4d94 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -104,7 +104,7 @@ def llm_task_gen( workspace_code: str | None = None, spec: str = None, hypothesis: Hypothesis | None = None, - hypothesis_and_feedback: str | None = None, + exp_and_feedback_desc: str | None = None, ) -> dict: system_prompt = T(".prompts:task_gen.system").r( targets=targets, @@ -117,7 +117,7 @@ def llm_task_gen( targets=targets, hypothesis=hypothesis, workspace_code=workspace_code, - hypothesis_and_feedback=hypothesis_and_feedback, + exp_and_feedback_desc=exp_and_feedback_desc, ) resp_dict = json.loads( @@ -269,7 +269,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( targets="data science project", - hypothesis_and_feedback=hypothesis_and_feedback, + exp_and_feedback_desc=exp_and_feedback_desc, ) resp_dict: dict = json.loads( @@ -294,7 +294,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: metric_name = score_df.columns[0] for fname in last_successful_exp.experiment_workspace.file_dict: if re.match(r"^model_.+\.py", fname): - model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" + model_str = f"{fname}:\n{metric_name} on valid: {score_df[metric_name].max()}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) model_num = len(model_infos) @@ -319,7 +319,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( targets="data science project", - hypothesis_and_feedback=hypothesis_and_feedback, + 
exp_and_feedback_desc=exp_and_feedback_desc, ) resp_dict: dict = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -344,7 +344,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: spec=last_successful_exp.experiment_workspace.file_dict["spec/data_loader.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.data_loader").r(), - hypothesis_and_feedback=hypothesis_and_feedback, + exp_and_feedback_desc=exp_and_feedback_desc, ) dt = DataLoaderTask( @@ -367,7 +367,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: spec=last_successful_exp.experiment_workspace.file_dict["spec/feature.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.feature").r(), - hypothesis_and_feedback=hypothesis_and_feedback, + exp_and_feedback_desc=exp_and_feedback_desc, ) ft = FeatureTask( @@ -382,12 +382,13 @@ def gen(self, trace: DSTrace) -> DSExperiment: return exp elif hypothesis.component == "Model": resp_dict = self.llm_task_gen( + targets="Models", scenario_desc=scenario_desc, spec=last_successful_exp.experiment_workspace.file_dict["spec/model.md"], hypothesis=hypothesis, workspace_code=last_successful_exp.experiment_workspace.all_codes, task_output_format=T(".prompts:output_format.model").r(), - hypothesis_and_feedback=hypothesis_and_feedback, + exp_and_feedback_desc=exp_and_feedback_desc, ) mt = ModelTask( @@ -411,7 +412,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: spec=last_successful_exp.experiment_workspace.file_dict["spec/ensemble.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.ensemble").r(), - hypothesis_and_feedback=hypothesis_and_feedback, + exp_and_feedback_desc=exp_and_feedback_desc, ) et = EnsembleTask( @@ -431,7 +432,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: spec=last_successful_exp.experiment_workspace.file_dict["spec/workflow.md"], hypothesis=hypothesis, task_output_format=T(".prompts:output_format.workflow").r(), - hypothesis_and_feedback=hypothesis_and_feedback, + exp_and_feedback_desc=exp_and_feedback_desc, ) wt = WorkflowTask( diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index eb35ee5b5..ed61c3c3e 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -14,10 +14,10 @@ hypothesis_gen: {{ hypothesis_output_format }} user: |- - {% if hypothesis_and_feedback|length == 0 %}It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet. + {% if exp_and_feedback_desc|length == 0 %}It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet. {% else %}It is not the first round, the user has made several hypothesis on this scenario and did several evaluation on them. The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree): - {{ hypothesis_and_feedback }} + {{ exp_and_feedback_desc }} {% endif %} Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. 
@@ -94,7 +94,7 @@ task_gen: The target hypothesis you are targeting to generate {{targets}} for is as follows: {{ hypothesis }} The former hypothesis and the corresponding feedbacks are as follows: - {{ hypothesis_and_feedback }} + {{ exp_and_feedback_desc }} Please generate the new {{targets}} based on the information above. {% else %} Please generate the new {{targets}} task. @@ -126,7 +126,7 @@ task_gen_model: The target hypothesis you are targeting to generate {{targets}} for is as follows: {{ hypothesis }} The former hypothesis and the corresponding feedbacks are as follows: - {{ hypothesis_and_feedback }} + {{ exp_and_feedback_desc }} Please generate the new {{targets}} based on the information above. {% else %} Please generate the new {{targets}} task. @@ -149,9 +149,6 @@ component_gen: You will be provided the feedback for the latest implementation. - # The expected output format - {{component_output_format}} - Please select the component you are going to improve the latest implementation or sota implementation. Please generate the output following the format below: From 0f4073e11980eae757ffc2ab490e7d44d0c3259b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 8 Jan 2025 09:11:36 +0000 Subject: [PATCH 214/304] add timeout message for docker run output --- rdagent/utils/env.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index bfbc0e553..bba41205b 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -12,6 +12,7 @@ import pickle import re import subprocess +import time import uuid from abc import abstractmethod from pathlib import Path @@ -389,8 +390,16 @@ def run( if entry is None: entry = self.conf.default_entry entry_add_timeout = f"timeout {self.conf.running_timeout_period} {entry}" - return self.__run(entry_add_timeout, local_path, env, running_extra_volume) + + start = time.time() + out = self.__run(entry_add_timeout, local_path, env, running_extra_volume) + end = time.time() + if end - start >= self.conf.running_timeout_period: + out += f"The running time exceeds {self.conf.running_timeout_period} seconds, so the process is killed." + + return out + def dump_python_code_run_and_get_results( self, code: str, From 797acd5c87da03bb298820f029ed5530a777c13b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 8 Jan 2025 09:12:01 +0000 Subject: [PATCH 215/304] fix --- rdagent/utils/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index bba41205b..70e3f3507 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -396,7 +396,7 @@ def run( end = time.time() if end - start >= self.conf.running_timeout_period: - out += f"The running time exceeds {self.conf.running_timeout_period} seconds, so the process is killed." + out += f"\n\nThe running time exceeds {self.conf.running_timeout_period} seconds, so the process is killed." 
return out From 56d57ace328b0046542de12c3be7f0bad44ecaaa Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 8 Jan 2025 09:12:50 +0000 Subject: [PATCH 216/304] refine the code in docker time processing --- rdagent/components/coder/data_science/feature/eval.py | 4 ++-- rdagent/components/coder/data_science/model/eval.py | 3 --- rdagent/components/coder/data_science/raw_data_loader/eval.py | 3 +-- rdagent/components/coder/data_science/workflow/eval.py | 3 --- 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index 2be1fc8eb..a98562732 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -43,6 +43,7 @@ def evaluate( ) ds_docker_conf = DSDockerConf() + # TODO: we should /= 20 for the timeout period on debug component ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} de = DockerEnv(conf=ds_docker_conf) @@ -52,8 +53,7 @@ def evaluate( implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - if stdout is None: - stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." + system_prompt = T(".prompts:feature_eval.system").r( test_code=test_code, code=implementation.file_dict["feature.py"] ) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 2440e19c7..1d14f337d 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -69,9 +69,6 @@ def evaluate( implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - if stdout is None: - stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." - # Filter out progress bars from stdout using regex filtered_stdout = filter_progress_bar(stdout) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 75223bfb5..1515b8008 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -56,8 +56,7 @@ def evaluate( test_code = (DIRNAME / "eval_tests" / "data_loader_test.py").read_text() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - if stdout is None: - stdout = "The execution exceeded the time limit, and no stdout information has been generated yet." + system_prompt = T(".prompts:data_loader_eval.system").r( test_code=test_code, code=implementation.file_dict["load_data.py"] ) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 253188fa0..2d96d5c21 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -80,9 +80,6 @@ def evaluate( if not submission_fp.exists(): stdout += "\nSubmission file (submission.csv) is not generated." - if stdout is None: - stdout = "\nThe execution exceeded the time limit." 
- system_prompt = T(".prompts:workflow_eval.system").r( scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/workflow.md"] ) From 6d8f47655940ea034878cf47462641b9111042dc Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 8 Jan 2025 10:58:23 +0000 Subject: [PATCH 217/304] use .shape instead of len() when do shape eval --- .../coder/data_science/feature/eval_tests/feature_test.py | 4 ++-- .../raw_data_loader/eval_tests/data_loader_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index d4b277c7c..efc303408 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -19,7 +19,7 @@ # Validate the conditions mentioned in the docstring -assert len(X_test) == len(test_ids), "Mismatch in length of test images and test IDs" -assert len(X) == len(y), "Mismatch in length of training images and labels" +assert X_test.shape[0] == test_ids.shape[0], "Mismatch in length of test images and test IDs" +assert X.shape[0] == y.shape[0], "Mismatch in length of training images and labels" print("Feature Engineering test passed successfully. Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py index b1330362e..31b191872 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py @@ -16,8 +16,8 @@ X, y, X_test, test_ids = load_data() # Validate the conditions mentioned in the docstring -assert len(X_test) == len(test_ids), "Mismatch in length of test images and test IDs" -assert len(X) == len(y), "Mismatch in length of training images and labels" +assert X_test.shape[0] == test_ids.shape[0], "Mismatch in length of test images and test IDs" +assert X.shape[0] == y.shape[0], "Mismatch in length of training images and labels" print("Data loader test passed successfully. Length of test images matches length of test IDs.") From 02004b9ba46aeb42d62b394f70f2f3bd5d3ec981 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 8 Jan 2025 09:35:11 +0000 Subject: [PATCH 218/304] won't change size during iteration --- rdagent/components/coder/data_science/model/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index b5de472a4..1c29ed968 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -76,10 +76,10 @@ def implement_one_task( ) # 3. 
post process to align file name to the task name - for key, value in batch_edit.items(): - if value != "__DEL__" and key != f"{target_task.name}.py": - batch_edit[f"{target_task.name}.py"] = value - del batch_edit[key] + batch_edit = { + (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" else key): value + for key, value in batch_edit.items() + } return batch_edit From ac145dc55b7d6993e5ea5a22bab39d4aa0b37281 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 8 Jan 2025 11:18:25 +0000 Subject: [PATCH 219/304] support bson sample --- rdagent/scenarios/data_science/debug/data.py | 29 ++++++++++++++------ 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 3625a57ef..4822a69ba 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -4,6 +4,10 @@ from pathlib import Path import pandas as pd +try: + import bson # pip install pymongo +except: + pass from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING @@ -39,6 +43,10 @@ def load(self, path) -> pd.DataFrame: # you might do: pd.read_hdf(path, key='df') or something similar. # Adjust as needed based on your HDF structure. return pd.read_hdf(path, key="data") + elif suffix == ".bson": + data = bson.decode_file_iter(open(path, 'rb')) + df = pd.DataFrame(data) + return df else: raise ValueError(f"Unsupported file type: {suffix}") @@ -55,6 +63,12 @@ def dump(self, df: pd.DataFrame, path): elif suffix in [".h5", ".hdf", ".hdf5"]: # Similarly, you need a key for HDF. df.to_hdf(path, key="data", mode="w") + elif suffix == ".bson": + data = df.to_dict(orient="records") + with open(path, "wb") as file: + # Write each record in the list to the BSON file + for record in data: + file.write(bson.BSON.encode(record)) else: raise ValueError(f"Unsupported file type: {suffix}") @@ -72,7 +86,7 @@ class RandDataReducer(DataReducer): or at least `min_frac` fraction of the data (whichever is larger). 
""" - def __init__(self, min_frac=0.05, min_num=100): + def __init__(self, min_frac=0.05, min_num=5): self.min_frac = min_frac self.min_num = min_num @@ -98,7 +112,7 @@ class RowReducer(DataReducer): """ def reduce(self, df: pd.DataFrame) -> pd.DataFrame: - ten_percent = int(max(len(df) * 0.1, 100)) + ten_percent = int(max(len(df) * 0.1, 5)) return df.iloc[:ten_percent] @@ -136,7 +150,7 @@ def create_debug_data( print(f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders).") # Traverse the folder and exclude specific file types - included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5"} + included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".bson"} files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] # This set will store filenames or paths that appear in the sampled data @@ -181,8 +195,7 @@ def create_debug_data( for file_path in files_to_process: if file_path.suffix.lower() in included_extensions: continue # Already handled above - - rel_dir = file_path.relative_to(data_folder).parent + rel_dir = file_path.relative_to(data_folder).parts[0] subfolder_dict.setdefault(rel_dir, []).append(file_path) # For each subfolder, decide which files to copy @@ -207,12 +220,12 @@ def create_debug_data( # If no files are used, randomly sample files to keep the folder from being empty if len(used_files) == 0: - if len(file_list) <= 100: + if len(file_list) <= 5: num_to_keep = len(file_list) else: num_to_keep = int(len(file_list) * 0.05) - if num_to_keep <= 100: - num_to_keep = 100 # Keep at least one file if fraction is too small + if num_to_keep <= 5: + num_to_keep = 5 # Keep at least one file if fraction is too small sampled_not_used = pd.Series(not_used_files).sample(n=num_to_keep, random_state=1) for nf in sampled_not_used: From c88a23d87386a778214c20de186d37ae10099d4c Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 9 Jan 2025 04:00:18 +0000 Subject: [PATCH 220/304] sample support jsonl and bson --- rdagent/scenarios/data_science/debug/data.py | 44 +++++++------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 4822a69ba..001195d11 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -43,6 +43,9 @@ def load(self, path) -> pd.DataFrame: # you might do: pd.read_hdf(path, key='df') or something similar. # Adjust as needed based on your HDF structure. return pd.read_hdf(path, key="data") + elif suffix == ".jsonl": + # Read JSON Lines file + return pd.read_json(path, lines=True) elif suffix == ".bson": data = bson.decode_file_iter(open(path, 'rb')) df = pd.DataFrame(data) @@ -63,6 +66,9 @@ def dump(self, df: pd.DataFrame, path): elif suffix in [".h5", ".hdf", ".hdf5"]: # Similarly, you need a key for HDF. 
df.to_hdf(path, key="data", mode="w") + elif suffix == ".jsonl": + # Save DataFrame to JSON Lines file + df.to_json(path, orient="records", lines=True) elif suffix == ".bson": data = df.to_dict(orient="records") with open(path, "wb") as file: @@ -92,30 +98,12 @@ def __init__(self, min_frac=0.05, min_num=5): def reduce(self, df: pd.DataFrame) -> pd.DataFrame: frac = max(self.min_frac, self.min_num / len(df)) + print(f"Sampling {frac * 100:.2f}% of the data ({len(df)} rows)") if frac >= 1: return df return df.sample(frac=frac, random_state=1) -class ColumnReducer(DataReducer): - """ - Example column reducer: keep only the first 5 columns. - """ - - def reduce(self, df: pd.DataFrame) -> pd.DataFrame: - return df.iloc[:, :5] - - -class RowReducer(DataReducer): - """ - Example row reducer: keep only the first 10% rows. - """ - - def reduce(self, df: pd.DataFrame) -> pd.DataFrame: - ten_percent = int(max(len(df) * 0.1, 5)) - return df.iloc[:ten_percent] - - def count_files_in_folder(folder: Path) -> int: """ Count the total number of files in a folder, including files in subfolders. @@ -126,7 +114,8 @@ def count_files_in_folder(folder: Path) -> int: def create_debug_data( competition: str, dr_cls: type[DataReducer] = RandDataReducer, - dr_cls_kwargs=None, + min_frac=0.002, + min_num=5, dataset_path=None, sample_path=None, ): @@ -135,9 +124,6 @@ def create_debug_data( and renames/moves files for easier debugging. Automatically detects file type (csv, pkl, parquet, hdf, etc.). """ - if dr_cls_kwargs is None: - dr_cls_kwargs = {} - if dataset_path is None: dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path # FIXME: don't hardcode this KAGGLE_IMPLEMENT_SETTING @@ -150,7 +136,7 @@ def create_debug_data( print(f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders).") # Traverse the folder and exclude specific file types - included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".bson"} + included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".jsonl", ".bson"} files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] # This set will store filenames or paths that appear in the sampled data @@ -158,7 +144,7 @@ def create_debug_data( # Prepare data handler and reducer data_handler = GenericDataHandler() - data_reducer = dr_cls(**dr_cls_kwargs) + data_reducer = dr_cls(min_frac=min_frac, min_num=min_num) for file_path in files_to_process: sampled_file_path = sample_folder / file_path.relative_to(data_folder) @@ -220,13 +206,11 @@ def create_debug_data( # If no files are used, randomly sample files to keep the folder from being empty if len(used_files) == 0: - if len(file_list) <= 5: + if len(file_list) <= min_num: num_to_keep = len(file_list) else: - num_to_keep = int(len(file_list) * 0.05) - if num_to_keep <= 5: - num_to_keep = 5 # Keep at least one file if fraction is too small - + num_to_keep = max(int(len(file_list) * min_frac), min_num) + print(F"Sampling {num_to_keep} files without label from {len(file_list)} files in {rel_dir}") sampled_not_used = pd.Series(not_used_files).sample(n=num_to_keep, random_state=1) for nf in sampled_not_used: sampled_file_path = sample_folder / nf.relative_to(data_folder) From 189584633ed9eacbe2dc8c12c5cab5a856ea33eb Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 9 Jan 2025 05:02:04 +0000 Subject: [PATCH 221/304] add former_code to coder prompts --- .../components/coder/CoSTEER/evaluators.py | 2 +- .../coder/data_science/ensemble/__init__.py | 5 +- 
.../coder/data_science/ensemble/prompts.yaml | 5 ++ .../coder/data_science/feature/prompts.yaml | 5 +- .../coder/data_science/model/__init__.py | 20 +++++++ .../data_science/raw_data_loader/prompts.yaml | 6 +++ .../coder/data_science/workflow/prompts.yaml | 11 +++- rdagent/scenarios/data_science/debug/data.py | 15 +++--- rdagent/utils/__init__.py | 54 +++++++------------ rdagent/utils/prompts.yaml | 25 ++------- 10 files changed, 79 insertions(+), 69 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index 85b6764ae..6c28169c7 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -36,7 +36,7 @@ class CoSTEERSingleFeedback(Feedback): """ execution: str # execution_feedback - return_checking: str | None # inlucding every check in the testing (constraints about the generated value) + return_checking: str | None # including every check in the testing (constraints about the generated value) # value_feedback, shape_feedback, value_generated_flag code: str final_decision: bool diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index b6ee34d4c..3144913ee 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -62,7 +62,10 @@ def implement_one_task( queried_former_failed_knowledge[0] if queried_former_failed_knowledge else None ), ) - user_prompt = T(".prompts:ensemble_coder.user").r(ensemble_spec=workspace.file_dict["spec/ensemble.md"]) + user_prompt = T(".prompts:ensemble_coder.user").r( + ensemble_spec=workspace.file_dict["spec/ensemble.md"], + latest_code=workspace.file_dict.get("ensemble.py"), + ) ensemble_code = json.loads( APIBackend().build_messages_and_create_chat_completion( diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index 99047ab0d..62bf57fa8 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -42,6 +42,11 @@ ensemble_coder: -----------Ensemble Specification----------- {{ ensemble_spec }} + {% if latest_code %} + ---------Former code--------- + {{ latest_code }} + You should follow the former code to improve it. + {% endif %} ensemble_eval: system: |- You are a data scientist evaluating an ensemble implementation. diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 0e6e41ebc..36b95c7a3 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -48,10 +48,9 @@ feature: ---------Feature Processing Specification--------- {{ feature_spec }} - {% if latest_code %} - ---------Former Specification--------- - Former Code: {{ latest_code }} + ---------Former code--------- + {{ latest_code }} You should follow the former code to improve it. 
{% endif %} diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 1c29ed968..cf7475e9c 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -16,6 +16,7 @@ ModelGeneralCaseSpecEvaluator, ) from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend @@ -75,6 +76,25 @@ def implement_one_task( ) ) + # TODO this is a temporary fix for the issue that CoSTEER always generates the same code or delete code which will cause cache fail + try_count = 0 + while f"{target_task.name}.py" in batch_edit and ( + batch_edit[f"{target_task.name}.py"] == "__DEL__" + or batch_edit[f"{target_task.name}.py"] == workspace.file_dict.get(f"{target_task.name}.py") + ): + batch_edit = BatchEditOut.extract_output( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt + "\n Please generate different code from the current workspace", + system_prompt=system_prompt, + json_mode=BatchEditOut.json_mode, + ) + ) + try_count += 1 + if try_count > 10: + raise CoderError( + "Failed to generate code after 10 tries. Costeer always generates same code or delete code." + ) + # 3. post process to align file name to the task name batch_edit = { (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" else key): value diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 48b249eee..081b506b2 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -310,6 +310,12 @@ data_loader_coder: ---------Data Folder Description---------(All path are relative to the data folder) {{ folder_spec }} + + {% if latest_code %} + ---------Former code--------- + {{ latest_code }} + You should follow the former code to improve it. + {% endif %} You should strictly follow the function interface specifications provided by the specification to implement the function. diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index f31a9b4e4..8af207df9 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -68,9 +68,16 @@ workflow_coder: {{ ensemble_code }} {% if latest_code %} - ---------Former Specification--------- - Former Code: {{ latest_code }} + ---------Former code--------- + {{ latest_code }} You should follow the former code to improve it. + {% endif %} + + {% if former_failed_knowledge %} + You have tried to correct the former code but failed. Please avoid making the same mistakes. 
+ Here's the failed code you have written: + ---------Former failed attempt--------- + {{ former_failed_knowledge.implementation.file_dict["main.py"] }} {% endif %} workflow_eval: diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 001195d11..bf02b98ac 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -4,6 +4,7 @@ from pathlib import Path import pandas as pd + try: import bson # pip install pymongo except: @@ -47,7 +48,7 @@ def load(self, path) -> pd.DataFrame: # Read JSON Lines file return pd.read_json(path, lines=True) elif suffix == ".bson": - data = bson.decode_file_iter(open(path, 'rb')) + data = bson.decode_file_iter(open(path, "rb")) df = pd.DataFrame(data) return df else: @@ -114,7 +115,7 @@ def count_files_in_folder(folder: Path) -> int: def create_debug_data( competition: str, dr_cls: type[DataReducer] = RandDataReducer, - min_frac=0.002, + min_frac=0.002, min_num=5, dataset_path=None, sample_path=None, @@ -133,7 +134,9 @@ def create_debug_data( data_folder = Path(dataset_path) / competition sample_folder = Path(sample_path) / competition total_files_count = count_files_in_folder(data_folder) - print(f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders).") + print( + f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders)." + ) # Traverse the folder and exclude specific file types included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".jsonl", ".bson"} @@ -155,7 +158,7 @@ def create_debug_data( continue sampled_file_path.parent.mkdir(parents=True, exist_ok=True) - + # Load the original data df = data_handler.load(file_path) @@ -210,7 +213,7 @@ def create_debug_data( num_to_keep = len(file_list) else: num_to_keep = max(int(len(file_list) * min_frac), min_num) - print(F"Sampling {num_to_keep} files without label from {len(file_list)} files in {rel_dir}") + print(f"Sampling {num_to_keep} files without label from {len(file_list)} files in {rel_dir}") sampled_not_used = pd.Series(not_used_files).sample(n=num_to_keep, random_state=1) for nf in sampled_not_used: sampled_file_path = sample_folder / nf.relative_to(data_folder) @@ -218,6 +221,6 @@ def create_debug_data( continue sampled_file_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy(nf, sampled_file_path) - + final_files_count = count_files_in_folder(sample_folder) print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.") diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py index 408b4c4bf..37b59dae8 100644 --- a/rdagent/utils/__init__.py +++ b/rdagent/utils/__init__.py @@ -59,6 +59,7 @@ def convert2bool(value: Union[str, bool]) -> bool: else: raise ValueError(f"Unknown value type {value} to bool") + def remove_ansi_codes(s: str) -> str: """ It is for removing ansi ctrl characters in the string(e.g. 
colored text) @@ -86,42 +87,28 @@ def filter_progress_bar(stdout: str) -> str: filtered_stdout = re.sub(progress_bar_re, "", filtered_stdout) filtered_stdout = re.sub(r"\s*\n\s*", "\n", filtered_stdout) - # Check if progress bars are already filtered - system_prompt = T(".prompts:if_filtered.system").r() - user_prompt = T(".prompts:if_filtered.user").r( - filtered_stdout=filtered_stdout, - ) - stdout_token_size = APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - ) - if stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.1: - return filtered_stdout - elif stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.8: - if_filtered_stdout = json.loads( - APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) - ).get("progress bar filtered", False) - - if convert2bool(if_filtered_stdout): - return filtered_stdout - - filtered_stdout_shortened = filtered_stdout needs_sub = True # Attempt further filtering up to 5 times for _ in range(5): + filtered_stdout_shortened = filtered_stdout system_prompt = T(".prompts:filter_progress_bar.system").r() - user_prompt = T(".prompts:filter_progress_bar.user").r( - stdout=filtered_stdout_shortened, - ) - stdout_token_size = APIBackend().build_messages_and_calculate_token( - user_prompt=user_prompt, - system_prompt=system_prompt, - ) - if stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.1: - return filtered_stdout_shortened - elif stdout_token_size > LLM_SETTINGS.chat_token_limit * 0.8: - filtered_stdout_shortened = filtered_stdout[len(filtered_stdout) // 4 : len(filtered_stdout) * 3 // 4] + for __ in range(10): + user_prompt = T(".prompts:filter_progress_bar.user").r( + stdout=filtered_stdout_shortened, + ) + stdout_token_size = APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + if stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.1: + return filtered_stdout_shortened + elif stdout_token_size > LLM_SETTINGS.chat_token_limit * 0.6: + filtered_stdout_shortened = filtered_stdout_shortened[ + len(filtered_stdout_shortened) // 4 : len(filtered_stdout_shortened) * 3 // 4 + ] + else: + break response = json.loads( APIBackend().build_messages_and_create_chat_completion( @@ -129,7 +116,7 @@ def filter_progress_bar(stdout: str) -> str: ) ) needs_sub = response.get("needs_sub", True) - regex_patterns = response.get("regex patterns", []) + regex_patterns = response.get("regex_patterns", []) if isinstance(regex_patterns, list): for pattern in regex_patterns: filtered_stdout = re.sub(pattern, "", filtered_stdout) @@ -138,11 +125,8 @@ def filter_progress_bar(stdout: str) -> str: if not needs_sub: break - filtered_stdout = re.sub(regex_patterns, "", filtered_stdout) filtered_stdout = re.sub(r"\s*\n\s*", "\n", filtered_stdout) - filtered_stdout_shortened = filtered_stdout - if needs_sub: return None return filtered_stdout diff --git a/rdagent/utils/prompts.yaml b/rdagent/utils/prompts.yaml index 77341ffaa..db31a454c 100644 --- a/rdagent/utils/prompts.yaml +++ b/rdagent/utils/prompts.yaml @@ -1,35 +1,18 @@ filter_progress_bar: system: | - You are an assistant helping to analyze and filter progress bars from a given text. Evaluate the text to determine if progress bar patterns are present and, if so, generate a list of regex patterns to remove them. + You are an assistant helping to analyze and filter training log messages and a progress bar output from a given text. 
Evaluate the text to determine if training log messages and a progress bar output patterns are present and, if so, generate a list of regex patterns to remove them. Additionally, indicate whether substitution is needed. If the input exceeds a token limit, the system will provide only a shortened portion of the text. - Note: You can keep metrics or logs, such as `val_accuracy`, `val_loss`, and similar entries, are retained and not filtered. + Note: About the training log message, if the log message contains useful information like loss or accuracy and it is reported in each epoch, it should not be removed. If the log message is not useful, for example, reporting nan in each iteration or just reporting the iteration number, please remove them. Respond in the following JSON format and order: ```json { "needs_sub": , - "regex patterns": ["regex pattern 1", "regex pattern 2", ...] + "regex_patterns": ["regex pattern 1", "regex pattern 2", ...] } user: | The following text contains stdout: {{ stdout }} - Check if the text contains progress bar patterns. If patterns are found, provide a list of regex patterns to filter them. Otherwise, indicate that substitution is not needed. - -if_filtered: - system: | - You are an assistant helping to verify if progress bars have been successfully filtered from a given text. Analyze the filtered text to determine if any progress bar-like patterns remain. - user: | - The following is the filtered stdout text: - - {{ filtered_stdout }} - - Check if the text still contains any progress bar patterns such as percentages, loading bars, or similar elements. Return true if no progress bar remains; otherwise, return false. - - Please respond with your answer in the following JSON format and order: - ```json - { - "progress bar filtered": - } - ``` \ No newline at end of file + Check if the text contains training log messages and progress bar patterns. If patterns are found, provide a list of regex patterns to filter them. Otherwise, indicate that substitution is not needed. From 0d17ad9a8f5a767144ac5c4c6a9bcb74585a2633 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 9 Jan 2025 05:07:48 +0000 Subject: [PATCH 222/304] a little speed us in debug data creating --- rdagent/scenarios/data_science/debug/data.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index bf02b98ac..36eadf4ad 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -93,7 +93,7 @@ class RandDataReducer(DataReducer): or at least `min_frac` fraction of the data (whichever is larger). """ - def __init__(self, min_frac=0.05, min_num=5): + def __init__(self, min_frac=0.02, min_num=5): self.min_frac = min_frac self.min_num = min_num @@ -133,15 +133,16 @@ def create_debug_data( data_folder = Path(dataset_path) / competition sample_folder = Path(sample_path) / competition - total_files_count = count_files_in_folder(data_folder) - print( - f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders)." 
- ) # Traverse the folder and exclude specific file types included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".jsonl", ".bson"} files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] + total_files_count = len(files_to_process) + print( + f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders)." + ) + # This set will store filenames or paths that appear in the sampled data sample_used_file_names = set() From 7adb539e010148962dfa361f89df7e31534c04ba Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 9 Jan 2025 06:43:03 +0000 Subject: [PATCH 223/304] filter progress bar when eval ens and main --- rdagent/components/coder/data_science/ensemble/eval.py | 3 ++- rdagent/components/coder/data_science/workflow/eval.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index de5b2bfc3..8a961c8f8 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -14,6 +14,7 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.utils import filter_progress_bar DIRNAME = Path(__file__).absolute().resolve().parent @@ -63,7 +64,7 @@ def evaluate( ) implementation.inject_files(**{fname: test_code}) - stdout = implementation.execute(env=de, entry=f"python {fname}") + stdout = filter_progress_bar(implementation.execute(env=de, entry=f"python {fname}")) system_prompt = T(".prompts:ensemble_eval.system").r( test_code=test_code, code=implementation.file_dict["ensemble.py"] diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 2d96d5c21..3d9c702c1 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -17,6 +17,7 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.utils import filter_progress_bar DIRNAME = Path(__file__).absolute().resolve().parent @@ -60,7 +61,7 @@ def evaluate( } de = DockerEnv(conf=ds_docker_conf) fname = "main.py" - stdout = implementation.execute(env=de, entry=f"python {fname}") + stdout = filter_progress_bar(implementation.execute(env=de, entry=f"python {fname}")) # Check score file score_fp = implementation.workspace_path / "scores.csv" From 262e24226df07510acfbbd1f6b8eab1290c662ba Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 9 Jan 2025 07:35:46 +0000 Subject: [PATCH 224/304] avoid costeer makes no change to former code --- .../coder/data_science/ensemble/__init__.py | 18 +++++++--- .../coder/data_science/ensemble/prompts.yaml | 2 +- .../coder/data_science/feature/__init__.py | 18 +++++++--- .../coder/data_science/feature/prompts.yaml | 2 +- .../coder/data_science/model/__init__.py | 33 +++++++++---------- .../coder/data_science/model/prompts.yaml | 4 +-- .../data_science/raw_data_loader/__init__.py | 18 +++++++--- .../data_science/raw_data_loader/prompts.yaml | 23 +++++++++++-- .../coder/data_science/workflow/__init__.py | 19 ++++++++--- .../coder/data_science/workflow/eval.py | 7 ++-- .../coder/data_science/workflow/prompts.yaml | 11 ++----- 11 files changed, 101 insertions(+), 54 deletions(-) diff --git 
a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index 3144913ee..8bdc85630 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -24,6 +24,7 @@ ) from rdagent.components.coder.data_science.ensemble.eval import EnsembleCoSTEEREvaluator from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend @@ -67,11 +68,18 @@ def implement_one_task( latest_code=workspace.file_dict.get("ensemble.py"), ) - ensemble_code = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - )["code"] + for _ in range(5): + ensemble_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if ensemble_code != workspace.file_dict.get("ensemble.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new data loader code.") return { "ensemble.py": ensemble_code, diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index 62bf57fa8..7bc3260ae 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -45,7 +45,7 @@ ensemble_coder: {% if latest_code %} ---------Former code--------- {{ latest_code }} - You should follow the former code to improve it. + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. {% endif %} ensemble_eval: system: |- diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index a01135b5c..0c32ed9cc 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -11,6 +11,7 @@ ) from rdagent.components.coder.data_science.feature.eval import FeatureCoSTEEREvaluator from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend @@ -52,11 +53,18 @@ def implement_one_task( latest_code=workspace.file_dict.get("feature.py"), ) - feature_code = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - )["code"] + for _ in range(5): + feature_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if feature_code != workspace.file_dict.get("feature.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" 
+ else: + raise CoderError("Failed to generate a new data loader code.") return { "feature.py": feature_code, diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 36b95c7a3..61858c82f 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -51,7 +51,7 @@ feature: {% if latest_code %} ---------Former code--------- {{ latest_code }} - You should follow the former code to improve it. + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. {% endif %} diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index cf7475e9c..d3e714777 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -76,30 +76,29 @@ def implement_one_task( ) ) - # TODO this is a temporary fix for the issue that CoSTEER always generates the same code or delete code which will cause cache fail - try_count = 0 - while f"{target_task.name}.py" in batch_edit and ( - batch_edit[f"{target_task.name}.py"] == "__DEL__" - or batch_edit[f"{target_task.name}.py"] == workspace.file_dict.get(f"{target_task.name}.py") - ): + for _ in range(5): batch_edit = BatchEditOut.extract_output( APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt + "\n Please generate different code from the current workspace", + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=BatchEditOut.json_mode, ) ) - try_count += 1 - if try_count > 10: - raise CoderError( - "Failed to generate code after 10 tries. Costeer always generates same code or delete code." - ) - # 3. post process to align file name to the task name - batch_edit = { - (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" else key): value - for key, value in batch_edit.items() - } + # 3. post process to align file name to the task name + batch_edit = { + (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" else key): value + for key, value in batch_edit.items() + } + + if batch_edit[f"{target_task.name}.py"] != "__DEL__" and batch_edit[ + f"{target_task.name}.py" + ] != workspace.file_dict.get(f"{target_task.name}.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new data loader code.") return batch_edit diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 382cbb580..1203fa469 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -72,8 +72,8 @@ model_coder: {% if latest_code %} ---------Former Code--------- - Former Code: {{ latest_code }} - You should follow the former code to improve it. + Former Code: {{ latest_code }} + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. 
{% endif %} user_general: |- diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index e67799a29..51d6e2868 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -38,6 +38,7 @@ DataLoaderCoSTEEREvaluator, ) from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend @@ -124,11 +125,18 @@ def implement_one_task( latest_code=workspace.file_dict.get("load_data.py"), ) - data_loader_code = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - )["code"] + for _ in range(5): + data_loader_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if data_loader_code != workspace.file_dict.get("load_data.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new data loader code.") return { "spec/data_loader.md": data_loader_spec, diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 081b506b2..dcd2236b6 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -180,8 +180,8 @@ spec: 1. Function Interface: - Function Name: `ens_and_decision` - Parameters: - - `test_preds_dict` (Dict[str, DT]): A dictionary of test predictions from different models. - - `val_preds_dict` (Dict[str, DT]): A dictionary of validation predictions from different models. + - `test_preds_dict` (Dict[str, DT]): A dictionary of test predictions from different models. The key is the model file name. + - `val_preds_dict` (Dict[str, DT]): A dictionary of validation predictions from different models. The key is the model file name. - `val_label` (DT): Validation label. - Output: - `final_pred` (DT): Ensemble prediction for the test data. @@ -256,6 +256,23 @@ spec: - Document the workflow with clear comments and docstring. - Do not use progress bars (e.g., tqdm) in the code. + 5. Ensemble Strategy: + Put all the model's return into a dict, using the model file name as key, and the return as value. + Sample code: + {% raw %} + {% for mn in model_names %} + from {{mn}} import model_workflow as {{mn}}_workflow + val_preds_dict["{{mn}}"], test_preds_dict["{{mn}}"], _ = {{mn}}_workflow( + X=train_X, + y=train_y, + val_X=val_X, + val_y=val_y, + test_X=test_X + ) + {% endfor %} + final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) + {% endraw %} + {% if latest_spec %} 5. Former Specification: {{ latest_spec }} @@ -314,7 +331,7 @@ data_loader_coder: {% if latest_code %} ---------Former code--------- {{ latest_code }} - You should follow the former code to improve it. + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. {% endif %} You should strictly follow the function interface specifications provided by the specification to implement the function. 
diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 855c545dc..bb986f11b 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -13,6 +13,7 @@ WorkflowGeneralCaseSpecEvaluator, ) from rdagent.components.coder.data_science.workflow.exp import WorkflowTask +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace from rdagent.core.scenario import Scenario from rdagent.oai.llm_utils import APIBackend @@ -56,11 +57,19 @@ def implement_one_task( latest_code=workspace.file_dict.get("main.py"), workflow_spec=workspace.file_dict["spec/workflow.md"], ) - workflow_code = json.loads( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True - ) - )["code"] + + for _ in range(5): + workflow_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if workflow_code != workspace.file_dict.get("main.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new data loader code.") return {"main.py": workflow_code} diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 3d9c702c1..cc2a46db6 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -73,8 +73,11 @@ def evaluate( model_set_in_folder = set( f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f) and "test" not in f ) - if model_set_in_scores != model_set_in_folder: - stdout += f"\nThe models used by ensemble are not consistent with the models in the workspace.\nThe model names in the score.csv are {model_set_in_scores}, while the model names in the workspace are {model_set_in_folder}." + for model in model_set_in_folder: + if model not in model_set_in_scores: + stdout += ( + f"\nModel {model} is not evaluated in the scores.csv. The score.csv has {model_set_in_scores}." + ) # Check submission file submission_fp = implementation.workspace_path / "submission.csv" diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index 8af207df9..aa53b0786 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -61,24 +61,19 @@ workflow_coder: ---------model training code--------- Attention: The input and output of the model function is flexible. Training dataset is necessary, but validation and test dateset might be optional. The hyperparameters can either be passed as arguments or be set as default values in the function. You need to use the function correctly. + All model files share the same function name. Please import the model files with their name like: from {file_name} import {function_name} {{ model_codes }} ---------ensemble code--------- + Note, we will check the index of the score.csv, so please use the model name as the index to feed into ensemble function. file: ensemble.py {{ ensemble_code }} {% if latest_code %} ---------Former code--------- {{ latest_code }} - You should follow the former code to improve it. 
+ The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. {% endif %} - - {% if former_failed_knowledge %} - You have tried to correct the former code but failed. Please avoid making the same mistakes. - Here's the failed code you have written: - ---------Former failed attempt--------- - {{ former_failed_knowledge.implementation.file_dict["main.py"] }} - {% endif %} workflow_eval: system: |- From 462982afd57b045b30ee51558906e5bf6ba1bb7c Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 9 Jan 2025 07:55:17 +0000 Subject: [PATCH 225/304] fix several log error --- rdagent/components/coder/data_science/ensemble/__init__.py | 2 +- rdagent/components/coder/data_science/feature/__init__.py | 2 +- rdagent/components/coder/data_science/model/__init__.py | 2 +- rdagent/components/coder/data_science/workflow/__init__.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index 8bdc85630..9093f8153 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -79,7 +79,7 @@ def implement_one_task( else: user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" else: - raise CoderError("Failed to generate a new data loader code.") + raise CoderError("Failed to generate a new ensemble code.") return { "ensemble.py": ensemble_code, diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 0c32ed9cc..56dfe03b3 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -64,7 +64,7 @@ def implement_one_task( else: user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" else: - raise CoderError("Failed to generate a new data loader code.") + raise CoderError("Failed to generate a new feature code.") return { "feature.py": feature_code, diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index d3e714777..2bb235ad6 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -98,7 +98,7 @@ def implement_one_task( else: user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" else: - raise CoderError("Failed to generate a new data loader code.") + raise CoderError("Failed to generate a new model code.") return batch_edit diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index bb986f11b..3eb7f15a6 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -69,7 +69,7 @@ def implement_one_task( else: user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" 
else: - raise CoderError("Failed to generate a new data loader code.") + raise CoderError("Failed to generate a new workflow code.") return {"main.py": workflow_code} From 10de120a3cd716d44f9edd2a1cb39a086d51a636 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 9 Jan 2025 08:00:48 +0000 Subject: [PATCH 226/304] add timeout judge threshold --- rdagent/utils/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index 70e3f3507..e403fa11a 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -395,7 +395,7 @@ def run( out = self.__run(entry_add_timeout, local_path, env, running_extra_volume) end = time.time() - if end - start >= self.conf.running_timeout_period: + if end - start + 1 >= self.conf.running_timeout_period: out += f"\n\nThe running time exceeds {self.conf.running_timeout_period} seconds, so the process is killed." return out From c1c9f93a4ea38412694f49dc0e2853475852fc11 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 9 Jan 2025 08:33:05 +0000 Subject: [PATCH 227/304] fix some bugs in the evaluation of component output shapes --- .../data_science/ensemble/eval_tests/ensemble_test.py | 5 ++++- .../coder/data_science/feature/eval_tests/feature_test.py | 7 +++++-- .../coder/data_science/model/eval_tests/model_test.py | 2 +- .../raw_data_loader/eval_tests/data_loader_test.py | 7 +++++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py index 6a40b2d8e..935f913ad 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py @@ -34,7 +34,10 @@ final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) # Check shape -assert final_pred.shape[0] == test_X.shape[0], "Wrong output sample size" +if isinstance(final_pred, list): + assert len(final_pred) == len(test_X), "Wrong output sample size" +else: + assert final_pred.shape[0] == test_X.shape[0], "Wrong output sample size" # check if scores.csv is generated assert Path("scores.csv").exists(), "scores.csv is not generated" diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index efc303408..010028019 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -19,7 +19,10 @@ # Validate the conditions mentioned in the docstring -assert X_test.shape[0] == test_ids.shape[0], "Mismatch in length of test images and test IDs" -assert X.shape[0] == y.shape[0], "Mismatch in length of training images and labels" +def get_length(data): + return len(data) if isinstance(data, list) else data.shape[0] + +assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" +assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" print("Feature Engineering test passed successfully. 
Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.py b/rdagent/components/coder/data_science/model/eval_tests/model_test.py index cf0f97935..092eecef6 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_test.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_test.py @@ -18,7 +18,7 @@ def log_execution_results(start_time, val_pred, test_pred, hypers, execution_lab # Load and preprocess data X, y, test_X, test_ids = load_data() X, y, test_X = feat_eng(X, y, test_X) -train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) +train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.8, random_state=42) # First execution print("The first execution begins.\n") diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py index 31b191872..5b5b9e0d6 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py @@ -16,8 +16,11 @@ X, y, X_test, test_ids = load_data() # Validate the conditions mentioned in the docstring -assert X_test.shape[0] == test_ids.shape[0], "Mismatch in length of test images and test IDs" -assert X.shape[0] == y.shape[0], "Mismatch in length of training images and labels" +def get_length(data): + return len(data) if isinstance(data, list) else data.shape[0] + +assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" +assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" print("Data loader test passed successfully. 
Length of test images matches length of test IDs.") From fdbb4b84e8364137afd57f9f07bf0d34991c3d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=82=BC=E9=87=91=E6=9C=AF=E5=B8=88=E5=8D=8E=E5=8D=8E?= <37462254+YeewahChan@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:05:48 +0800 Subject: [PATCH 228/304] File structure for supporting litellm (#517) Co-authored-by: Young --- rdagent/oai/backend/__init__.py | 0 rdagent/oai/backend/base.py | 3 +++ rdagent/oai/backend/deprec/__init__.py | 0 rdagent/oai/backend/deprec/conf.py | 0 rdagent/oai/backend/deprec/deprecated.py | 0 rdagent/oai/backend/litellm.py | 0 rdagent/oai/llm_conf.py | 3 +++ 7 files changed, 6 insertions(+) create mode 100644 rdagent/oai/backend/__init__.py create mode 100644 rdagent/oai/backend/base.py create mode 100644 rdagent/oai/backend/deprec/__init__.py create mode 100644 rdagent/oai/backend/deprec/conf.py create mode 100644 rdagent/oai/backend/deprec/deprecated.py create mode 100644 rdagent/oai/backend/litellm.py diff --git a/rdagent/oai/backend/__init__.py b/rdagent/oai/backend/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/backend/base.py b/rdagent/oai/backend/base.py new file mode 100644 index 000000000..21ed3695e --- /dev/null +++ b/rdagent/oai/backend/base.py @@ -0,0 +1,3 @@ + +class APIBackend: + """abstract""" diff --git a/rdagent/oai/backend/deprec/__init__.py b/rdagent/oai/backend/deprec/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/backend/deprec/conf.py b/rdagent/oai/backend/deprec/conf.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/backend/deprec/deprecated.py b/rdagent/oai/backend/deprec/deprecated.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/backend/litellm.py b/rdagent/oai/backend/litellm.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/llm_conf.py b/rdagent/oai/llm_conf.py index 5f7777ade..f61d0ca8f 100644 --- a/rdagent/oai/llm_conf.py +++ b/rdagent/oai/llm_conf.py @@ -8,6 +8,9 @@ class LLMSettings(ExtendedBaseSettings): + # backend + backend: str = "rdagent.oai.backend.DeprecBackend" + log_llm_chat_content: bool = True use_azure: bool = Field(default=False, deprecated=True) From 0c919edc91c3d6a9b0bc5e2092d1992f48f81b25 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 9 Jan 2025 09:19:50 +0000 Subject: [PATCH 229/304] ignore submission and show processing --- rdagent/scenarios/data_science/debug/data.py | 27 +++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 36eadf4ad..2fb09891d 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -1,3 +1,4 @@ +from collections import Counter import os import platform import shutil @@ -5,6 +6,8 @@ import pandas as pd +from tqdm import tqdm + try: import bson # pip install pymongo except: @@ -99,7 +102,7 @@ def __init__(self, min_frac=0.02, min_num=5): def reduce(self, df: pd.DataFrame) -> pd.DataFrame: frac = max(self.min_frac, self.min_num / len(df)) - print(f"Sampling {frac * 100:.2f}% of the data ({len(df)} rows)") + # print(f"Sampling {frac * 100:.2f}% of the data ({len(df)} rows)") if frac >= 1: return df return df.sample(frac=frac, random_state=1) @@ -137,11 +140,14 @@ def create_debug_data( # Traverse the folder and exclude specific file types included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", 
".jsonl", ".bson"} files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] - total_files_count = len(files_to_process) print( f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders)." ) + file_types_count = Counter(file.suffix.lower() for file in files_to_process) + print("File type counts:") + for file_type, count in file_types_count.items(): + print(f"{file_type}: {count}") # This set will store filenames or paths that appear in the sampled data sample_used_file_names = set() @@ -150,7 +156,10 @@ def create_debug_data( data_handler = GenericDataHandler() data_reducer = dr_cls(min_frac=min_frac, min_num=min_num) - for file_path in files_to_process: + skip_subfolder_data = any(f.is_file() and f.suffix in included_extensions for f in data_folder.iterdir() if f.name.startswith(("train", "test"))) + processed_files = [] + + for file_path in tqdm(files_to_process, desc="Processing data", unit="file"): sampled_file_path = sample_folder / file_path.relative_to(data_folder) if sampled_file_path.exists(): continue @@ -158,6 +167,9 @@ def create_debug_data( if file_path.suffix.lower() not in included_extensions: continue + if skip_subfolder_data and file_path.parent != data_folder: + continue # bypass files in subfolders + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) # Load the original data @@ -165,11 +177,13 @@ def create_debug_data( # Create a sampled subset df_sampled = data_reducer.reduce(df) - + processed_files.append(file_path) # Dump the sampled data try: data_handler.dump(df_sampled, sampled_file_path) # Extract possible file references from the sampled data + if "submission" in file_path.stem: + continue # Skip submission files for col in df_sampled.columns: unique_vals = df_sampled[col].astype(str).unique() for val in unique_vals: @@ -183,13 +197,13 @@ def create_debug_data( # Process non-data files subfolder_dict = {} for file_path in files_to_process: - if file_path.suffix.lower() in included_extensions: + if file_path in processed_files: continue # Already handled above rel_dir = file_path.relative_to(data_folder).parts[0] subfolder_dict.setdefault(rel_dir, []).append(file_path) # For each subfolder, decide which files to copy - for rel_dir, file_list in subfolder_dict.items(): + for rel_dir, file_list in tqdm(subfolder_dict.items(), desc="Processing files", unit="file"): used_files = [] not_used_files = [] @@ -223,5 +237,6 @@ def create_debug_data( sampled_file_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy(nf, sampled_file_path) + final_files_count = count_files_in_folder(sample_folder) print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.") From 3096cf53cab93d639e37ffa1c1f55f495a447848 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 9 Jan 2025 09:19:50 +0000 Subject: [PATCH 230/304] ignore submission and show processing --- rdagent/scenarios/data_science/debug/data.py | 27 +++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 36eadf4ad..2fb09891d 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -1,3 +1,4 @@ +from collections import Counter import os import platform import shutil @@ -5,6 +6,8 @@ import pandas as pd +from tqdm import tqdm + try: import bson # pip install pymongo except: @@ -99,7 +102,7 @@ def __init__(self, min_frac=0.02, 
min_num=5): def reduce(self, df: pd.DataFrame) -> pd.DataFrame: frac = max(self.min_frac, self.min_num / len(df)) - print(f"Sampling {frac * 100:.2f}% of the data ({len(df)} rows)") + # print(f"Sampling {frac * 100:.2f}% of the data ({len(df)} rows)") if frac >= 1: return df return df.sample(frac=frac, random_state=1) @@ -137,11 +140,14 @@ def create_debug_data( # Traverse the folder and exclude specific file types included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".jsonl", ".bson"} files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] - total_files_count = len(files_to_process) print( f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders)." ) + file_types_count = Counter(file.suffix.lower() for file in files_to_process) + print("File type counts:") + for file_type, count in file_types_count.items(): + print(f"{file_type}: {count}") # This set will store filenames or paths that appear in the sampled data sample_used_file_names = set() @@ -150,7 +156,10 @@ def create_debug_data( data_handler = GenericDataHandler() data_reducer = dr_cls(min_frac=min_frac, min_num=min_num) - for file_path in files_to_process: + skip_subfolder_data = any(f.is_file() and f.suffix in included_extensions for f in data_folder.iterdir() if f.name.startswith(("train", "test"))) + processed_files = [] + + for file_path in tqdm(files_to_process, desc="Processing data", unit="file"): sampled_file_path = sample_folder / file_path.relative_to(data_folder) if sampled_file_path.exists(): continue @@ -158,6 +167,9 @@ def create_debug_data( if file_path.suffix.lower() not in included_extensions: continue + if skip_subfolder_data and file_path.parent != data_folder: + continue # bypass files in subfolders + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) # Load the original data @@ -165,11 +177,13 @@ def create_debug_data( # Create a sampled subset df_sampled = data_reducer.reduce(df) - + processed_files.append(file_path) # Dump the sampled data try: data_handler.dump(df_sampled, sampled_file_path) # Extract possible file references from the sampled data + if "submission" in file_path.stem: + continue # Skip submission files for col in df_sampled.columns: unique_vals = df_sampled[col].astype(str).unique() for val in unique_vals: @@ -183,13 +197,13 @@ def create_debug_data( # Process non-data files subfolder_dict = {} for file_path in files_to_process: - if file_path.suffix.lower() in included_extensions: + if file_path in processed_files: continue # Already handled above rel_dir = file_path.relative_to(data_folder).parts[0] subfolder_dict.setdefault(rel_dir, []).append(file_path) # For each subfolder, decide which files to copy - for rel_dir, file_list in subfolder_dict.items(): + for rel_dir, file_list in tqdm(subfolder_dict.items(), desc="Processing files", unit="file"): used_files = [] not_used_files = [] @@ -223,5 +237,6 @@ def create_debug_data( sampled_file_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy(nf, sampled_file_path) + final_files_count = count_files_in_folder(sample_folder) print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.") From c9ef301ea653e7646a035274a743c67c05f74450 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 9 Jan 2025 10:39:20 +0000 Subject: [PATCH 231/304] add efficiency notice --- rdagent/components/coder/data_science/ensemble/prompts.yaml | 2 +- rdagent/components/coder/data_science/feature/prompts.yaml | 4 
++-- rdagent/components/coder/data_science/model/prompts.yaml | 2 +- .../coder/data_science/raw_data_loader/prompts.yaml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index 7bc3260ae..f367efce3 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -66,7 +66,7 @@ ensemble_eval: { "execution": "Describe how well the ensemble executed, including any errors or issues encountered.", "return_checking": "Detail the checks performed on the ensemble results, including shape and value validation.", - "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", "final_decision": } user: |- diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 61858c82f..899d2952b 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -57,7 +57,7 @@ feature: feature_eval: system: |- - You are data scientist. + You are data scientist whose job is to evaluate the feature processing code generation. The feature code is: ```python @@ -75,7 +75,7 @@ feature_eval: { "execution": "Describe how well the feature processing executed, including any errors or issues encountered.", "return_checking": "Detail the checks performed on the data after feature processing, including data integrity and correctness.", - "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", "final_decision": } ``` diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 1203fa469..11e5470f0 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -119,7 +119,7 @@ model_eval: { "execution": "Describe whether the model executed successfully, including any errors or issues encountered.", "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in spec.md. You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", - "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code. It is acceptable when hyperparameters is None.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process. 
Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code. It is acceptable when hyperparameters is None.", "final_decision": } ``` diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index dcd2236b6..8c6c59c80 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -357,7 +357,7 @@ data_loader_eval: { "execution": "Describe how well the data loader executed, including any errors or issues encountered.", "return_checking": "Detail the checks performed on the data loaded, including data integrity and correctness.", - "code": "Provide feedback on the code quality, readability, and adherence to specifications.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", "final_decision": } ``` From 376d840afd57b21be20da3d10862d786d47535c1 Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 9 Jan 2025 06:47:40 +0000 Subject: [PATCH 232/304] refactor: Enhance error message with detailed feedback summary --- rdagent/components/coder/CoSTEER/evolving_agent.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index c5c95d980..e66467f32 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -17,7 +17,13 @@ def filter_evolvable_subjects_by_feedback( if evo.sub_workspace_list[index] is not None and feedback[index] is not None and not feedback[index]: evo.sub_workspace_list[index].clear() - if all(not f.final_decision for f in feedback if f): - raise CoderError("All tasks are failed") + failed_feedbacks = [ + f"- trial{index + 1}:\n - feedback:\n - execution: {f.execution}\n - return_checking: {f.return_checking}\n - code: {f.code}" + for index, f in enumerate(feedback) if f and not f.final_decision + ] + + if failed_feedbacks: + feedback_summary = "\n".join(failed_feedbacks) + raise CoderError(f"All tasks are failed:\n{feedback_summary}") return evo From 814b06e01f87f61b7db5eb544535a8f8572f9027 Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 9 Jan 2025 07:58:50 +0000 Subject: [PATCH 233/304] refactor: Simplify component handling in DSExpGen class --- .../coder/CoSTEER/evolving_agent.py | 4 +- .../coder/data_science/model/exp.py | 13 +- .../data_science/proposal/exp_gen.py | 154 +++++++++--------- 3 files changed, 85 insertions(+), 86 deletions(-) diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index e66467f32..723995292 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -19,10 +19,10 @@ def filter_evolvable_subjects_by_feedback( failed_feedbacks = [ f"- trial{index + 1}:\n - feedback:\n - execution: {f.execution}\n - return_checking: {f.return_checking}\n - code: {f.code}" - for index, f in enumerate(feedback) if f and not f.final_decision + for index, f in enumerate(feedback) if f is not None and not f.final_decision ] - if failed_feedbacks: + if len(failed_feedbacks) == len(feedback): 
feedback_summary = "\n".join(failed_feedbacks) raise CoderError(f"All tasks are failed:\n{feedback_summary}") diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index 73b917f71..f9903accf 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -18,13 +18,13 @@ def __init__( description: str, architecture: str, *args, - hyperparameters: Dict[str, str], + hyperparameters: Dict[str, str] = {}, model_type: Optional[str] = None, **kwargs, ) -> None: self.architecture: str = architecture self.hyperparameters: str = hyperparameters - self.model_type: str = ( + self.model_type: str | None = ( model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model # TODO: More Models Supported ) @@ -34,7 +34,10 @@ def get_task_information(self): task_desc = f"""name: {self.name} description: {self.description} """ - task_desc += f"architecture: {self.architecture}\n" - task_desc += f"hyperparameters: {self.hyperparameters}\n" - task_desc += f"model_type: {self.model_type}\n" + if self.architecture: + task_desc += f"architecture: {self.architecture}\n" + if self.hyperparameters: + task_desc += f"hyperparameters: {self.hyperparameters}\n" + if self.model_type: + task_desc += f"model_type: {self.model_type}\n" return task_desc diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 98bad4d94..0b1ae4b8a 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -128,93 +128,89 @@ def llm_task_gen( return resp_dict - def gen(self, trace: DSTrace) -> DSExperiment: - scenario_desc = trace.scen.get_scenario_all_desc() - last_successful_exp = trace.last_successful_exp() + def _handle_missing_component( + self, + component: str, + task_cls: type, + scenario_desc: str, + trace: Trace, + last_successful_exp: DSExperiment | None, + spec_file: str | None = None, + ) -> DSExperiment: + """Handle any component using a unified approach. + + Args: + component: Name of the component (e.g. "DataLoadSpec") + task_cls: The task class to instantiate (e.g. 
DataLoaderTask) + scenario_desc: Description of the current scenario + last_successful_exp: Last successful experiment or None + spec_file: Path to specification file if needed + """ + resp_dict = self.llm_task_gen( + targets=component, + scenario_desc=scenario_desc, + spec=last_successful_exp.experiment_workspace.file_dict[spec_file] if spec_file else None, + task_output_format=T(f".prompts:output_format.{component.lower()}").r(), + ) - if len(trace.hist) == 0 or last_successful_exp is None: - next_component = "DataLoadSpec" + # Create task instance + exp_and_feedback = trace.hist[-1] if len(trace.hist) > 0 else None + if exp_and_feedback and exp_and_feedback[1].exception is not None and exp_and_feedback[0].sub_tasks[0].name == component: # Assumption: when completing missing component, using component name as task name + desc = f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" else: - next_component = last_successful_exp.next_component_required() - - if next_component == "DataLoadSpec": - resp_dict = self.llm_task_gen( - targets="Data loader and specification generation", - scenario_desc=scenario_desc, - task_output_format=T(".prompts:output_format.data_loader").r(), - ) - dt = DataLoaderTask( - name="Data loader and specification generation", - description=resp_dict.get( - "description", "Data loader and specification generation description not provided" - ), - ) - - exp = DSExperiment(sub_tasks=[dt], hypothesis=DSHypothesis("DataLoadSpec")) - return exp - elif next_component == "FeatureEng": - resp_dict = self.llm_task_gen( - targets="Feature Engineering", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/feature.md"], - task_output_format=T(".prompts:output_format.feature").r(), - ) + desc = resp_dict.get("description", f"{component} description not provided") + task = task_cls( + name=component, + description=desc, + ) - ft = FeatureTask( - name="Feature Engineering", - description=resp_dict.get("description", "Factor description not provided"), - ) - exp = DSExperiment(sub_tasks=[ft], hypothesis=DSHypothesis("FeatureEng")) + exp = DSExperiment(sub_tasks=[task], hypothesis=DSHypothesis(component)) + if last_successful_exp: exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) - return exp - elif next_component == "Model": - resp_dict = self.llm_task_gen( - targets="Models", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/model.md"], - task_output_format=T(".prompts:output_format.model").r(), - ) + return exp - mt = ModelTask( - name=resp_dict.get("model_name", "Model name not provided"), - description=resp_dict.get("description", "Model description not provided"), - model_type=resp_dict.get("model_type", "Model type not provided"), - architecture=resp_dict.get("architecture", "Model architecture not provided"), - hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), - ) - exp = DSExperiment(sub_tasks=[mt], hypothesis=DSHypothesis("Model")) - exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) - return exp - elif next_component == "Ensemble": - resp_dict = self.llm_task_gen( - targets="Ensemble", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/ensemble.md"], 
- task_output_format=T(".prompts:output_format.ensemble").r(), - ) + def gen(self, trace: DSTrace) -> DSExperiment: + scenario_desc = trace.scen.get_scenario_all_desc() + last_successful_exp = trace.last_successful_exp() - et = EnsembleTask( - name="Ensemble", - description=resp_dict.get("description", "Ensemble description not provided"), - ) - exp = DSExperiment(sub_tasks=[et], hypothesis=DSHypothesis("Ensemble")) - exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) - return exp - elif next_component == "Workflow": - resp_dict = self.llm_task_gen( - targets="Workflow", + if len(trace.hist) == 0 or last_successful_exp is None: + next_missing_component = "DataLoadSpec" + else: + next_missing_component = last_successful_exp.next_component_required() + + component_config = { + "DataLoadSpec": { + "task_cls": DataLoaderTask, + "spec_file": None + }, + "FeatureEng": { + "task_cls": FeatureTask, + "spec_file": "spec/feature.md" + }, + "Model": { + "task_cls": ModelTask, + "spec_file": "spec/model.md", + }, + "Ensemble": { + "task_cls": EnsembleTask, + "spec_file": "spec/ensemble.md" + }, + "Workflow": { + "task_cls": WorkflowTask, + "spec_file": "spec/workflow.md" + } + } + + if next_missing_component in component_config: + config = component_config[next_missing_component] + return self._handle_missing_component( + component=next_missing_component, + task_cls=config["task_cls"], scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/workflow.md"], - task_output_format=T(".prompts:output_format.workflow").r(), - ) - - wt = WorkflowTask( - name="Workflow", - description=resp_dict.get("description", "Workflow description not provided"), + last_successful_exp=last_successful_exp, + spec_file=config.get("spec_file"), + trace=trace, ) - exp = DSExperiment(sub_tasks=[wt], hypothesis=DSHypothesis("Workflow")) - exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) - return exp else: # propose new component by LLM # Guidelines: # System prompts: Shared condition you are facing From da60a214f15a8802954d73793c60f49b167219d4 Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 9 Jan 2025 17:53:38 +0000 Subject: [PATCH 234/304] refactor: Update code structure and add docstring for clarity --- .gitignore | 1 + rdagent/components/coder/CoSTEER/evaluators.py | 1 + .../components/coder/CoSTEER/evolving_agent.py | 12 +++++++----- .../components/coder/data_science/model/exp.py | 2 +- .../scenarios/data_science/proposal/exp_gen.py | 18 +++++++++++++----- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 773e96569..95b5ceb13 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ Pipfile public release-notes.md +typescript # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index 6c28169c7..03877b4b7 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -137,6 +137,7 @@ def evaluate( class CoSTEERMultiEvaluator(Evaluator): + """This is for evaluation of experiment. 
Due to we have multiple tasks, so we will return a list of evaluation feebacks""" def __init__(self, single_evaluator: CoSTEEREvaluator, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.single_evaluator = single_evaluator diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index 723995292..ad2a09b4c 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -6,10 +6,11 @@ class FilterFailedRAGEvoAgent(RAGEvoAgent): - def filter_evolvable_subjects_by_feedback( - self, evo: EvolvableSubjects, feedback: CoSTEERSingleFeedbackDeprecated - ) -> EvolvableSubjects: + + def filter_evolvable_subjects_by_feedback(self, evo: EvolvableSubjects, + feedback: CoSTEERSingleFeedbackDeprecated) -> EvolvableSubjects: assert isinstance(evo, EvolvingItem) + # FIXME: the list does not align with the annotation; It should be MultipleFeedback instead of a list of feedbacks assert isinstance(feedback, list) assert len(evo.sub_workspace_list) == len(feedback) @@ -18,8 +19,9 @@ def filter_evolvable_subjects_by_feedback( evo.sub_workspace_list[index].clear() failed_feedbacks = [ - f"- trial{index + 1}:\n - feedback:\n - execution: {f.execution}\n - return_checking: {f.return_checking}\n - code: {f.code}" - for index, f in enumerate(feedback) if f is not None and not f.final_decision + f"- feedback{index + 1:02d}:\n - execution: {f.execution}\n - return_checking: {f.return_checking}\n - code: {f.code}" + for index, f in enumerate(feedback) + if f is not None and not f.final_decision ] if len(failed_feedbacks) == len(feedback): diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py index f9903accf..7ebd277bf 100644 --- a/rdagent/components/coder/data_science/model/exp.py +++ b/rdagent/components/coder/data_science/model/exp.py @@ -16,7 +16,7 @@ def __init__( self, name: str, description: str, - architecture: str, + architecture: str = "", *args, hyperparameters: Dict[str, str] = {}, model_type: Optional[str] = None, diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 0b1ae4b8a..5be8381ff 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -136,6 +136,7 @@ def _handle_missing_component( trace: Trace, last_successful_exp: DSExperiment | None, spec_file: str | None = None, + component_promopt_key: str | None = None, ) -> DSExperiment: """Handle any component using a unified approach. 
@@ -150,7 +151,7 @@ def _handle_missing_component( targets=component, scenario_desc=scenario_desc, spec=last_successful_exp.experiment_workspace.file_dict[spec_file] if spec_file else None, - task_output_format=T(f".prompts:output_format.{component.lower()}").r(), + task_output_format=T(f".prompts:output_format.{component_promopt_key or component.lower()}").r(), ) # Create task instance @@ -159,6 +160,7 @@ def _handle_missing_component( desc = f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" else: desc = resp_dict.get("description", f"{component} description not provided") + task = task_cls( name=component, description=desc, @@ -181,23 +183,28 @@ def gen(self, trace: DSTrace) -> DSExperiment: component_config = { "DataLoadSpec": { "task_cls": DataLoaderTask, - "spec_file": None + "spec_file": None, + "component_promopt_key": "data_loader" }, "FeatureEng": { "task_cls": FeatureTask, - "spec_file": "spec/feature.md" + "spec_file": "spec/feature.md", + "component_promopt_key": "feature" }, "Model": { "task_cls": ModelTask, "spec_file": "spec/model.md", + "component_promopt_key": "model" }, "Ensemble": { "task_cls": EnsembleTask, - "spec_file": "spec/ensemble.md" + "spec_file": "spec/ensemble.md", + "component_promopt_key": "ensemble" }, "Workflow": { "task_cls": WorkflowTask, - "spec_file": "spec/workflow.md" + "spec_file": "spec/workflow.md", + "component_promopt_key": "workflow" } } @@ -210,6 +217,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: last_successful_exp=last_successful_exp, spec_file=config.get("spec_file"), trace=trace, + component_promopt_key=config.get("component_promopt_key"), ) else: # propose new component by LLM # Guidelines: From fe88b07e24378ace6811faf8311b16b9bf3272b3 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 10 Jan 2025 06:10:09 +0000 Subject: [PATCH 235/304] reserve one sample to each label in data sampling --- rdagent/scenarios/data_science/debug/data.py | 48 +++++++++++++++++--- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 2fb09891d..ff3a4f6c2 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -1,11 +1,10 @@ -from collections import Counter import os import platform import shutil +from collections import Counter from pathlib import Path import pandas as pd - from tqdm import tqdm try: @@ -100,14 +99,46 @@ def __init__(self, min_frac=0.02, min_num=5): self.min_frac = min_frac self.min_num = min_num - def reduce(self, df: pd.DataFrame) -> pd.DataFrame: - frac = max(self.min_frac, self.min_num / len(df)) + def reduce(self, df: pd.DataFrame, frac: float = None) -> pd.DataFrame: + frac = max(self.min_frac, self.min_num / len(df)) if frac is None else frac # print(f"Sampling {frac * 100:.2f}% of the data ({len(df)} rows)") if frac >= 1: return df return df.sample(frac=frac, random_state=1) +class UniqueIDDataReducer(DataReducer): + def __init__(self, min_frac=0.02, min_num=5): + self.min_frac = min_frac + self.min_num = min_num + self.random_reducer = RandDataReducer(min_frac, min_num) + + def reduce(self, df: pd.DataFrame) -> pd.DataFrame: + if ( + not isinstance(df, pd.DataFrame) + or df.iloc[:, -1].unique().shape[0] == 0 + or df.iloc[:, -1].unique().shape[0] == df.shape[0] + ): + return self.random_reducer.reduce(df) + 
unique_labels = df.iloc[:, -1].unique() + unique_count = unique_labels.shape[0] + sampled_rows = [] + + # Sample one row from each unique label + for label in unique_labels: + sampled_row = df[df.iloc[:, -1] == label].sample(n=1, random_state=1) # random_state is optional + sampled_rows.append(sampled_row) + sampled_df = pd.concat(sampled_rows, ignore_index=True) + frac = max(self.min_frac, self.min_num / len(df)) + if int(len(df) * frac) < unique_count: + return sampled_df + else: + remain_df = df.drop(index=sampled_df.index) + return pd.concat( + [sampled_df, self.random_reducer.reduce(remain_df, frac - unique_count / len(df))] + ).sort_index() + + def count_files_in_folder(folder: Path) -> int: """ Count the total number of files in a folder, including files in subfolders. @@ -117,7 +148,7 @@ def count_files_in_folder(folder: Path) -> int: def create_debug_data( competition: str, - dr_cls: type[DataReducer] = RandDataReducer, + dr_cls: type[DataReducer] = UniqueIDDataReducer, min_frac=0.002, min_num=5, dataset_path=None, @@ -156,7 +187,7 @@ def create_debug_data( data_handler = GenericDataHandler() data_reducer = dr_cls(min_frac=min_frac, min_num=min_num) - skip_subfolder_data = any(f.is_file() and f.suffix in included_extensions for f in data_folder.iterdir() if f.name.startswith(("train", "test"))) + skip_subfolder_data = any( + f.is_file() and f.suffix in included_extensions + for f in data_folder.iterdir() + if f.name.startswith(("train", "test")) + ) processed_files = [] for file_path in tqdm(files_to_process, desc="Processing data", unit="file"): @@ -237,6 +272,5 @@ def create_debug_data( sampled_file_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy(nf, sampled_file_path) - final_files_count = count_files_in_folder(sample_folder) print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.") From a26b80e95cbda1f92e33db9fb20ca52eec9de189 Mon Sep 17 00:00:00 2001 From: Tim Date: Fri, 10 Jan 2025 06:11:45 +0000 Subject: [PATCH 236/304] add Evaluation info --- rdagent/scenarios/data_science/scen/prompts.yaml | 6 ++++++ rdagent/scenarios/data_science/scen/scen.py | 1 + 2 files changed, 7 insertions(+) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index b0601af56..cbb11e84e 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -4,6 +4,12 @@ scenario_description: |- ------The expected output & submission format specifications------ {{submission_specifications}} + + {% if evaluation is not none %} + ------Evaluation------ + {{evaluation}} + {% endif %} + The evaluation metric used is directed as: {{metric_direction}}.
competition_description_template: diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 8366dc242..c1b57f34e 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -226,6 +226,7 @@ def get_scenario_all_desc(self) -> str: return T(".prompts:scenario_description").r( background=self.background, submission_specifications=self.submission_specifications, + evaluation=self.raw_description.get("Evaluation"), metric_direction=self.metric_direction, ) From 091d687ee8fb6914c49bb5b7fb63f80e5866e751 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 10 Jan 2025 07:59:45 +0000 Subject: [PATCH 237/304] refine costeer code to avoid giving same code twice --- .../coder/data_science/ensemble/__init__.py | 16 ++++++++++++++++ .../coder/data_science/ensemble/prompts.yaml | 4 ++++ .../coder/data_science/feature/__init__.py | 16 ++++++++++++++++ .../coder/data_science/feature/prompts.yaml | 4 ++++ .../coder/data_science/model/__init__.py | 18 ++++++++++++++++++ .../coder/data_science/model/prompts.yaml | 4 ++++ .../data_science/raw_data_loader/__init__.py | 16 ++++++++++++++++ .../data_science/raw_data_loader/prompts.yaml | 4 ++++ .../coder/data_science/workflow/__init__.py | 16 ++++++++++++++++ .../coder/data_science/workflow/prompts.yaml | 4 ++++ 10 files changed, 102 insertions(+) diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index 9093f8153..e50e46fbb 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -52,6 +52,21 @@ def implement_one_task( if queried_knowledge is not None else [] ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("ensemble.py") is not None + and knowledge.implementation.file_dict.get("ensemble.py") == workspace.file_dict.get("ensemble.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("ensemble.py") != workspace.file_dict.get("ensemble.py") + ], + queried_former_failed_knowledge[1], + ) # Generate code with knowledge integration competition_info = self.scen.get_scenario_all_desc() @@ -66,6 +81,7 @@ def implement_one_task( user_prompt = T(".prompts:ensemble_coder.user").r( ensemble_spec=workspace.file_dict["spec/ensemble.md"], latest_code=workspace.file_dict.get("ensemble.py"), + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, ) for _ in range(5): diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index f367efce3..d4051c1de 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -45,6 +45,10 @@ ensemble_coder: {% if latest_code %} ---------Former code--------- {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. 
{% endif %} ensemble_eval: diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py index 56dfe03b3..6020acf72 100644 --- a/rdagent/components/coder/data_science/feature/__init__.py +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -40,6 +40,21 @@ def implement_one_task( if queried_knowledge is not None else [] ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("feature.py") is not None + and knowledge.implementation.file_dict.get("feature.py") == workspace.file_dict.get("feature.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("feature.py") != workspace.file_dict.get("feature.py") + ], + queried_former_failed_knowledge[1], + ) # 2. code system_prompt = T(".prompts:feature.system").r( @@ -51,6 +66,7 @@ def implement_one_task( user_prompt = T(".prompts:feature.user").r( feature_spec=workspace.file_dict["spec/feature.md"], latest_code=workspace.file_dict.get("feature.py"), + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, ) for _ in range(5): diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 899d2952b..2bab7d6e5 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -51,6 +51,10 @@ feature: {% if latest_code %} ---------Former code--------- {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. {% endif %} diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 2bb235ad6..fe70ff544 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -44,6 +44,23 @@ def implement_one_task( if queried_knowledge is not None else [] ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get(f"{target_task.name}.py") is not None + and knowledge.implementation.file_dict.get(f"{target_task.name}.py") + == workspace.file_dict.get(f"{target_task.name}.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get(f"{target_task.name}.py") + != workspace.file_dict.get(f"{target_task.name}.py") + ], + queried_former_failed_knowledge[1], + ) # 2. code system_prompt = T(".prompts:model_coder.system").r( @@ -66,6 +83,7 @@ def implement_one_task( workspace_code=workspace.get_codes( r"^model_(?!test)\w+\.py$" ), # TODO: If we have high failure rate here, we should clean this step with less information. 
+ latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, ) batch_edit = BatchEditOut.extract_output( diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 11e5470f0..745d3ded0 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -82,6 +82,10 @@ model_coder: So far the workspace is empty. No model code has been implemented yet. {% else %} {{ workspace_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} {% endif %} ---------Model Specification--------- When you are implementing the code, you should follow the spec diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 51d6e2868..46dfc19cd 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -68,6 +68,21 @@ def implement_one_task( if queried_knowledge is not None else [] ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("load_data.py") is not None + and knowledge.implementation.file_dict.get("load_data.py") == workspace.file_dict.get("load_data.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("load_data.py") != workspace.file_dict.get("load_data.py") + ], + queried_former_failed_knowledge[1], + ) # 1. specifications # TODO: We may move spec into a separated COSTEER task @@ -123,6 +138,7 @@ def implement_one_task( data_loader_spec=data_loader_spec, folder_spec=data_folder_info, latest_code=workspace.file_dict.get("load_data.py"), + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, ) for _ in range(5): diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 8c6c59c80..85ba44179 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -331,6 +331,10 @@ data_loader_coder: {% if latest_code %} ---------Former code--------- {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. 
{% endif %} diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 3eb7f15a6..09f4ba6d2 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -41,6 +41,21 @@ def implement_one_task( if queried_knowledge is not None else [] ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("main.py") is not None + and knowledge.implementation.file_dict.get("main.py") == workspace.file_dict.get("main.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("main.py") != workspace.file_dict.get("main.py") + ], + queried_former_failed_knowledge[1], + ) # 2. code system_prompt = T(".prompts:workflow_coder.system").r( @@ -56,6 +71,7 @@ def implement_one_task( ensemble_code=workspace.file_dict["ensemble.py"], latest_code=workspace.file_dict.get("main.py"), workflow_spec=workspace.file_dict["spec/workflow.md"], + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, ) for _ in range(5): diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index aa53b0786..b6052e154 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -72,6 +72,10 @@ workflow_coder: {% if latest_code %} ---------Former code--------- {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. 
{% endif %} From c58d5f6a33b47c142f3eadad74949e15f4dd64ea Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 10 Jan 2025 08:36:32 +0000 Subject: [PATCH 238/304] use raw_description as plain text --- rdagent/scenarios/data_science/scen/kaggle.py | 2 ++ rdagent/scenarios/data_science/scen/prompts.yaml | 5 +++-- rdagent/scenarios/data_science/scen/scen.py | 9 +++++---- rdagent/scenarios/kaggle/kaggle_crawler.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/kaggle.py b/rdagent/scenarios/data_science/scen/kaggle.py index 5390582ca..f3e9a21dc 100644 --- a/rdagent/scenarios/data_science/scen/kaggle.py +++ b/rdagent/scenarios/data_science/scen/kaggle.py @@ -24,6 +24,8 @@ def _get_description(self): return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path) def _get_direction(self): + if DS_RD_SETTING.if_using_mle_data: + return super()._get_direction() leaderboard = leaderboard_scores(self.competition) return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index cbb11e84e..0648bd5b5 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -23,9 +23,10 @@ competition_description_template: "Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'", "Brief Description": "A brief description of the competition", "Data Description": "A detailed description of the dataset used in the competition, including its source, structure, and any relevant characteristics", - "Target Description": "A description of the target variable to be predicted", - "Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output." + "Evaluation Description": "A description of the evaluation used in the competition.", + "Submission Specifications": "The submission specification & sample submission file descriptions for the model to output." "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1." + "Metric direction": True or False as True means bigger metric number is better, False means smaller is better. 
} user: |- Competition Description: diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index c1b57f34e..ec2d9c8d9 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -154,8 +154,8 @@ class DataScienceScen(Scenario): def __init__(self, competition: str) -> None: self.competition = competition self.raw_description = self._get_description() - self.metric_direction = self._get_direction() self._analysis_competition_description() + self.metric_direction = self._get_direction() def _get_description(self): if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists(): @@ -168,7 +168,7 @@ def _get_description(self): ) def _get_direction(self): - return self.raw_description.get("metric_direction", "minimize") + return self.metric_direction_guess if hasattr(self, "metric_direction_guess") else True def _analysis_competition_description(self): sys_prompt = T(".prompts:competition_description_template.system").r() @@ -187,11 +187,12 @@ def _analysis_competition_description(self): self.data_type = response_json_analysis.get("Data Type", "No data type provided") self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided") self.data_description = response_json_analysis.get("Data Description", "No data description provided") - self.target_description = response_json_analysis.get("Target Description", "No target description provided") + self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided") self.submission_specifications = response_json_analysis.get( "Submission Specifications", "No submission requirements provided" ) self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) + self.metric_direction_guess = response_json_analysis.get("Metric Direction", True) def get_competition_full_desc(self) -> str: return f"""Task Type: {self.task_type} @@ -226,7 +227,7 @@ def get_scenario_all_desc(self) -> str: return T(".prompts:scenario_description").r( background=self.background, submission_specifications=self.submission_specifications, - evaluation=self.raw_description.get("Evaluation"), + evaluation=self.target_description, metric_direction=self.metric_direction, ) diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 64aa8fbb9..acc08494f 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -36,7 +36,7 @@ def crawl_descriptions( competition: str, local_data_path: str, wait: float = 3.0, force: bool = False -) -> dict[str, str]: +) -> dict[str, str] | str: if (fp := Path(f"{local_data_path}/{competition}/description.md")).exists() and not force: logger.info(f"Found {competition}/description.md, loading from it.") return fp.read_text() From 9259839beafeef33bb76983f280a860cf3637952 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 10 Jan 2025 08:45:14 +0000 Subject: [PATCH 239/304] add a prompt hint to avoid same dict key --- rdagent/components/coder/data_science/model/__init__.py | 8 -------- rdagent/utils/agent/tpl.yaml | 4 ++-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index fe70ff544..1b4ee285f 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ 
b/rdagent/components/coder/data_science/model/__init__.py @@ -86,14 +86,6 @@ def implement_one_task( latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, ) - batch_edit = BatchEditOut.extract_output( - APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=system_prompt, - json_mode=BatchEditOut.json_mode, - ) - ) - for _ in range(5): batch_edit = BatchEditOut.extract_output( APIBackend().build_messages_and_create_chat_completion( diff --git a/rdagent/utils/agent/tpl.yaml b/rdagent/utils/agent/tpl.yaml index 4ba6b07de..377a12b5b 100644 --- a/rdagent/utils/agent/tpl.yaml +++ b/rdagent/utils/agent/tpl.yaml @@ -9,9 +9,9 @@ BatchEditOut: |- You should return a edition that applies to multiple files in a workspace in JSON. Except for the model file, other files should not be renamed. Files that do not need to be modified do not need to be included in the returned dict. - For example: - Inject the code into the folder. Your file name should always contain the suffix. + For example: + Inject the code into the folder. Your file name should always contain the suffix. Your file name keys should be unique to avoid delete or replace conflicts. { : "", // indicate writing into (create new file or replace existing file) : "__DEL__" // indicate removing file name2. When we want to replace a file to a new one, we usually use this From cbadaa537a704b26d2fb102a4447820916b69a89 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 10 Jan 2025 08:48:20 +0000 Subject: [PATCH 240/304] model task name bug in first model exp gen --- .../data_science/proposal/exp_gen.py | 22 +++++++++++-------- .../data_science/proposal/prompts.yaml | 1 - 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 5be8381ff..228a5eb78 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -130,7 +130,7 @@ def llm_task_gen( def _handle_missing_component( self, - component: str, + component: COMPONENT, task_cls: type, scenario_desc: str, trace: Trace, @@ -156,15 +156,19 @@ def _handle_missing_component( # Create task instance exp_and_feedback = trace.hist[-1] if len(trace.hist) > 0 else None - if exp_and_feedback and exp_and_feedback[1].exception is not None and exp_and_feedback[0].sub_tasks[0].name == component: # Assumption: when completing missing component, using component name as task name - desc = f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" - else: - desc = resp_dict.get("description", f"{component} description not provided") + if exp_and_feedback and exp_and_feedback[1].exception is not None and (exp_and_feedback[0].sub_tasks[0].name == component or exp_and_feedback[0].sub_tasks[0].name.startswith("model_") and component == "Model"): # Assumption: when completing missing component, using component name as task name + resp_dict["description"] = f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" - task = task_cls( - name=component, - description=desc, - ) + if component == "Model": + task = task_cls( + name=resp_dict.pop("model_name"), + **resp_dict, + ) + else: 
+ task = task_cls( + name=component, + **resp_dict, + ) exp = DSExperiment(sub_tasks=[task], hypothesis=DSHypothesis(component)) if last_successful_exp: diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index ed61c3c3e..732df7add 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -218,7 +218,6 @@ output_format: "hyperparameter_name_2": "value of hyperparameter 2", "hyperparameter_name_3": "value of hyperparameter 3" }, - "edit_strategy": } Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. ensemble: |- From 059f36abbf48c63b148cbe38a3f553ba5a3f4789 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 10 Jan 2025 09:09:15 +0000 Subject: [PATCH 241/304] fix a typo --- .../data_science/proposal/exp_gen.py | 62 +++++++++---------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 228a5eb78..9e9d7fdd5 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -136,7 +136,7 @@ def _handle_missing_component( trace: Trace, last_successful_exp: DSExperiment | None, spec_file: str | None = None, - component_promopt_key: str | None = None, + component_prompt_key: str | None = None, ) -> DSExperiment: """Handle any component using a unified approach. @@ -151,13 +151,23 @@ def _handle_missing_component( targets=component, scenario_desc=scenario_desc, spec=last_successful_exp.experiment_workspace.file_dict[spec_file] if spec_file else None, - task_output_format=T(f".prompts:output_format.{component_promopt_key or component.lower()}").r(), + task_output_format=T(f".prompts:output_format.{component_prompt_key or component.lower()}").r(), ) # Create task instance exp_and_feedback = trace.hist[-1] if len(trace.hist) > 0 else None - if exp_and_feedback and exp_and_feedback[1].exception is not None and (exp_and_feedback[0].sub_tasks[0].name == component or exp_and_feedback[0].sub_tasks[0].name.startswith("model_") and component == "Model"): # Assumption: when completing missing component, using component name as task name - resp_dict["description"] = f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" + if ( + exp_and_feedback + and exp_and_feedback[1].exception is not None + and ( + exp_and_feedback[0].sub_tasks[0].name == component + or exp_and_feedback[0].sub_tasks[0].name.startswith("model_") + and component == "Model" + ) + ): # Assumption: when completing missing component, using component name as task name + resp_dict["description"] = ( + f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" + ) if component == "Model": task = task_cls( @@ -185,31 +195,11 @@ def gen(self, trace: DSTrace) -> DSExperiment: next_missing_component = last_successful_exp.next_component_required() component_config = { - "DataLoadSpec": { - "task_cls": DataLoaderTask, - "spec_file": None, - "component_promopt_key": "data_loader" - }, - "FeatureEng": { - "task_cls": FeatureTask, - "spec_file": "spec/feature.md", - "component_promopt_key": "feature" - }, - 
"Model": { - "task_cls": ModelTask, - "spec_file": "spec/model.md", - "component_promopt_key": "model" - }, - "Ensemble": { - "task_cls": EnsembleTask, - "spec_file": "spec/ensemble.md", - "component_promopt_key": "ensemble" - }, - "Workflow": { - "task_cls": WorkflowTask, - "spec_file": "spec/workflow.md", - "component_promopt_key": "workflow" - } + "DataLoadSpec": {"task_cls": DataLoaderTask, "spec_file": None, "component_prompt_key": "data_loader"}, + "FeatureEng": {"task_cls": FeatureTask, "spec_file": "spec/feature.md", "component_prompt_key": "feature"}, + "Model": {"task_cls": ModelTask, "spec_file": "spec/model.md", "component_prompt_key": "model"}, + "Ensemble": {"task_cls": EnsembleTask, "spec_file": "spec/ensemble.md", "component_prompt_key": "ensemble"}, + "Workflow": {"task_cls": WorkflowTask, "spec_file": "spec/workflow.md", "component_prompt_key": "workflow"}, } if next_missing_component in component_config: @@ -221,7 +211,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: last_successful_exp=last_successful_exp, spec_file=config.get("spec_file"), trace=trace, - component_promopt_key=config.get("component_promopt_key"), + component_prompt_key=config.get("component_prompt_key"), ) else: # propose new component by LLM # Guidelines: @@ -239,9 +229,15 @@ def gen(self, trace: DSTrace) -> DSExperiment: # Step 1: Generate component # Describe current best solution using shared template sota_solution = trace.sota_experiment() - sota_exp_desc = T("scenarios.data_science.share:describe.exp").r(exp=last_successful_exp, heading="Best of previous exploration of the scenario") - current_exp_desc = T("scenarios.data_science.share:describe.exp").r(exp=last_exp, heading="Current exploration of the scenario") - exp_and_feedback_desc = T("scenarios.data_science.share:describe.feedback").r(exp_and_feedback=exp_and_feedback) + sota_exp_desc = T("scenarios.data_science.share:describe.exp").r( + exp=last_successful_exp, heading="Best of previous exploration of the scenario" + ) + current_exp_desc = T("scenarios.data_science.share:describe.exp").r( + exp=last_exp, heading="Current exploration of the scenario" + ) + exp_and_feedback_desc = T("scenarios.data_science.share:describe.feedback").r( + exp_and_feedback=exp_and_feedback + ) # Generate component using template with proper context component_sys_prompt = T(".prompts:component_gen.system").r( From 13ae2ae1d116f55b541f82966c44ee0930020e07 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 10 Jan 2025 10:00:36 +0000 Subject: [PATCH 242/304] add some debug info in costeer tests --- rdagent/components/coder/data_science/ensemble/eval.py | 6 +++--- .../eval_tests/{ensemble_test.py => ensemble_test.txt} | 4 ++++ .../data_science/feature/eval_tests/feature_test.py | 5 +++++ .../coder/data_science/model/eval_tests/model_test.py | 9 +++++++-- 4 files changed, 19 insertions(+), 5 deletions(-) rename rdagent/components/coder/data_science/ensemble/eval_tests/{ensemble_test.py => ensemble_test.txt} (79%) diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index 8a961c8f8..16ecab9a8 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -12,9 +12,9 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend +from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from 
rdagent.utils.env import DockerEnv, DSDockerConf -from rdagent.utils import filter_progress_bar DIRNAME = Path(__file__).absolute().resolve().parent @@ -51,8 +51,8 @@ def evaluate( } de = DockerEnv(conf=ds_docker_conf) - fname = "ensemble_test.py" - test_code = (DIRNAME / "eval_tests" / "ensemble_test.py").read_text() + fname = "ensemble_test.txt" + test_code = (DIRNAME / "eval_tests" / "ensemble_test.txt").read_text() test_code = ( Environment(undefined=StrictUndefined) .from_string(test_code) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt similarity index 79% rename from rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py rename to rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt index 935f913ad..760d40576 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.py +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt @@ -30,6 +30,10 @@ ) {% endfor %} +for key in val_preds_dict.keys(): + print(f"Model {key} validation predictions (val_preds_dict[key]) shape: {val_preds_dict[key].shape if val_preds_dict[key] is not None else 'None'}") + print(f"Model {key} test predictions (test_preds_dict[key]) shape: {test_preds_dict[key].shape if test_preds_dict[key] is not None else 'None'}") + # Run ensemble final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py index 010028019..974ccdef6 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py @@ -15,6 +15,10 @@ from load_data import load_data X, y, X_test, test_ids = load_data() +print(f"X.shape: {X.shape}") +print(f"y.shape: {y.shape}") +print(f"X_test.shape: {X_test.shape}") +print(f"test_ids length: {len(test_ids)}") X, y, X_test = feat_eng(X, y, X_test) @@ -22,6 +26,7 @@ def get_length(data): return len(data) if isinstance(data, list) else data.shape[0] + assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.py b/rdagent/components/coder/data_science/model/eval_tests/model_test.py index 092eecef6..3aed21580 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_test.py +++ b/rdagent/components/coder/data_science/model/eval_tests/model_test.py @@ -1,8 +1,9 @@ import time -from sklearn.model_selection import train_test_split -from load_data import load_data + from feature import feat_eng +from load_data import load_data from model01 import model_workflow +from sklearn.model_selection import train_test_split def log_execution_results(start_time, val_pred, test_pred, hypers, execution_label): @@ -19,6 +20,10 @@ def log_execution_results(start_time, val_pred, test_pred, hypers, execution_lab X, y, test_X, test_ids = load_data() X, y, test_X = feat_eng(X, y, test_X) train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.8, random_state=42) +print(f"train_X.shape: {train_X.shape}") +print(f"train_y.shape: {train_y.shape}") +print(f"val_X.shape: {val_X.shape}") +print(f"val_y.shape: {val_y.shape}") # First 
execution print("The first execution begins.\n") From e516199c972142ce08d4ef25e87be88e7fe11f9b Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 10 Jan 2025 10:43:26 +0000 Subject: [PATCH 243/304] task init change --- .../scenarios/data_science/proposal/exp_gen.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 228a5eb78..2a232655d 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -157,18 +157,12 @@ def _handle_missing_component( # Create task instance exp_and_feedback = trace.hist[-1] if len(trace.hist) > 0 else None if exp_and_feedback and exp_and_feedback[1].exception is not None and (exp_and_feedback[0].sub_tasks[0].name == component or exp_and_feedback[0].sub_tasks[0].name.startswith("model_") and component == "Model"): # Assumption: when completing missing component, using component name as task name - resp_dict["description"] = f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" + resp_dict["description"] += f"\nYou have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" - if component == "Model": - task = task_cls( - name=resp_dict.pop("model_name"), - **resp_dict, - ) - else: - task = task_cls( - name=component, - **resp_dict, - ) + task = task_cls( + name=component if component != "Model" else resp_dict.pop("model_name"), + **resp_dict, + ) exp = DSExperiment(sub_tasks=[task], hypothesis=DSHypothesis(component)) if last_successful_exp: From 07c5e40a126e1c219f9e13a3e078b54bc2eabcda Mon Sep 17 00:00:00 2001 From: Tim Date: Mon, 13 Jan 2025 02:54:58 +0000 Subject: [PATCH 244/304] enhance data sampling --- rdagent/scenarios/data_science/debug/data.py | 37 ++++++++++++-------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index ff3a4f6c2..4568a7299 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -116,27 +116,36 @@ def __init__(self, min_frac=0.02, min_num=5): def reduce(self, df: pd.DataFrame) -> pd.DataFrame: if ( not isinstance(df, pd.DataFrame) + or not isinstance(df.iloc[0, -1], (int, float, str, tuple, frozenset, bytes, complex, type(None))) or df.iloc[:, -1].unique().shape[0] == 0 - or df.iloc[:, -1].unique().shape[0] == df.shape[0] + or df.iloc[:, -1].unique().shape[0] >= df.shape[0] * 0.5 ): return self.random_reducer.reduce(df) unique_labels = df.iloc[:, -1].unique() + unique_labels = unique_labels[~pd.isna(unique_labels)] unique_count = unique_labels.shape[0] - sampled_rows = [] + print("Unique labels:", unique_count/ df.shape[0]) + + labels = df.iloc[:, -1] + unique_labels = labels.dropna().unique() + unique_count = len(unique_labels) + + sampled_rows = ( + df.groupby(labels, group_keys=False) + .apply(lambda x: x.sample(n=1, random_state=1)) + ) - # 从每个唯一标签中抽样一个 - for label in unique_labels: - sampled_row = df[df.iloc[:, -1] == label].sample(n=1, random_state=1) # random_state可选 - sampled_rows.append(sampled_row) - sampled_df = pd.concat(sampled_rows, ignore_index=True) frac = 
max(self.min_frac, self.min_num / len(df)) + if int(len(df) * frac) < unique_count: - return sampled_df - else: - remain_df = df.drop(index=sampled_df.index) - return pd.concat( - [sampled_df, self.random_reducer.reduce(remain_df, frac - unique_count / len(df))] - ).sort_index() + return sampled_rows.reset_index(drop=True) + + remain_df = df.drop(index=sampled_rows.index) + remaining_frac = frac - unique_count / len(df) + + remaining_sampled = self.random_reducer.reduce(remain_df, remaining_frac) + result_df = pd.concat([sampled_rows, remaining_sampled]).sort_index() + return result_df def count_files_in_folder(folder: Path) -> int: @@ -149,7 +158,7 @@ def count_files_in_folder(folder: Path) -> int: def create_debug_data( competition: str, dr_cls: type[DataReducer] = UniqueIDDataReducer, - min_frac=0.002, + min_frac=0.01, min_num=5, dataset_path=None, sample_path=None, From f7d349a1a10dc59021895985ab05f365eae64fe1 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 13 Jan 2025 03:44:49 +0000 Subject: [PATCH 245/304] refine the code in data_loader --- .../components/coder/data_science/raw_data_loader/prompts.yaml | 1 + rdagent/scenarios/data_science/scen/scen.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 85ba44179..4a53b76d6 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -290,6 +290,7 @@ data_loader_coder: Your task is described below: {{ task_desc }} You should follow the provided specifications to complete this task. + But you need to write the corresponding data loading code based on the information provided in the user's Data Folder Description, rather than relying on any suggestions that might exist in the spec. Please response the code in the following json format. Here is an example structure for the JSON output: { diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index ec2d9c8d9..45163427f 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -41,7 +41,7 @@ def get_dir_snapshot(folder_path): return frozenset(exts) -def describe_data_folder(folder_path, indent=0, max_files=3, partial_expand_subfolders=3): +def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subfolders=3): """ folder_path : Current directory path indent : Current indentation From 189330f67ea3b5f802ca561fad95f43357a1b603 Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 13 Jan 2025 08:56:36 +0000 Subject: [PATCH 246/304] more reasonable loop --- rdagent/utils/agent/tpl.yaml | 1 - rdagent/utils/workflow.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/rdagent/utils/agent/tpl.yaml b/rdagent/utils/agent/tpl.yaml index 377a12b5b..d61e465f6 100644 --- a/rdagent/utils/agent/tpl.yaml +++ b/rdagent/utils/agent/tpl.yaml @@ -4,7 +4,6 @@ PythonAgentOut: |- ``` - BatchEditOut: |- You should return a edition that applies to multiple files in a workspace in JSON. Except for the model file, other files should not be renamed. 
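The UniqueIDDataReducer change in PATCH 244 above switches from a per-label Python loop to a groupby-based sample that keeps at least one row per unique label before topping up with a random sample. A self-contained sketch of that idea, with illustrative names (stratified_min_sample is not part of RD-Agent):

import pandas as pd


def stratified_min_sample(df: pd.DataFrame, frac: float = 0.01, seed: int = 1) -> pd.DataFrame:
    # One representative row per non-null label (the label is assumed to be the last column).
    labels = df.iloc[:, -1]
    per_label = df.groupby(labels, group_keys=False).apply(lambda g: g.sample(n=1, random_state=seed))
    target_n = max(int(len(df) * frac), len(per_label))
    if target_n <= len(per_label):
        return per_label.reset_index(drop=True)
    # Top up with rows not already chosen, then restore the original order.
    remainder = df.drop(index=per_label.index).sample(n=target_n - len(per_label), random_state=seed)
    return pd.concat([per_label, remainder]).sort_index()


if __name__ == "__main__":
    demo = pd.DataFrame({"x": range(100), "label": [i % 5 for i in range(100)]})
    print(stratified_min_sample(demo, frac=0.1).shape)  # -> (10, 2)
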
diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 6648f1e0e..cd3cc008a 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -109,10 +109,10 @@ def run(self, step_n: int | None = None): li, si = self.loop_idx, self.step_idx - start = datetime.datetime.now(datetime.timezone.utc) name = self.steps[si] with logger.tag(f"Loop_{li}.{name}"): + start = datetime.datetime.now(datetime.timezone.utc) func = getattr(self, name) try: self.loop_prev_out[name] = func(self.loop_prev_out) @@ -124,14 +124,14 @@ def run(self, step_n: int | None = None): self.step_idx = len(self.steps) - 1 # directly jump to the last step. self.loop_prev_out[self.EXCEPTION_KEY] = e continue - - end = datetime.datetime.now(datetime.timezone.utc) - - self.loop_trace[li].append(LoopTrace(start, end, step_idx=si)) - - # Update tqdm progress bar directly to step_idx - pbar.n = si + 1 - pbar.set_postfix(loop_index=li, step_index=si, step_name=name) + finally: + # make sure failure steps are displayed correclty + end = datetime.datetime.now(datetime.timezone.utc) + self.loop_trace[li].append(LoopTrace(start, end, step_idx=si)) + + # Update tqdm progress bar directly to step_idx + pbar.n = si + 1 + pbar.set_postfix(loop_index=li, step_index=si + 1, step_name=name) # step_name indicate last finished step_name # index increase and save session self.step_idx = (self.step_idx + 1) % len(self.steps) From cc62e16663d8a31b3cace8f736fb0c6337e6f71c Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Mon, 13 Jan 2025 09:28:01 +0000 Subject: [PATCH 247/304] fix a bug in data folder description --- rdagent/scenarios/data_science/scen/scen.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py index 45163427f..5925263a4 100644 --- a/rdagent/scenarios/data_science/scen/scen.py +++ b/rdagent/scenarios/data_science/scen/scen.py @@ -3,6 +3,7 @@ from pathlib import Path import pandas as pd +from PIL import Image, TiffTags from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.core.scenario import Scenario @@ -54,7 +55,6 @@ def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subf for root, dirs, files in os.walk(folder_path): dirs.sort() - if not dirs: for file in files: file_path = os.path.join(root, file) @@ -139,11 +139,21 @@ def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subf result.append(" " * indent + f"- {file} ({size} bytes)") if file_type == "csv": result.append(" " * (indent + 2) + f"- Head of {file}:") - result.append(read_csv_head(path, indent + 2)) + csv_head = read_csv_head(path, indent + 2) + if len(csv_head) > 100: + csv_head = " ".join(csv_head.strip().split()) + csv_head = csv_head[:100] + "\n... 
(truncated)" + result.append(csv_head) if file_type == "md": result.append(" " * (indent + 2) + f"- Content of {file}:") with open(path, "r", encoding="utf-8") as f: result.append(f.read()) + if file_type == "tif": + result.append(" " * (indent + 2) + f"- Metadata of {file}:") + with Image.open(path) as img: + for tag, value in img.tag_v2.items(): + tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}") + result.append(" " * (indent + 4) + f"{tag_name}: {value}") return "\n".join(result) + "\n" From 39b5c2576e47ca6e54f259d3d51495e904e27fc9 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 13 Jan 2025 11:45:33 +0000 Subject: [PATCH 248/304] add error msg & traceback to execution feedback --- rdagent/components/coder/data_science/ensemble/prompts.yaml | 2 +- rdagent/components/coder/data_science/feature/prompts.yaml | 2 +- rdagent/components/coder/data_science/model/prompts.yaml | 2 +- .../components/coder/data_science/raw_data_loader/prompts.yaml | 2 +- rdagent/components/coder/data_science/workflow/prompts.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index d4051c1de..ad8b29ed5 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -68,7 +68,7 @@ ensemble_eval: You'll be given the stdout of your testing scripts. Please respond with your feedback in the following JSON format: { - "execution": "Describe how well the ensemble executed, including any errors or issues encountered.", + "execution": "Describe how well the ensemble executed, including any errors or issues encountered. Please keep the error message and tracking information", "return_checking": "Detail the checks performed on the ensemble results, including shape and value validation.", "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", "final_decision": diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 2bab7d6e5..381dee974 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -77,7 +77,7 @@ feature_eval: Please respond with your feedback in the following JSON format and order ```json { - "execution": "Describe how well the feature processing executed, including any errors or issues encountered.", + "execution": "Describe how well the feature processing executed, including any errors or issues encountered. Please keep the error message and tracking information", "return_checking": "Detail the checks performed on the data after feature processing, including data integrity and correctness.", "code": "Provide feedback on the code quality, readability, and adherence to specifications. 
Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", "final_decision": diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 745d3ded0..545dde9f9 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -121,7 +121,7 @@ model_eval: Please respond with your feedback in the following JSON format and order: ```json { - "execution": "Describe whether the model executed successfully, including any errors or issues encountered.", + "execution": "Describe whether the model executed successfully, including any errors or issues encountered. Please keep the error message and tracking information", "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in spec.md. You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process. Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code. It is acceptable when hyperparameters is None.", "final_decision": diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 4a53b76d6..b6e0dcbfb 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -360,7 +360,7 @@ data_loader_eval: Please respond with your feedback in the following JSON format and order ```json { - "execution": "Describe how well the data loader executed, including any errors or issues encountered.", + "execution": "Describe how well the data loader executed, including any errors or issues encountered. Please keep the error message and tracking information", "return_checking": "Detail the checks performed on the data loaded, including data integrity and correctness.", "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", "final_decision": diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index b6052e154..df4db1433 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -96,7 +96,7 @@ workflow_eval: Please respond with your feedback in the following JSON format and order: ```json { - "execution": "Describe whether the model executed successfully, including any errors or issues encountered.", + "execution": "Describe whether the model executed successfully, including any errors or issues encountered. Please keep the error message and tracking information", "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in the specification. 
You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", "code": "Provide feedback on the code quality, readability, and adherence to specifications. Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code.", "final_decision": From 7fab38143cc79e6434656f2daf5d04f5b3901fb6 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Mon, 13 Jan 2025 12:07:00 +0000 Subject: [PATCH 249/304] fix llm error msg detection --- rdagent/oai/llm_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index 2ac1a23cc..e2f27964f 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -567,7 +567,7 @@ def _try_create_chat_completion_or_embedding( except openai.BadRequestError as e: # noqa: PERF203 logger.warning(e) logger.warning(f"Retrying {i+1}th time...") - if "'messages' must contain the word 'json' in some form" in e.message: + if "'messages' must contain the word 'json' in some form" in e.message or "\\\'messages\\\' must contain the word \\\'json\\\' in some form" in e.message: kwargs["add_json_in_prompt"] = True elif embedding and "maximum context length" in e.message: kwargs["input_content_list"] = [ From 51c247ec191580dcfc98a9a4b5fa83a1b507eed5 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 13 Jan 2025 12:21:16 +0000 Subject: [PATCH 250/304] add task information to costeer eval & add cache to docker run(use zipfile to store the whole workspace) --- .../coder/data_science/ensemble/eval.py | 4 +- .../coder/data_science/ensemble/prompts.yaml | 2 + .../coder/data_science/feature/eval.py | 8 ++- .../coder/data_science/feature/prompts.yaml | 2 + .../coder/data_science/model/eval.py | 7 +- .../coder/data_science/model/prompts.yaml | 2 + .../data_science/raw_data_loader/eval.py | 8 ++- .../data_science/raw_data_loader/prompts.yaml | 4 +- .../coder/data_science/workflow/eval.py | 6 +- .../coder/data_science/workflow/prompts.yaml | 4 ++ rdagent/core/experiment.py | 4 -- .../scenarios/data_science/dev/feedback.py | 65 ++++++++++--------- .../data_science/experiment/workspace.py | 54 --------------- .../data_science/proposal/exp_gen.py | 4 +- rdagent/scenarios/kaggle/kaggle_crawler.py | 6 +- rdagent/utils/__init__.py | 10 +++ rdagent/utils/env.py | 65 +++++++++++++++++-- requirements.txt | 3 + 18 files changed, 152 insertions(+), 106 deletions(-) delete mode 100644 rdagent/scenarios/data_science/experiment/workspace.py diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index 16ecab9a8..6d1116218 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -67,7 +67,9 @@ def evaluate( stdout = filter_progress_bar(implementation.execute(env=de, entry=f"python {fname}")) system_prompt = T(".prompts:ensemble_eval.system").r( - test_code=test_code, code=implementation.file_dict["ensemble.py"] + task_desc=target_task_information, + test_code=test_code, + code=implementation.file_dict["ensemble.py"], ) user_prompt = T(".prompts:ensemble_eval.user").r(stdout=stdout) diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml index ad8b29ed5..6fde9d6ce 100644 --- a/rdagent/components/coder/data_science/ensemble/prompts.yaml 
+++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -54,6 +54,8 @@ ensemble_coder: ensemble_eval: system: |- You are a data scientist evaluating an ensemble implementation. + The main code generation task is as follows: + {{task_desc}} The ensemble code is: ```python diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index a98562732..7f0111420 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -44,7 +44,9 @@ def evaluate( ds_docker_conf = DSDockerConf() # TODO: we should /= 20 for the timeout period on debug component - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated temporary content? @@ -55,7 +57,9 @@ def evaluate( stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:feature_eval.system").r( - test_code=test_code, code=implementation.file_dict["feature.py"] + task_desc=target_task.get_task_information(), + test_code=test_code, + code=implementation.file_dict["feature.py"], ) user_prompt = T(".prompts:feature_eval.user").r(stdout=stdout) diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml index 381dee974..b49eae8b9 100644 --- a/rdagent/components/coder/data_science/feature/prompts.yaml +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -62,6 +62,8 @@ feature: feature_eval: system: |- You are data scientist whose job is to evaluate the feature processing code generation. + The main code generation task is as follows: + {{task_desc}} The feature code is: ```python diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 1d14f337d..857546cf1 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -13,12 +13,12 @@ CoSTEERSingleFeedback, ) from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf -from rdagent.core.exception import CoderError DIRNAME = Path(__file__).absolute().resolve().parent ModelSingleFeedback = CoSTEERSingleFeedback @@ -73,9 +73,12 @@ def evaluate( filtered_stdout = filter_progress_bar(stdout) if filtered_stdout is None: - raise CoderError("The execution output contains too many progress bars and results in the LLM's token size exceeding the limit.") + raise CoderError( + "The execution output contains too many progress bars and results in the LLM's token size exceeding the limit." 
+ ) system_prompt = T(".prompts:model_eval.system").r( + task_desc=target_task.get_task_information(), test_code=test_code, scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/model.md"], diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 545dde9f9..919a6cd79 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -97,6 +97,8 @@ model_eval: You are a data scientist. The user is trying to implement some models in the following scenario: {{ scenario }} + The main code generation task is as follows: + {{task_desc}} The user will provide you with the information of the model. The information about how to implement the model is given in spec.md as below: {{ spec }} diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index 1515b8008..ef4125055 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -48,7 +48,9 @@ def evaluate( ) ds_docker_conf = DSDockerConf() - ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } de = DockerEnv(conf=ds_docker_conf) # TODO: do we need to clean the generated temporary content? @@ -58,7 +60,9 @@ def evaluate( stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:data_loader_eval.system").r( - test_code=test_code, code=implementation.file_dict["load_data.py"] + task_desc=target_task.get_task_information(), + test_code=test_code, + code=implementation.file_dict["load_data.py"], ) user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index b6e0dcbfb..c3b3b78b7 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -344,7 +344,9 @@ data_loader_coder: data_loader_eval: system: |- - You are data scientist. + You are data scientist writing some data loader code for a Kaggle-style machine learning competition project. + The main code generation task is as follows: + {{task_desc}} The data loader code is: ```python diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index cc2a46db6..953497909 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -15,9 +15,9 @@ from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend +from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf -from rdagent.utils import filter_progress_bar DIRNAME = Path(__file__).absolute().resolve().parent @@ -85,7 +85,9 @@ def evaluate( stdout += "\nSubmission file (submission.csv) is not generated." 
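A recurring edit in PATCH 250 above is threading the task description (task_desc) into every CoSTEER evaluation prompt alongside the code under review and its test output. The sketch below shows the general shape of that prompt assembly; the template string and function name are invented for illustration, and the real templates live in the prompts.yaml files touched by this patch:

from jinja2 import StrictUndefined, Template

SYSTEM_TMPL = (
    "You are a data scientist evaluating generated code.\n"
    "The main code generation task is as follows:\n{{ task_desc }}\n\n"
    "The code under review is:\n{{ code }}\n"
)


def build_eval_prompts(task_desc: str, code: str, stdout: str) -> tuple[str, str]:
    # The system prompt now carries the task description next to the code being judged.
    system_prompt = Template(SYSTEM_TMPL, undefined=StrictUndefined).render(task_desc=task_desc, code=code)
    user_prompt = f"Execution stdout:\n{stdout}"
    return system_prompt, user_prompt


sys_p, usr_p = build_eval_prompts("Implement ens_and_decision()", "def ens_and_decision(...): ...", "ok")
print(sys_p.splitlines()[1])  # -> The main code generation task is as follows:
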
system_prompt = T(".prompts:workflow_eval.system").r( - scenario=self.scen.get_scenario_all_desc(), spec=implementation.file_dict["spec/workflow.md"] + scenario=self.scen.get_scenario_all_desc(), + task_desc=target_task.get_task_information(), + spec=implementation.file_dict["spec/workflow.md"], ) user_prompt = T(".prompts:workflow_eval.user").r( stdout=stdout, diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml index df4db1433..fd41e8bd2 100644 --- a/rdagent/components/coder/data_science/workflow/prompts.yaml +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -84,6 +84,8 @@ workflow_eval: You are a data scientist. The user is trying to build a workflow in the following scenario: {{ scenario }} + The main code generation task is as follows: + {{task_desc}} The user will provide you with the information of the workflow and its components. The information about how to build the workflow is given in the specification file as below: {{ spec }} @@ -93,6 +95,8 @@ workflow_eval: Your job is to evaluate the workflow code given by the user. You should be concerned about whether the code executes successfully, generates predictions correctly, and satisfies other requirements in the specification. The components have already been evaluated by the user, so you only need to evaluate and improve the workflow code unless there are very serious issues with the components. + Your evaluation should only consider whether the code executes successfully, generates well formatted predictions, and aligns with the target task. The performance of the model is not a concern in this task. + Please respond with your feedback in the following JSON format and order: ```json { diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 06ce05a1c..bc2502989 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -229,10 +229,6 @@ def clear(self) -> None: shutil.rmtree(self.workspace_path, ignore_errors=True) self.file_dict = {} - def hash_func(self, env: Env | None = None, entry: str | None = None) -> str: - return md5_hash(json.dumps(tuple(sorted(self.file_dict.items()))) + entry) - - @cache_with_pickle(hash_func) def execute(self, env: Env | None = None, entry: str | None = None) -> object | None: """ Before each execution, make sure to prepare and inject code diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index 86e602943..42738c479 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -1,22 +1,24 @@ +import difflib import json from pathlib import Path +from typing import List from rdagent.components.knowledge_management.graph import UndirectedNode from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts -from rdagent.core.proposal import Experiment2Feedback, HypothesisFeedback, ExperimentFeedback +from rdagent.core.proposal import ( + Experiment2Feedback, + ExperimentFeedback, + HypothesisFeedback, +) from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.data_science.experiment.experiment import DSExperiment from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace -from rdagent.utils import convert2bool +from rdagent.utils import convert2bool, remove_path_info_from_str from rdagent.utils.agent.tpl import T -from typing import List -import difflib -from pathlib 
import Path - # TODO: find a better place. def generate_diff(dir1: str, dir2: str) -> List[str]: """ @@ -33,8 +35,8 @@ def generate_diff(dir1: str, dir2: str) -> List[str]: diff_files = [] - dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob('*.py') if f.is_file()} - dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob('*.py') if f.is_file()} + dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob("*.py") if f.is_file()} + dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob("*.py") if f.is_file()} all_files = dir1_files.union(dir2_files) @@ -44,36 +46,32 @@ def generate_diff(dir1: str, dir2: str) -> List[str]: if file1.exists() and file2.exists(): with file1.open() as f1, file2.open() as f2: - diff = list(difflib.unified_diff( - f1.readlines(), - f2.readlines(), - fromfile=str(file1), - tofile=str(file2) - )) + diff = list( + difflib.unified_diff(f1.readlines(), f2.readlines(), fromfile=str(file1), tofile=str(file2)) + ) if diff: diff_files.extend(diff) else: if file1.exists(): with file1.open() as f1: - diff = list(difflib.unified_diff( - f1.readlines(), - [], - fromfile=str(file1), - tofile=str(file2) + " (empty file)" - )) + diff = list( + difflib.unified_diff( + f1.readlines(), [], fromfile=str(file1), tofile=str(file2) + " (empty file)" + ) + ) diff_files.extend(diff) elif file2.exists(): with file2.open() as f2: - diff = list(difflib.unified_diff( - [], - f2.readlines(), - fromfile=str(file1) + " (empty file)", - tofile=str(file2) - )) + diff = list( + difflib.unified_diff( + [], f2.readlines(), fromfile=str(file1) + " (empty file)", tofile=str(file2) + ) + ) diff_files.extend(diff) return diff_files + class DSExperiment2Feedback(Experiment2Feedback): def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeedback: # 用哪些信息来生成feedback @@ -83,12 +81,13 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed # 4. result 任务的结果 # 5. sota_exp.result 之前最好的结果 sota_exp = trace.sota_experiment() - sota_desc = T("scenarios.data_science.share:describe.exp").r(exp=sota_exp, heading="SOTA of previous exploration of the scenario") + sota_desc = T("scenarios.data_science.share:describe.exp").r( + exp=sota_exp, heading="SOTA of previous exploration of the scenario" + ) # Get feedback description using shared template feedback_desc = T("scenarios.data_science.share:describe.feedback").r( - exp_and_feedback=(trace.hist[-1] if trace.hist else None), - heading="Previous Trial Feedback" + exp_and_feedback=(trace.hist[-1] if trace.hist else None), heading="Previous Trial Feedback" ) # TODO: @@ -104,6 +103,14 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed else: diff_edition = [] + diff_edition = [ + remove_path_info_from_str( + exp.experiment_workspace.workspace_path, + remove_path_info_from_str(last_exp.experiment_workspace.workspace_path, line), + ) + for line in diff_edition + ] + # assumption: # The feedback should focus on experiment **improving**. 
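The feedback.py changes above reformat generate_diff around difflib.unified_diff and then scrub workspace paths out of the resulting diff lines with remove_path_info_from_str. A compact, illustrative combination of the two ideas (the names below are not the RD-Agent API):

import difflib
import re
from pathlib import Path


def workspace_diff(dir1: Path, dir2: Path) -> list[str]:
    # Union of all *.py files present in either workspace.
    files = {p.relative_to(dir1) for p in dir1.rglob("*.py")} | {p.relative_to(dir2) for p in dir2.rglob("*.py")}
    lines: list[str] = []
    for rel in sorted(files):
        a, b = dir1 / rel, dir2 / rel
        old = a.read_text().splitlines(keepends=True) if a.exists() else []
        new = b.read_text().splitlines(keepends=True) if b.exists() else []
        lines += difflib.unified_diff(old, new, fromfile=str(a), tofile=str(b))

    def scrub(line: str) -> str:
        # Replace absolute workspace prefixes so prompts do not leak local paths.
        for base in (dir1, dir2):
            line = re.sub(re.escape(str(base.absolute())), "...", line)
        return line

    return [scrub(line) for line in lines]
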
# Assume that all the the sota exp is based on the previous sota experiment diff --git a/rdagent/scenarios/data_science/experiment/workspace.py b/rdagent/scenarios/data_science/experiment/workspace.py deleted file mode 100644 index ef52eceee..000000000 --- a/rdagent/scenarios/data_science/experiment/workspace.py +++ /dev/null @@ -1,54 +0,0 @@ -from pathlib import Path -from typing import Any, List, Tuple - -import pandas as pd - -from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.core.experiment import FBWorkspace -from rdagent.log import rdagent_logger as logger -from rdagent.utils.env import DockerEnv, DSDockerConf - - -class DSFBWorkspace(FBWorkspace): - - # TODO: use the cache_with_pickle decorator. - # TODO: delete this, it is not used. - def execute(self, run_env: dict = {}, *args, **kwargs) -> pd.DataFrame: - """ - Executes the experiment(a competition) within the specified workspace. - - Args: - run_env (dict): The runtime environment variables. - - Returns: - pd.DataFrame: Scores of each Model and ensemble Model. - Example: - | Model | | - |-----------------------|--------------------------------| - | model1 | 0.9 | - | model2 | 0.8 | - | | 0.95 | - """ - logger.info(f"Running the experiment in {self.workspace_path}") - - de = DockerEnv(DSDockerConf()) - de.prepare() - - running_extra_volume = {} - if DS_RD_SETTING.competition: - running_extra_volume = {DS_RD_SETTING.local_data_path + "/" + DS_RD_SETTING.competition: "/kaggle/input"} - else: - running_extra_volume = {} - - execute_log = de.run( - local_path=str(self.workspace_path), - env=run_env, - running_extra_volume=running_extra_volume, - ) - - csv_path = self.workspace_path / "scores.csv" - - if not csv_path.exists(): - logger.error(f"File {csv_path} does not exist.") - return None - return pd.read_csv(csv_path, index_col=0) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 05dc08add..87d50cc68 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -165,7 +165,9 @@ def _handle_missing_component( and component == "Model" ) ): # Assumption: when completing missing component, using component name as task name - resp_dict["description"] += f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" + resp_dict[ + "description" + ] += f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" task = task_cls( name=component if component != "Model" else resp_dict.pop("model_name"), diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index acc08494f..265d4d214 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -127,11 +127,9 @@ def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPL if not (Path(local_path) / competition).exists() or list((Path(local_path) / competition).iterdir()) == []: (Path(local_path) / competition).mkdir(parents=True, exist_ok=True) + mleb_env.run(f"cp -r ./zip_files/{competition}/prepared/public/* ./{competition}", local_path=local_path) mleb_env.run( - f"/bin/sh -c 'cp -r ./zip_files/{competition}/prepared/public/* ./{competition}'", 
local_path=local_path - ) - mleb_env.run( - f'/bin/sh -c \'for zip_file in ./{competition}/*.zip; do dir_name="${{zip_file%.zip}}"; mkdir -p "$dir_name"; unzip -o "$zip_file" -d "$dir_name"; done\'', + f'for zip_file in ./{competition}/*.zip; do dir_name="${{zip_file%.zip}}"; mkdir -p "$dir_name"; unzip -o "$zip_file" -d "$dir_name"; done', local_path=local_path, ) # NOTE: diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py index 37b59dae8..db6eb753b 100644 --- a/rdagent/utils/__init__.py +++ b/rdagent/utils/__init__.py @@ -10,6 +10,7 @@ import json import re import sys +from pathlib import Path from types import ModuleType from typing import Union @@ -130,3 +131,12 @@ def filter_progress_bar(stdout: str) -> str: if needs_sub: return None return filtered_stdout + + +def remove_path_info_from_str(base_path: Path, target_string: str) -> str: + """ + Remove the absolute path from the target string + """ + target_string = re.sub(str(base_path), "...", target_string) + target_string = re.sub(str(base_path.absolute()), "...", target_string) + return target_string diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index e403fa11a..6b393ce57 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -14,6 +14,7 @@ import subprocess import time import uuid +import zipfile from abc import abstractmethod from pathlib import Path from typing import Generic, Optional, TypeVar @@ -29,7 +30,9 @@ from rich.table import Table from rdagent.core.conf import ExtendedBaseSettings, ExtendedSettingsConfigDict +from rdagent.core.experiment import RD_AGENT_SETTINGS from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import md5_hash ASpecificBaseModel = TypeVar("ASpecificBaseModel", bound=BaseModel) @@ -380,6 +383,58 @@ def __run( except docker.errors.APIError as e: raise RuntimeError(f"Error while running the container: {e}") + def zip_a_folder_into_a_file(self, folder_path: str, zip_file_path: str): + """ + Zip a folder into a file, use zipfile instead of subprocess + """ + with zipfile.ZipFile(zip_file_path, "w") as z: + for root, _, files in os.walk(folder_path): + for file in files: + z.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), folder_path)) + + def unzip_a_file_into_a_folder(self, zip_file_path: str, folder_path: str): + """ + Unzip a file into a folder, use zipfile instead of subprocess + """ + with zipfile.ZipFile(zip_file_path, "r") as z: + z.extractall(folder_path) + + def cached_run( + self, + entry: str | None = None, + local_path: str | None = None, + env: dict | None = None, + running_extra_volume: dict | None = None, + remove_timestamp: bool = True, + ): + """ + Run the folder under the environment. + Will cache the output and the folder diff for next round of running. + Use the python codes and the parameters(entry, running_extra_volume) as key to hash the input. 
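To make the caching described in this docstring concrete, here is a hypothetical standalone sketch: hash the workspace's *.py sources together with the run parameters, cache the captured output as a pickle, and snapshot the post-run folder as a zip so a later identical run can restore both. All names and paths below are illustrative, not the actual DockerEnv implementation.

import hashlib
import json
import pickle
import shutil
from pathlib import Path


def run_key(workspace: Path, entry: str, extra_volumes: dict | None) -> str:
    # Key = the workspace's Python sources plus the parameters that affect the run.
    sources = {str(p.relative_to(workspace)): p.read_text() for p in sorted(workspace.rglob("*.py"))}
    payload = json.dumps({"sources": sources, "entry": entry, "volumes": extra_volumes}, sort_keys=True)
    return hashlib.md5(payload.encode()).hexdigest()


def cached_run(workspace: Path, entry: str, cache_dir: Path, runner, extra_volumes: dict | None = None) -> str:
    cache_dir.mkdir(parents=True, exist_ok=True)
    key = run_key(workspace, entry, extra_volumes)
    out_fp, zip_fp = cache_dir / f"{key}.pkl", cache_dir / f"{key}.zip"
    if out_fp.exists() and zip_fp.exists():
        shutil.unpack_archive(zip_fp, workspace)       # restore files generated by the earlier run
        return pickle.loads(out_fp.read_bytes())       # restore the captured output
    stdout = runner(workspace, entry)                  # the real (e.g. Docker) execution
    out_fp.write_bytes(pickle.dumps(stdout))
    shutil.make_archive(str(zip_fp.with_suffix("")), "zip", workspace)
    return stdout
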
+ """ + target_folder = Path(RD_AGENT_SETTINGS.pickle_cache_folder_path_str) / f"utils.env.run" + target_folder.mkdir(parents=True, exist_ok=True) + key = md5_hash( + json.dumps( + { + str(path.relative_to(Path(local_path))): path.read_text() + for path in sorted(Path(local_path).rglob("*.py")) + } + ) + + json.dumps({"entry": entry, "running_extra_volume": running_extra_volume}) + + json.dumps({"extra_volumes": self.conf.extra_volumes}) + ) + if Path(target_folder / f"{key}.pkl").exists() and Path(target_folder / f"{key}.zip").exists(): + with open(target_folder / f"{key}.pkl", "rb") as f: + ret = pickle.load(f) + self.unzip_a_file_into_a_folder(target_folder / f"{key}.zip", local_path) + else: + ret = self.__run(entry, local_path, env, running_extra_volume, remove_timestamp) + with open(target_folder / f"{key}.pkl", "wb") as f: + pickle.dump(ret, f) + self.zip_a_folder_into_a_file(local_path, target_folder / f"{key}.zip") + return ret + def run( self, entry: str | None = None, @@ -389,17 +444,19 @@ def run( ): if entry is None: entry = self.conf.default_entry - entry_add_timeout = f"timeout {self.conf.running_timeout_period} {entry}" - + entry_add_timeout = ( + f"/bin/sh -c 'timeout {self.conf.running_timeout_period} {entry}; chmod -R 777 {self.conf.mount_path}'" + ) + start = time.time() - out = self.__run(entry_add_timeout, local_path, env, running_extra_volume) + out = self.cached_run(entry_add_timeout, local_path, env, running_extra_volume) end = time.time() if end - start + 1 >= self.conf.running_timeout_period: out += f"\n\nThe running time exceeds {self.conf.running_timeout_period} seconds, so the process is killed." return out - + def dump_python_code_run_and_get_results( self, code: str, diff --git a/requirements.txt b/requirements.txt index 0858575c5..413497584 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,3 +47,6 @@ nbformat # tool seaborn setuptools-scm + +#git related +zipfile \ No newline at end of file From f496bfc309e02c39b607b20e12c35248e2a46af4 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 13 Jan 2025 13:12:47 +0000 Subject: [PATCH 251/304] fix CI first round --- rdagent/app/data_science/loop.py | 10 +- .../components/coder/CoSTEER/evaluators.py | 1 + .../coder/CoSTEER/evolving_agent.py | 5 +- .../eval_tests/data_loader_test.py | 2 + rdagent/core/proposal.py | 4 +- rdagent/log/ui/llm_st.py | 2 +- rdagent/oai/backend/base.py | 1 - rdagent/oai/llm_utils.py | 5 +- rdagent/scenarios/data_science/debug/data.py | 7 +- rdagent/scenarios/data_science/dev/runner.py | 1 - .../scenarios/data_science/scen/__init__.py | 277 +++++++++++++++++- rdagent/scenarios/data_science/scen/kaggle.py | 37 --- rdagent/scenarios/data_science/scen/scen.py | 245 ---------------- rdagent/utils/agent/ret.py | 6 +- rdagent/utils/env.py | 6 +- rdagent/utils/workflow.py | 6 +- 16 files changed, 306 insertions(+), 309 deletions(-) delete mode 100644 rdagent/scenarios/data_science/scen/kaggle.py delete mode 100644 rdagent/scenarios/data_science/scen/scen.py diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 0c1ea22a5..314eda89f 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -105,7 +105,7 @@ def record(self, prev_out: dict[str, Any]): self.trace.hist.append( ( prev_out["direct_exp_gen"] if isinstance(e, CoderError) else prev_out["coding"], - ExperimentFeedback.from_exception(e) + ExperimentFeedback.from_exception(e), ) ) @@ -115,12 +115,12 @@ def main(path=None, step_n=None, 
competition="bms-molecular-translation"): Parameters ---------- - path : + path : path like `$LOG_PATH/__session__/1/0_propose`. It indicates that we restore the state that after finish the step 0 in loop1 - step_n : + step_n : How many steps to run; if None, it will run forever until error or KeyboardInterrupt - competition : - + competition : + Auto R&D Evolving loop for models in a kaggle{} scenario. You can continue running session by diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index 03877b4b7..4a329cada 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -138,6 +138,7 @@ def evaluate( class CoSTEERMultiEvaluator(Evaluator): """This is for evaluation of experiment. Due to we have multiple tasks, so we will return a list of evaluation feebacks""" + def __init__(self, single_evaluator: CoSTEEREvaluator, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.single_evaluator = single_evaluator diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index ad2a09b4c..ef610a213 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -7,8 +7,9 @@ class FilterFailedRAGEvoAgent(RAGEvoAgent): - def filter_evolvable_subjects_by_feedback(self, evo: EvolvableSubjects, - feedback: CoSTEERSingleFeedbackDeprecated) -> EvolvableSubjects: + def filter_evolvable_subjects_by_feedback( + self, evo: EvolvableSubjects, feedback: CoSTEERSingleFeedbackDeprecated + ) -> EvolvableSubjects: assert isinstance(evo, EvolvingItem) # FIXME: the list does not align with the annotation; It should be MultipleFeedback instead of a list of feedbacks assert isinstance(feedback, list) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py index 5b5b9e0d6..6c14fe10d 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py @@ -15,10 +15,12 @@ X, y, X_test, test_ids = load_data() + # Validate the conditions mentioned in the docstring def get_length(data): return len(data) if isinstance(data, list) else data.shape[0] + assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index edc50e28e..7d6de9d28 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -66,7 +66,9 @@ def __init__( ) -> None: self.decision = decision self.reason = reason - self.exception: Exception | None = exception # if the experiment raises exception, it will be integrated into part of the feedback. + self.exception: Exception | None = ( + exception # if the experiment raises exception, it will be integrated into part of the feedback. 
+ ) def __bool__(self) -> bool: return self.decision diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 054220278..cd5890255 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -158,7 +158,7 @@ def extract_evoid(tag): else: # show model codes showed_keys = [] - for k,v in rdict.items(): + for k, v in rdict.items(): if k.startswith("model_") and k.endswith(".py"): st.markdown(f":red[**{k}**]") st.code(v, language="python", wrap_lines=True, line_numbers=True) diff --git a/rdagent/oai/backend/base.py b/rdagent/oai/backend/base.py index 21ed3695e..d9b57c312 100644 --- a/rdagent/oai/backend/base.py +++ b/rdagent/oai/backend/base.py @@ -1,3 +1,2 @@ - class APIBackend: """abstract""" diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index e2f27964f..e2155976e 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -567,7 +567,10 @@ def _try_create_chat_completion_or_embedding( except openai.BadRequestError as e: # noqa: PERF203 logger.warning(e) logger.warning(f"Retrying {i+1}th time...") - if "'messages' must contain the word 'json' in some form" in e.message or "\\\'messages\\\' must contain the word \\\'json\\\' in some form" in e.message: + if ( + "'messages' must contain the word 'json' in some form" in e.message + or "\\'messages\\' must contain the word \\'json\\' in some form" in e.message + ): kwargs["add_json_in_prompt"] = True elif embedding and "maximum context length" in e.message: kwargs["input_content_list"] = [ diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py index 4568a7299..3161046d2 100644 --- a/rdagent/scenarios/data_science/debug/data.py +++ b/rdagent/scenarios/data_science/debug/data.py @@ -124,16 +124,13 @@ def reduce(self, df: pd.DataFrame) -> pd.DataFrame: unique_labels = df.iloc[:, -1].unique() unique_labels = unique_labels[~pd.isna(unique_labels)] unique_count = unique_labels.shape[0] - print("Unique labels:", unique_count/ df.shape[0]) + print("Unique labels:", unique_count / df.shape[0]) labels = df.iloc[:, -1] unique_labels = labels.dropna().unique() unique_count = len(unique_labels) - sampled_rows = ( - df.groupby(labels, group_keys=False) - .apply(lambda x: x.sample(n=1, random_state=1)) - ) + sampled_rows = df.groupby(labels, group_keys=False).apply(lambda x: x.sample(n=1, random_state=1)) frac = max(self.min_frac, self.min_num / len(df)) diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py index 20083eb76..d403b212c 100644 --- a/rdagent/scenarios/data_science/dev/runner.py +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -29,6 +29,5 @@ def develop(self, exp: DSExperiment) -> DSExperiment: logger.error("Submission file (submission.csv) is not generated.") raise RunnerError(f"Submission file (submission.csv) is not generated, log is:\n{stdout}") - exp.result = pd.read_csv(score_fp, index_col=0) return exp diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index 8aaf93146..e88ba1a64 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -1,4 +1,275 @@ -from .scen import DataScienceScen -from .kaggle import KaggleScen +import json +import os +from pathlib import Path -__all__ = ["DataScienceScen", "KaggleScen"] +import pandas as pd +from PIL import Image, TiffTags + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.core.scenario import Scenario 
+from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.kaggle.kaggle_crawler import ( + crawl_descriptions, + leaderboard_scores, +) +from rdagent.utils.agent.tpl import T + + +def read_csv_head(file_path, indent, lines=5): + try: + df = pd.read_csv(file_path, nrows=lines) + df_string_lines = df.to_string(index=False).split("\n") + for i in range(len(df_string_lines)): + df_string_lines[i] = " " * (indent) + df_string_lines[i] + return "\n".join(df_string_lines) + except Exception as e: + return f"Error reading CSV: {e}" + + +def get_dir_snapshot(folder_path): + """ + [note] + - Returns a set of file extensions within the subfolder (excluding subfolder names) + - Compares only the types of files contained, not specific file names or quantities + """ + exts = set() + try: + with os.scandir(folder_path) as it: + for entry in it: + if entry.is_file(): + file_ext = os.path.splitext(entry.name)[1] + exts.add(file_ext) + except Exception as e: + logger.error(f"Error scanning directory: {e}") + + return frozenset(exts) + + +def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subfolders=3): + """ + folder_path : Current directory path + indent : Current indentation + max_files : Maximum number of files of the same type to display + partial_expand_subfolders: When all subfolders have the same internal file types, only expand this many subfolders, the rest are omitted + """ + result = [] + files_count = {} + files_details = {} + + for root, dirs, files in os.walk(folder_path): + dirs.sort() + if not dirs: + for file in files: + file_path = os.path.join(root, file) + file_type = os.path.splitext(file)[1][1:] + file_size = os.path.getsize(file_path) + + if file_type not in files_count: + files_count[file_type] = 0 + files_details[file_type] = [] + files_count[file_type] += 1 + if len(files_details[file_type]) < max_files: + files_details[file_type].append((file, file_size, file_path)) + break + + # Collect "type snapshots" of subfolders + snapshots = [] + for d in dirs: + subfolder_path = os.path.join(root, d) + snapshot = get_dir_snapshot(subfolder_path) + snapshots.append(snapshot) + + # Determine if all subfolders have the same file type distribution + first_snapshot = snapshots[0] + all_same_structure = all(s == first_snapshot for s in snapshots) + + if all_same_structure: + for i, d in enumerate(dirs): + if i < partial_expand_subfolders: + result.append(" " * indent + f"- Folder: {d}") + subfolder_path = os.path.join(root, d) + result.append( + describe_data_folder( + folder_path=subfolder_path, + indent=indent + 2, + max_files=max_files, + partial_expand_subfolders=partial_expand_subfolders, + ) + ) + else: + remaining = len(dirs) - i + result.append(" " * indent + f"... 
({remaining} more subfolders)") + break + else: + for d in dirs: + result.append(" " * indent + f"- Folder: {d}") + subfolder_path = os.path.join(root, d) + result.append( + describe_data_folder( + folder_path=subfolder_path, + indent=indent + 2, + max_files=max_files, + partial_expand_subfolders=partial_expand_subfolders, + ) + ) + + for file in files: + file_path = os.path.join(root, file) + file_type = os.path.splitext(file)[1][1:] + file_size = os.path.getsize(file_path) + + if file_type not in files_count: + files_count[file_type] = 0 + files_details[file_type] = [] + files_count[file_type] += 1 + + if len(files_details[file_type]) < max_files: + files_details[file_type].append((file, file_size, file_path)) + + break + + # Print the folder and its contents + for file_type, count in files_count.items(): + if count > max_files: + result.append(" " * indent + f"{count} {file_type}s:") + for file, size, path in files_details[file_type]: + result.append(" " * (indent + 2) + f"- {file} ({size} bytes)") + result.append(" " * (indent + 2) + "... (file limit reached)") + else: + for file, size, path in files_details[file_type]: + if file_type == "zip": + continue + result.append(" " * indent + f"- {file} ({size} bytes)") + if file_type == "csv": + result.append(" " * (indent + 2) + f"- Head of {file}:") + csv_head = read_csv_head(path, indent + 2) + if len(csv_head) > 100: + csv_head = " ".join(csv_head.strip().split()) + csv_head = csv_head[:100] + "\n... (truncated)" + result.append(csv_head) + if file_type == "md": + result.append(" " * (indent + 2) + f"- Content of {file}:") + with open(path, "r", encoding="utf-8") as f: + result.append(f.read()) + if file_type == "tif": + result.append(" " * (indent + 2) + f"- Metadata of {file}:") + with Image.open(path) as img: + for tag, value in img.tag_v2.items(): + tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}") + result.append(" " * (indent + 4) + f"{tag_name}: {value}") + + return "\n".join(result) + "\n" + + +class DataScienceScen(Scenario): + """Data Science Scenario""" + + def __init__(self, competition: str) -> None: + self.competition = competition + self.raw_description = self._get_description() + self._analysis_competition_description() + self.metric_direction = self._get_direction() + + def _get_description(self): + if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists(): + logger.info(f"Found {self.competition}.json, loading from local file.") + with fp.open("r") as f: + return json.load(f) + else: + logger.error( + f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file." 
+ ) + + def _get_direction(self): + return self.metric_direction_guess if hasattr(self, "metric_direction_guess") else True + + def _analysis_competition_description(self): + sys_prompt = T(".prompts:competition_description_template.system").r() + user_prompt = T(".prompts:competition_description_template.user").r( + competition_raw_description=self.raw_description, + ) + + response_analysis = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=sys_prompt, + json_mode=True, + ) + + response_json_analysis = json.loads(response_analysis) + self.task_type = response_json_analysis.get("Task Type", "No type provided") + self.data_type = response_json_analysis.get("Data Type", "No data type provided") + self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided") + self.data_description = response_json_analysis.get("Data Description", "No data description provided") + self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided") + self.submission_specifications = response_json_analysis.get( + "Submission Specifications", "No submission requirements provided" + ) + self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) + self.metric_direction_guess = response_json_analysis.get("Metric Direction", True) + + def get_competition_full_desc(self) -> str: + return f"""Task Type: {self.task_type} + Data Type: {self.data_type} + Brief Description: {self.brief_description} + Data Description: {self.data_description} + Target Description: {self.target_description} + Submission Specifications: {self.submission_specifications} + Model Output Channel: {self.model_output_channel} + """ + + @property + def background(self) -> str: + background_template = T(".prompts:competition_background") + background_prompt = background_template.r( + task_type=self.task_type, + data_type=self.data_type, + brief_description=self.brief_description, + data_description=self.data_description, + target_description=self.target_description, + ) + return background_prompt + + @property + def rich_style_description(self) -> str: + return T(".prompts:rich_style_description").r( + name="Data Science", + competition=self.competition, + ) + + def get_scenario_all_desc(self) -> str: + return T(".prompts:scenario_description").r( + background=self.background, + submission_specifications=self.submission_specifications, + evaluation=self.target_description, + metric_direction=self.metric_direction, + ) + + def get_data_folder_description(self) -> str: + return describe_data_folder(Path(DS_RD_SETTING.local_data_path) / self.competition) + + +class KaggleScen(DataScienceScen): + """Kaggle Scenario + It is based on kaggle now. + - But it is not use the same interface with previous kaggle version. + - Ideally, we should reuse previous kaggle scenario. + But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. + So we start from a simple one.... 
+ """ + + def _get_description(self): + return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path) + + def _get_direction(self): + if DS_RD_SETTING.if_using_mle_data: + return super()._get_direction() + leaderboard = leaderboard_scores(self.competition) + return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" + + @property + def rich_style_description(self) -> str: + return T(".prompts:rich_style_description").r( + name="Kaggle", + competition=f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})", + ) diff --git a/rdagent/scenarios/data_science/scen/kaggle.py b/rdagent/scenarios/data_science/scen/kaggle.py deleted file mode 100644 index f3e9a21dc..000000000 --- a/rdagent/scenarios/data_science/scen/kaggle.py +++ /dev/null @@ -1,37 +0,0 @@ -import json - -from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.core.scenario import Scenario -from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.data_science.scen import DataScienceScen -from rdagent.scenarios.kaggle.kaggle_crawler import ( - crawl_descriptions, - leaderboard_scores, -) -from rdagent.utils.agent.tpl import T - - -class KaggleScen(DataScienceScen): - """Kaggle Scenario - It is based on kaggle now. - - But it is not use the same interface with previous kaggle version. - - Ideally, we should reuse previous kaggle scenario. - But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. - So we start from a simple one.... - """ - - def _get_description(self): - return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path) - - def _get_direction(self): - if DS_RD_SETTING.if_using_mle_data: - return super()._get_direction() - leaderboard = leaderboard_scores(self.competition) - return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" - - @property - def rich_style_description(self) -> str: - return T(".prompts:rich_style_description").r( - name="Kaggle", - competition=f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})", - ) diff --git a/rdagent/scenarios/data_science/scen/scen.py b/rdagent/scenarios/data_science/scen/scen.py deleted file mode 100644 index 5925263a4..000000000 --- a/rdagent/scenarios/data_science/scen/scen.py +++ /dev/null @@ -1,245 +0,0 @@ -import json -import os -from pathlib import Path - -import pandas as pd -from PIL import Image, TiffTags - -from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.core.scenario import Scenario -from rdagent.log import rdagent_logger as logger -from rdagent.oai.llm_utils import APIBackend -from rdagent.utils.agent.tpl import T - - -def read_csv_head(file_path, indent, lines=5): - try: - df = pd.read_csv(file_path, nrows=lines) - df_string_lines = df.to_string(index=False).split("\n") - for i in range(len(df_string_lines)): - df_string_lines[i] = " " * (indent) + df_string_lines[i] - return "\n".join(df_string_lines) - except Exception as e: - return f"Error reading CSV: {e}" - - -def get_dir_snapshot(folder_path): - """ - [note] - - Returns a set of file extensions within the subfolder (excluding subfolder names) - - Compares only the types of files contained, not specific file names or quantities - """ - exts = set() - try: - with os.scandir(folder_path) as it: - for entry in it: - if entry.is_file(): - file_ext = os.path.splitext(entry.name)[1] - exts.add(file_ext) - except Exception as e: - logger.error(f"Error scanning directory: {e}") - - return frozenset(exts) - - -def 
describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subfolders=3): - """ - folder_path : Current directory path - indent : Current indentation - max_files : Maximum number of files of the same type to display - partial_expand_subfolders: When all subfolders have the same internal file types, only expand this many subfolders, the rest are omitted - """ - result = [] - files_count = {} - files_details = {} - - for root, dirs, files in os.walk(folder_path): - dirs.sort() - if not dirs: - for file in files: - file_path = os.path.join(root, file) - file_type = os.path.splitext(file)[1][1:] - file_size = os.path.getsize(file_path) - - if file_type not in files_count: - files_count[file_type] = 0 - files_details[file_type] = [] - files_count[file_type] += 1 - if len(files_details[file_type]) < max_files: - files_details[file_type].append((file, file_size, file_path)) - break - - # Collect "type snapshots" of subfolders - snapshots = [] - for d in dirs: - subfolder_path = os.path.join(root, d) - snapshot = get_dir_snapshot(subfolder_path) - snapshots.append(snapshot) - - # Determine if all subfolders have the same file type distribution - first_snapshot = snapshots[0] - all_same_structure = all(s == first_snapshot for s in snapshots) - - if all_same_structure: - for i, d in enumerate(dirs): - if i < partial_expand_subfolders: - result.append(" " * indent + f"- Folder: {d}") - subfolder_path = os.path.join(root, d) - result.append( - describe_data_folder( - folder_path=subfolder_path, - indent=indent + 2, - max_files=max_files, - partial_expand_subfolders=partial_expand_subfolders, - ) - ) - else: - remaining = len(dirs) - i - result.append(" " * indent + f"... ({remaining} more subfolders)") - break - else: - for d in dirs: - result.append(" " * indent + f"- Folder: {d}") - subfolder_path = os.path.join(root, d) - result.append( - describe_data_folder( - folder_path=subfolder_path, - indent=indent + 2, - max_files=max_files, - partial_expand_subfolders=partial_expand_subfolders, - ) - ) - - for file in files: - file_path = os.path.join(root, file) - file_type = os.path.splitext(file)[1][1:] - file_size = os.path.getsize(file_path) - - if file_type not in files_count: - files_count[file_type] = 0 - files_details[file_type] = [] - files_count[file_type] += 1 - - if len(files_details[file_type]) < max_files: - files_details[file_type].append((file, file_size, file_path)) - - break - - # Print the folder and its contents - for file_type, count in files_count.items(): - if count > max_files: - result.append(" " * indent + f"{count} {file_type}s:") - for file, size, path in files_details[file_type]: - result.append(" " * (indent + 2) + f"- {file} ({size} bytes)") - result.append(" " * (indent + 2) + "... (file limit reached)") - else: - for file, size, path in files_details[file_type]: - if file_type == "zip": - continue - result.append(" " * indent + f"- {file} ({size} bytes)") - if file_type == "csv": - result.append(" " * (indent + 2) + f"- Head of {file}:") - csv_head = read_csv_head(path, indent + 2) - if len(csv_head) > 100: - csv_head = " ".join(csv_head.strip().split()) - csv_head = csv_head[:100] + "\n... 
(truncated)" - result.append(csv_head) - if file_type == "md": - result.append(" " * (indent + 2) + f"- Content of {file}:") - with open(path, "r", encoding="utf-8") as f: - result.append(f.read()) - if file_type == "tif": - result.append(" " * (indent + 2) + f"- Metadata of {file}:") - with Image.open(path) as img: - for tag, value in img.tag_v2.items(): - tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}") - result.append(" " * (indent + 4) + f"{tag_name}: {value}") - - return "\n".join(result) + "\n" - - -class DataScienceScen(Scenario): - """Data Science Scenario""" - - def __init__(self, competition: str) -> None: - self.competition = competition - self.raw_description = self._get_description() - self._analysis_competition_description() - self.metric_direction = self._get_direction() - - def _get_description(self): - if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists(): - logger.info(f"Found {self.competition}.json, loading from local file.") - with fp.open("r") as f: - return json.load(f) - else: - logger.error( - f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file." - ) - - def _get_direction(self): - return self.metric_direction_guess if hasattr(self, "metric_direction_guess") else True - - def _analysis_competition_description(self): - sys_prompt = T(".prompts:competition_description_template.system").r() - user_prompt = T(".prompts:competition_description_template.user").r( - competition_raw_description=self.raw_description, - ) - - response_analysis = APIBackend().build_messages_and_create_chat_completion( - user_prompt=user_prompt, - system_prompt=sys_prompt, - json_mode=True, - ) - - response_json_analysis = json.loads(response_analysis) - self.task_type = response_json_analysis.get("Task Type", "No type provided") - self.data_type = response_json_analysis.get("Data Type", "No data type provided") - self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided") - self.data_description = response_json_analysis.get("Data Description", "No data description provided") - self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided") - self.submission_specifications = response_json_analysis.get( - "Submission Specifications", "No submission requirements provided" - ) - self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) - self.metric_direction_guess = response_json_analysis.get("Metric Direction", True) - - def get_competition_full_desc(self) -> str: - return f"""Task Type: {self.task_type} - Data Type: {self.data_type} - Brief Description: {self.brief_description} - Data Description: {self.data_description} - Target Description: {self.target_description} - Submission Specifications: {self.submission_specifications} - Model Output Channel: {self.model_output_channel} - """ - - @property - def background(self) -> str: - background_template = T(".prompts:competition_background") - background_prompt = background_template.r( - task_type=self.task_type, - data_type=self.data_type, - brief_description=self.brief_description, - data_description=self.data_description, - target_description=self.target_description, - ) - return background_prompt - - @property - def rich_style_description(self) -> str: - return T(".prompts:rich_style_description").r( - name="Data Science", - competition=self.competition, - ) - - def get_scenario_all_desc(self) -> str: - return 
T(".prompts:scenario_description").r( - background=self.background, - submission_specifications=self.submission_specifications, - evaluation=self.target_description, - metric_direction=self.metric_direction, - ) - - def get_data_folder_description(self) -> str: - return describe_data_folder(Path(DS_RD_SETTING.local_data_path) / self.competition) diff --git a/rdagent/utils/agent/ret.py b/rdagent/utils/agent/ret.py index f84f1a8db..6189af24a 100644 --- a/rdagent/utils/agent/ret.py +++ b/rdagent/utils/agent/ret.py @@ -4,8 +4,8 @@ We think this part can be shared. """ -import re import json +import re from abc import abstractclassmethod from typing import Any @@ -13,7 +13,8 @@ class AgentOut: - json_mode: bool = False # To get the output, is json_mode required. + json_mode: bool = False # To get the output, is json_mode required. + @abstractclassmethod def get_spec(cls, **context: Any) -> str: raise NotImplementedError(f"Please implement the `get_spec` method") @@ -35,6 +36,7 @@ def extract_output(cls, resp: str): code = match.group(1) return code + class BatchEditOut(AgentOut): json_mode: bool = True diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index 6b393ce57..da819166a 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -416,10 +416,10 @@ def cached_run( target_folder.mkdir(parents=True, exist_ok=True) key = md5_hash( json.dumps( - { - str(path.relative_to(Path(local_path))): path.read_text() + [ + [str(path.relative_to(Path(local_path))), path.read_text()] for path in sorted(Path(local_path).rglob("*.py")) - } + ] ) + json.dumps({"entry": entry, "running_extra_volume": running_extra_volume}) + json.dumps({"extra_volumes": self.conf.extra_volumes}) diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index cd3cc008a..11305916d 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -75,6 +75,7 @@ class LoopBase: Assumption: - The last step is responsible for recording information!!!! 
""" + steps: list[Callable] # a list of steps to work on loop_trace: dict[int, list[LoopTrace]] @@ -109,7 +110,6 @@ def run(self, step_n: int | None = None): li, si = self.loop_idx, self.step_idx - name = self.steps[si] with logger.tag(f"Loop_{li}.{name}"): start = datetime.datetime.now(datetime.timezone.utc) @@ -131,7 +131,9 @@ def run(self, step_n: int | None = None): # Update tqdm progress bar directly to step_idx pbar.n = si + 1 - pbar.set_postfix(loop_index=li, step_index=si + 1, step_name=name) # step_name indicate last finished step_name + pbar.set_postfix( + loop_index=li, step_index=si + 1, step_name=name + ) # step_name indicate last finished step_name # index increase and save session self.step_idx = (self.step_idx + 1) % len(self.steps) From 938bcac29393a45af6a56e860ec00c217f43786b Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 13 Jan 2025 16:01:32 +0000 Subject: [PATCH 252/304] fix CI second round --- pyproject.toml | 4 + rdagent/components/coder/model_coder/model.py | 2 +- rdagent/core/evaluation.py | 4 +- rdagent/core/evolving_agent.py | 2 +- rdagent/core/evolving_framework.py | 2 +- rdagent/core/experiment.py | 20 +-- rdagent/core/proposal.py | 6 +- rdagent/core/scenario.py | 3 +- rdagent/oai/llm_utils.py | 168 ++++++++---------- .../scenarios/kaggle/experiment/workspace.py | 2 +- rdagent/utils/__init__.py | 11 +- rdagent/utils/agent/tpl.py | 7 +- rdagent/utils/env.py | 61 ++++--- 13 files changed, 147 insertions(+), 145 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5a3fbdb4a..3f7300489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,10 @@ explicit_package_bases = true warn_return_any = true warn_unused_ignores = true +[[tool.mypy.overrides]] +ignore_missing_imports = true +module = "llama" + [tool.pytest.ini_options] addopts = "-l -s --durations=0" log_cli = true diff --git a/rdagent/components/coder/model_coder/model.py b/rdagent/components/coder/model_coder/model.py index e19eeed36..4db6a8a3f 100644 --- a/rdagent/components/coder/model_coder/model.py +++ b/rdagent/components/coder/model_coder/model.py @@ -123,7 +123,7 @@ def execute( env={}, code_dump_file_py_name="model_test", ) - if results is None: + if len(results) == 0: raise RuntimeError(f"Error in running the model code: {log}") [execution_feedback_str, execution_model_output] = results diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py index ce34975b7..e8720a8d4 100644 --- a/rdagent/core/evaluation.py +++ b/rdagent/core/evaluation.py @@ -14,10 +14,8 @@ class Feedback: The building process of feedback will should be in evaluator """ - pass - def __bool__(self) -> bool: - super().__bool__() + return True class Evaluator(ABC): diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index 2f8ea1c14..4d7e8e5c4 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -31,7 +31,7 @@ def multistep_evolve( def filter_evolvable_subjects_by_feedback( self, evo: EvolvableSubjects, - feedback: Feedback | None, + feedback: Feedback | list[Feedback] | None, ) -> EvolvableSubjects: ... 
diff --git a/rdagent/core/evolving_framework.py b/rdagent/core/evolving_framework.py index 24c7c6ae7..9ff874482 100644 --- a/rdagent/core/evolving_framework.py +++ b/rdagent/core/evolving_framework.py @@ -52,7 +52,7 @@ class EvoStep: evolvable_subjects: EvolvableSubjects queried_knowledge: QueriedKnowledge | None = None - feedback: Feedback | None = None + feedback: Feedback | list[Feedback] | None = None class EvolvingStrategy(ABC): diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index bc2502989..b9dc17af9 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json import os import platform import re @@ -11,15 +10,13 @@ from collections.abc import Sequence from copy import deepcopy from pathlib import Path -from typing import Any, Generic, Optional, TypeVar +from typing import Any, Generic, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS -from rdagent.core.utils import cache_with_pickle -from rdagent.oai.llm_utils import md5_hash -from rdagent.utils.env import Env if typing.TYPE_CHECKING: from rdagent.core.proposal import Hypothesis + from rdagent.utils.env import Env """ This file contains the all the class about organizing the task in RD-Agent. @@ -83,7 +80,6 @@ def all_codes(self) -> str: """ Get all the code files in the workspace as a single string. """ - pass ASpecificWS = TypeVar("ASpecificWS", bound=Workspace) @@ -182,8 +178,10 @@ def inject_files(self, **files: str) -> None: """ Inject the code into the folder. { - : , // indicate writing into (create new file or replace existing file) - : "__DEL__" // indicate removing file name2. When we want to replace a file to a new one, we usually use this + : , // indicate writing into + (create new file or replace existing file) + : "__DEL__" // indicate removing file name2. When we want to replace a file to a new one, + we usually use this } """ self.prepare() @@ -237,7 +235,7 @@ def execute(self, env: Env | None = None, entry: str | None = None) -> object | self.inject_files(**self.file_dict) # TODO: env should be not None in new design (no code can run without environment) if env is not None and entry is not None: - return env.run(entry, self.workspace_path) + return env.run(entry, str(self.workspace_path)) return None def __str__(self) -> str: @@ -262,9 +260,9 @@ def __init__( self, sub_tasks: Sequence[ASpecificTask], based_experiments: Sequence[ASpecificWSForExperiment] = [], - hypothesis: Optional["Hypothesis"] = None, + hypothesis: Hypothesis | None = None, ) -> None: - self.hypothesis: Optional["Hypothesis"] = hypothesis # Experiment is optionally generated by hypothesis + self.hypothesis: Hypothesis | None = hypothesis # Experiment is optionally generated by hypothesis self.sub_tasks: Sequence[ASpecificTask] = sub_tasks self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks) # TODO: diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index 7d6de9d28..0739dc82c 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -81,7 +81,7 @@ def from_exception(cls, e: Exception) -> ExperimentFeedback: """ A convenient method to create Feedback from an exception. 
""" - return cls(False, f"The experiment fails due to {str(e)}", e) + return cls(decision=False, reason=f"The experiment fails due to {e!s}", exception=e) class HypothesisFeedback(ExperimentFeedback): @@ -119,9 +119,9 @@ def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = Non def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]: """Access the last experiment result, sub-task, and the corresponding hypothesis.""" # TODO: The return value does not align with the signature. - for hypothesis, experiment, feedback in self.hist[::-1]: + for experiment, feedback in self.hist[::-1]: if feedback.decision: - return hypothesis, experiment + return experiment.hypothesis, experiment return None, None diff --git a/rdagent/core/scenario.py b/rdagent/core/scenario.py index 0fe2dfbf5..a9ff6b83f 100644 --- a/rdagent/core/scenario.py +++ b/rdagent/core/scenario.py @@ -31,7 +31,8 @@ def source_data(self) -> str: return self.get_source_data_desc() # NOTE: we should keep the interface simpler. So some previous interfaces are deleted. - # If we need some specific function only used in the subclass(no exeternal usage). We should not set them in the base class + # If we need some specific function only used in the subclass(no external usage). + # We should not set them in the base class @property @abstractmethod diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index e2155976e..16b8a4553 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -124,17 +124,13 @@ def chat_get(self, key: str) -> str | None: md5_key = md5_hash(key) self.c.execute("SELECT chat FROM chat_cache WHERE md5_key=?", (md5_key,)) result = self.c.fetchone() - if result is None: - return None - return result[0] + return None if result is None else result[0] def embedding_get(self, key: str) -> list | dict | str | None: md5_key = md5_hash(key) self.c.execute("SELECT embedding FROM embedding_cache WHERE md5_key=?", (md5_key,)) result = self.c.fetchone() - if result is None: - return None - return json.loads(result[0]) + return None if result is None else json.loads(result[0]) def chat_set(self, key: str, value: str) -> None: md5_key = md5_hash(key) @@ -143,6 +139,7 @@ def chat_set(self, key: str, value: str) -> None: (md5_key, value), ) self.conn.commit() + return None def embedding_set(self, content_to_embedding_dict: dict) -> None: for key, value in content_to_embedding_dict.items(): @@ -153,19 +150,18 @@ def embedding_set(self, content_to_embedding_dict: dict) -> None: ) self.conn.commit() - def message_get(self, conversation_id: str) -> list[str]: + def message_get(self, conversation_id: str) -> list[dict[str, Any]]: self.c.execute("SELECT message FROM message_cache WHERE conversation_id=?", (conversation_id,)) result = self.c.fetchone() - if result is None: - return [] - return json.loads(result[0]) + return [] if result is None else json.loads(result[0]) - def message_set(self, conversation_id: str, message_value: list[str]) -> None: + def message_set(self, conversation_id: str, message_value: list[dict[str, Any]]) -> None: self.c.execute( "INSERT OR REPLACE INTO message_cache (conversation_id, message) VALUES (?, ?)", (conversation_id, json.dumps(message_value)), ) self.conn.commit() + return None class SessionChatHistoryCache(SingletonBaseClass): @@ -173,10 +169,10 @@ def __init__(self) -> None: """load all history conversation json file from self.session_cache_location""" self.cache = 
SQliteLazyCache(cache_location=LLM_SETTINGS.prompt_cache_path) - def message_get(self, conversation_id: str) -> list[str]: + def message_get(self, conversation_id: str) -> list[dict[str, Any]]: return self.cache.message_get(conversation_id) - def message_set(self, conversation_id: str, message_value: list[str]) -> None: + def message_set(self, conversation_id: str, message_value: list[dict[str, Any]]) -> None: self.cache.message_set(conversation_id, message_value) @@ -203,7 +199,7 @@ def build_chat_completion_message_and_calculate_token(self, user_prompt: str) -> messages = self.build_chat_completion_message(user_prompt) return self.api_backend.calculate_token_from_messages(messages) - def build_chat_completion(self, user_prompt: str, **kwargs: Any) -> str: + def build_chat_completion(self, user_prompt: str, *args, **kwargs) -> str: # type: ignore[no-untyped-def] """ this function is to build the session messages user prompt should always be provided @@ -211,7 +207,8 @@ def build_chat_completion(self, user_prompt: str, **kwargs: Any) -> str: messages = self.build_chat_completion_message(user_prompt) with logger.tag(f"session_{self.conversation_id}"): - response = self.api_backend._try_create_chat_completion_or_embedding( # noqa: SLF001 + response: str = self.api_backend._try_create_chat_completion_or_embedding( # noqa: SLF001 + *args, messages=messages, chat_completion=True, **kwargs, @@ -265,7 +262,7 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.generator = Llama.build( ckpt_dir=LLM_SETTINGS.llama2_ckpt_dir, tokenizer_path=LLM_SETTINGS.llama2_tokenizer_path, - max_seq_len=LLM_SETTINGS.max_tokens, + max_seq_len=LLM_SETTINGS.chat_max_tokens, max_batch_size=LLM_SETTINGS.llams2_max_batch_size, ) self.encoder = None @@ -308,11 +305,8 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.chat_model = LLM_SETTINGS.chat_model if chat_model is None else chat_model self.encoder = None else: - if LLM_SETTINGS.use_azure: - self.chat_use_azure = self.embedding_use_azure = LLM_SETTINGS.use_azure - else: - self.chat_use_azure = LLM_SETTINGS.chat_use_azure - self.embedding_use_azure = LLM_SETTINGS.embedding_use_azure + self.chat_use_azure = LLM_SETTINGS.chat_use_azure or LLM_SETTINGS.use_azure + self.embedding_use_azure = LLM_SETTINGS.embedding_use_azure or LLM_SETTINGS.use_azure self.chat_use_azure_token_provider = LLM_SETTINGS.chat_use_azure_token_provider self.embedding_use_azure_token_provider = LLM_SETTINGS.embedding_use_azure_token_provider self.managed_identity_client_id = LLM_SETTINGS.managed_identity_client_id @@ -363,39 +357,27 @@ def __init__( # noqa: C901, PLR0912, PLR0915 credential, "https://cognitiveservices.azure.com/.default", ) - if self.chat_use_azure: - if self.chat_use_azure_token_provider: - self.chat_client = openai.AzureOpenAI( - azure_ad_token_provider=token_provider, - api_version=self.chat_api_version, - azure_endpoint=self.chat_api_base, - ) - else: - self.chat_client = openai.AzureOpenAI( - api_key=self.chat_api_key, - api_version=self.chat_api_version, - azure_endpoint=self.chat_api_base, - ) - else: - self.chat_client = openai.OpenAI(api_key=self.chat_api_key, base_url=self.chat_openai_base_url) - - if self.embedding_use_azure: - if self.embedding_use_azure_token_provider: - self.embedding_client = openai.AzureOpenAI( - azure_ad_token_provider=token_provider, - api_version=self.embedding_api_version, - azure_endpoint=self.embedding_api_base, - ) - else: - self.embedding_client = openai.AzureOpenAI( - api_key=self.embedding_api_key, - 
api_version=self.embedding_api_version, - azure_endpoint=self.embedding_api_base, - ) - else: - self.embedding_client = openai.OpenAI( - api_key=self.embedding_api_key, base_url=self.embedding_openai_base_url + self.chat_client: openai.OpenAI = ( + openai.AzureOpenAI( + azure_ad_token_provider=token_provider if self.chat_use_azure_token_provider else None, + api_key=self.chat_api_key if not self.chat_use_azure_token_provider else None, + api_version=self.chat_api_version, + azure_endpoint=self.chat_api_base, ) + if self.chat_use_azure + else openai.OpenAI(api_key=self.chat_api_key, base_url=self.chat_openai_base_url) + ) + + self.embedding_client: openai.OpenAI = ( + openai.AzureOpenAI( + azure_ad_token_provider=token_provider if self.embedding_use_azure_token_provider else None, + api_key=self.embedding_api_key if not self.embedding_use_azure_token_provider else None, + api_version=self.embedding_api_version, + azure_endpoint=self.embedding_api_base, + ) + if self.embedding_use_azure + else openai.OpenAI(api_key=self.embedding_api_key, base_url=self.embedding_openai_base_url) + ) self.dump_chat_cache = LLM_SETTINGS.dump_chat_cache if dump_chat_cache is None else dump_chat_cache self.use_chat_cache = LLM_SETTINGS.use_chat_cache if use_chat_cache is None else use_chat_cache @@ -414,7 +396,7 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.use_gcr_endpoint = LLM_SETTINGS.use_gcr_endpoint self.retry_wait_seconds = LLM_SETTINGS.retry_wait_seconds - def _get_encoder(self): + def _get_encoder(self) -> tiktoken.Encoding: """ tiktoken.encoding_for_model(self.chat_model) does not cover all cases it should consider. @@ -431,15 +413,16 @@ def _azure_patch(model: str) -> str: model = self.chat_model try: - return tiktoken.encoding_for_model(model) + encoding = tiktoken.encoding_for_model(model) except KeyError: logger.warning(f"Failed to get encoder. 
Trying to patch the model name") for patch_func in [_azure_patch]: try: - return tiktoken.encoding_for_model(patch_func(model)) + encoding = tiktoken.encoding_for_model(patch_func(model)) except KeyError: logger.error(f"Failed to get encoder even after patching with {patch_func.__name__}") raise + return encoding def build_chat_session( self, @@ -456,10 +439,10 @@ def build_messages( self, user_prompt: str, system_prompt: str | None = None, - former_messages: list[dict] | None = None, + former_messages: list[dict[str, Any]] | None = None, *, shrink_multiple_break: bool = False, - ) -> list[dict]: + ) -> list[dict[str, Any]]: """ build the messages to avoid implementing several redundant lines of code @@ -489,15 +472,15 @@ def build_messages( ) return messages - def build_messages_and_create_chat_completion( + def build_messages_and_create_chat_completion( # type: ignore[no-untyped-def] self, user_prompt: str, system_prompt: str | None = None, former_messages: list | None = None, chat_cache_prefix: str = "", - *, shrink_multiple_break: bool = False, - **kwargs: Any, + *args, + **kwargs, ) -> str: if former_messages is None: former_messages = [] @@ -508,32 +491,36 @@ def build_messages_and_create_chat_completion( shrink_multiple_break=shrink_multiple_break, ) - resp = self._try_create_chat_completion_or_embedding( + resp = self._try_create_chat_completion_or_embedding( # type: ignore[misc] + *args, messages=messages, chat_completion=True, chat_cache_prefix=chat_cache_prefix, **kwargs, ) + if isinstance(resp, list): + raise ValueError("The response of _try_create_chat_completion_or_embedding should be a string.") logger.log_object({"system": system_prompt, "user": user_prompt, "resp": resp}, tag="debug_llm") return resp - def create_embedding(self, input_content: str | list[str], **kwargs: Any) -> list[Any] | Any: + def create_embedding(self, input_content: str | list[str], *args, **kwargs) -> list[Any] | Any: # type: ignore[no-untyped-def] input_content_list = [input_content] if isinstance(input_content, str) else input_content - resp = self._try_create_chat_completion_or_embedding( + resp = self._try_create_chat_completion_or_embedding( # type: ignore[misc] input_content_list=input_content_list, embedding=True, + *args, **kwargs, ) if isinstance(input_content, str): return resp[0] return resp - def _create_chat_completion_auto_continue(self, messages: list, **kwargs: dict) -> str: + def _create_chat_completion_auto_continue(self, messages: list[dict[str, Any]], *args, **kwargs) -> str: # type: ignore[no-untyped-def] """ Call the chat completion function and automatically continue the conversation if the finish_reason is length. TODO: This function only continues once, maybe need to continue more than once in the future. 
""" - response, finish_reason = self._create_chat_completion_inner_function(messages=messages, **kwargs) + response, finish_reason = self._create_chat_completion_inner_function(messages, *args, **kwargs) if finish_reason == "length": new_message = deepcopy(messages) @@ -544,28 +531,28 @@ def _create_chat_completion_auto_continue(self, messages: list, **kwargs: dict) "content": "continue the former output with no overlap", }, ) - new_response, finish_reason = self._create_chat_completion_inner_function(messages=new_message, **kwargs) + new_response, finish_reason = self._create_chat_completion_inner_function(new_message, *args, **kwargs) return response + new_response return response - def _try_create_chat_completion_or_embedding( + def _try_create_chat_completion_or_embedding( # type: ignore[no-untyped-def] self, max_retry: int = 10, - *, chat_completion: bool = False, embedding: bool = False, - **kwargs: Any, - ) -> Any: + *args, + **kwargs, + ) -> str | list[float]: assert not (chat_completion and embedding), "chat_completion and embedding cannot be True at the same time" max_retry = LLM_SETTINGS.max_retry if LLM_SETTINGS.max_retry is not None else max_retry for i in range(max_retry): try: if embedding: - return self._create_embedding_inner_function(**kwargs) + return self._create_embedding_inner_function(*args, **kwargs) if chat_completion: - return self._create_chat_completion_auto_continue(**kwargs) + return self._create_chat_completion_auto_continue(*args, **kwargs) except openai.BadRequestError as e: # noqa: PERF203 - logger.warning(e) + logger.warning(str(e)) logger.warning(f"Retrying {i+1}th time...") if ( "'messages' must contain the word 'json' in some form" in e.message @@ -577,14 +564,14 @@ def _try_create_chat_completion_or_embedding( content[: len(content) // 2] for content in kwargs.get("input_content_list", []) ] except Exception as e: # noqa: BLE001 - logger.warning(e) + logger.warning(str(e)) logger.warning(f"Retrying {i+1}th time...") time.sleep(self.retry_wait_seconds) error_message = f"Failed to create chat completion after {max_retry} retries." 
raise RuntimeError(error_message) - def _create_embedding_inner_function( - self, input_content_list: list[str], **kwargs: Any + def _create_embedding_inner_function( # type: ignore[no-untyped-def] + self, input_content_list: list[str], *args, **kwargs ) -> list[Any]: # noqa: ARG002 content_to_embedding_dict = {} filtered_input_content_list = [] @@ -620,7 +607,7 @@ def _create_embedding_inner_function( self.cache.embedding_set(content_to_embedding_dict) return [content_to_embedding_dict[content] for content in input_content_list] - def _build_log_messages(self, messages: list[dict]) -> str: + def _build_log_messages(self, messages: list[dict[str, Any]]) -> str: log_messages = "" for m in messages: log_messages += ( @@ -631,19 +618,20 @@ def _build_log_messages(self, messages: list[dict]) -> str: ) return log_messages - def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 + def _create_chat_completion_inner_function( # type: ignore[no-untyped-def] # noqa: C901, PLR0912, PLR0915 self, - messages: list[dict], + messages: list[dict[str, Any]], temperature: float | None = None, max_tokens: int | None = None, chat_cache_prefix: str = "", frequency_penalty: float | None = None, presence_penalty: float | None = None, - *, json_mode: bool = False, add_json_in_prompt: bool = False, seed: Optional[int] = None, - ) -> str: + *args, + **kwargs, + ) -> tuple[str, str | None]: """ seed : Optional[int] When retrying with cache enabled, it will keep returning the same results. @@ -689,7 +677,7 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 finish_reason = None if self.use_llama2: response = self.generator.chat_completion( - messages, # type: ignore + messages, max_gen_len=max_tokens, temperature=temperature, ) @@ -718,7 +706,7 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 if LLM_SETTINGS.log_llm_chat_content: logger.info(f"{LogColors.CYAN}Response:{resp}{LogColors.END}", tag="llm_messages") else: - kwargs = dict( + call_kwargs = dict( model=model, messages=messages, max_tokens=max_tokens, @@ -734,8 +722,8 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 message["content"] = message["content"] + "\nPlease respond in json format." 
if message["role"] == "system": break - kwargs["response_format"] = {"type": "json_object"} - response = self.chat_client.chat.completions.create(**kwargs) + call_kwargs["response_format"] = {"type": "json_object"} + response = self.chat_client.chat.completions.create(**call_kwargs) if self.chat_stream: resp = "" @@ -781,7 +769,9 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 self.cache.chat_set(input_content_json, resp) return resp, finish_reason - def calculate_token_from_messages(self, messages: list[dict]) -> int: + def calculate_token_from_messages(self, messages: list[dict[str, Any]]) -> int: + if self.encoder is None: + raise ValueError("Encoder is not initialized.") if self.use_llama2 or self.use_gcr_endpoint: logger.warning("num_tokens_from_messages() is not implemented for model llama2.") return 0 # TODO implement this function for llama2 @@ -806,7 +796,7 @@ def build_messages_and_calculate_token( self, user_prompt: str, system_prompt: str | None, - former_messages: list[dict] | None = None, + former_messages: list[dict[str, Any]] | None = None, *, shrink_multiple_break: bool = False, ) -> int: @@ -837,4 +827,4 @@ def calculate_embedding_distance_between_str_list( target_embeddings_np = target_embeddings_np / np.linalg.norm(target_embeddings_np, axis=1, keepdims=True) similarity_matrix = np.dot(source_embeddings_np, target_embeddings_np.T) - return similarity_matrix.tolist() + return similarity_matrix.tolist() # type: ignore[no-any-return] diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index e8ffc084b..5d4b0497f 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -62,7 +62,7 @@ def generate_preprocess_data( else None ), ) - if results is None: + if len(results) == 0: logger.error("Feature preprocess failed.") raise Exception("Feature preprocess failed.") else: diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py index db6eb753b..aef26ef4c 100644 --- a/rdagent/utils/__init__.py +++ b/rdagent/utils/__init__.py @@ -19,7 +19,7 @@ from rdagent.utils.agent.tpl import T -def get_module_by_module_path(module_path: Union[str, ModuleType]): +def get_module_by_module_path(module_path: Union[str, ModuleType]) -> ModuleType: """Load module from path like a/b/c/d.py or a.b.c.d :param module_path: @@ -35,9 +35,14 @@ def get_module_by_module_path(module_path: Union[str, ModuleType]): if module_path.endswith(".py"): module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_"))) module_spec = importlib.util.spec_from_file_location(module_name, module_path) + if module_spec is None: + raise ModuleNotFoundError(f"Cannot find module at {module_path}") module = importlib.util.module_from_spec(module_spec) sys.modules[module_name] = module - module_spec.loader.exec_module(module) + if module_spec.loader is not None: + module_spec.loader.exec_module(module) + else: + raise ModuleNotFoundError(f"Cannot load module at {module_path}") else: module = importlib.import_module(module_path) return module @@ -128,8 +133,6 @@ def filter_progress_bar(stdout: str) -> str: break filtered_stdout = re.sub(r"\s*\n\s*", "\n", filtered_stdout) - if needs_sub: - return None return filtered_stdout diff --git a/rdagent/utils/agent/tpl.py b/rdagent/utils/agent/tpl.py index e87559efe..28345c347 100644 --- a/rdagent/utils/agent/tpl.py +++ b/rdagent/utils/agent/tpl.py @@ -43,7 +43,10 @@ def __init__(self, uri: 
str): stack = inspect.stack() caller_frame = stack[1] caller_module = inspect.getmodule(caller_frame[0]) - caller_dir = Path(caller_module.__file__).parent + if caller_module and caller_module.__file__: + caller_dir = Path(caller_module.__file__).parent + else: + caller_dir = DIRNAME # Parse the URI path_part, yaml_path = uri.split(":") @@ -65,7 +68,7 @@ def __init__(self, uri: str): self.template = yaml_content - def r(self, **context: Any): + def r(self, **context: Any) -> str: """ Render the template with the given context. """ diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index da819166a..f68bd4d07 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -19,9 +19,10 @@ from pathlib import Path from typing import Generic, Optional, TypeVar -import docker -import docker.models -import docker.models.containers +import docker # type: ignore[import-untyped] +import docker.models # type: ignore[import-untyped] +import docker.models.containers # type: ignore[import-untyped] +import docker.types # type: ignore[import-untyped] from pydantic import BaseModel from rich import print from rich.console import Console @@ -50,13 +51,13 @@ def __init__(self, conf: ASpecificBaseModel): self.conf = conf @abstractmethod - def prepare(self): + def prepare(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] """ Prepare for the environment based on it's configure """ @abstractmethod - def run(self, entry: str | None, local_path: str | None = None, env: dict | None = None) -> str: + def run(self, entry: str | None, local_path: str = ".", env: dict | None = None) -> str: """ Run the folder under the environment. @@ -92,7 +93,7 @@ class LocalEnv(Env[LocalConf]): Sometimes local environment may be more convinient for testing """ - def prepare(self): + def prepare(self) -> None: if not (Path("~/.qlib/qlib_data/cn_data").expanduser().resolve().exists()): self.run( entry="python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn", @@ -132,7 +133,7 @@ class DockerConf(ExtendedBaseSettings): mount_path: str # the path in the docker image to mount the folder default_entry: str # the entry point of the image - extra_volumes: dict | None = {} + extra_volumes: dict = {} # Sometime, we need maintain some extra data for the workspace. # And the extra data may be shared and the downloading can be time consuming. # So we just want to download it once. 
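Tightening extra_volumes from `dict | None` to a plain `dict` (default `{}`) lets callers merge it without a None guard. A tiny, hypothetical illustration; the host-path-to-container-path convention shown here is an assumption, not something this hunk specifies:

    volumes: dict = {"/tmp/workspace": "/workspace"}               # hypothetical workspace mount
    extra_volumes: dict = {"/data/share/kaggle": "/kaggle/input"}  # hypothetical shared, download-once data
    volumes.update(extra_volumes)  # no `if extra_volumes is not None` check needed any more
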
@@ -229,12 +230,16 @@ class MLEBDockerConf(DockerConf): class DockerEnv(Env[DockerConf]): # TODO: Save the output into a specific file - def prepare(self): + def prepare(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] """ Download image if it doesn't exist """ client = docker.from_env() - if self.conf.build_from_dockerfile and self.conf.dockerfile_folder_path.exists(): + if ( + self.conf.build_from_dockerfile + and self.conf.dockerfile_folder_path is not None + and self.conf.dockerfile_folder_path.exists() + ): logger.info(f"Building the image from dockerfile: {self.conf.dockerfile_folder_path}") resp_stream = client.api.build( path=str(self.conf.dockerfile_folder_path), tag=self.conf.image, network_mode=self.conf.network @@ -291,7 +296,7 @@ def prepare(self): except docker.errors.APIError as e: raise RuntimeError(f"Error while pulling the image: {e}") - def _gpu_kwargs(self, client): + def _gpu_kwargs(self, client: docker.DockerClient) -> dict: # type: ignore[no-any-unimported] """get gpu kwargs based on its availability""" if not self.conf.enable_gpu: return {} @@ -307,7 +312,7 @@ def _gpu_kwargs(self, client): return {} return gpu_kwargs - def replace_time_info(self, input_string): + def replace_time_info(self, input_string: str) -> str: """To remove any time related information from the logs since it will destroy the cache mechanism""" """We currently set this function as default, but it can be changed in the future""" datetime_pattern = r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d+)?\b" @@ -317,7 +322,7 @@ def replace_time_info(self, input_string): def __run( self, entry: str | None = None, - local_path: str | None = None, + local_path: str = ".", env: dict | None = None, running_extra_volume: dict | None = None, remove_timestamp: bool = True, @@ -341,7 +346,7 @@ def __run( log_output = "" try: - container: docker.models.containers.Container = client.containers.run( + container: docker.models.containers.Container = client.containers.run( # type: ignore[no-any-unimported] image=self.conf.image, command=entry, volumes=volumns, @@ -383,7 +388,7 @@ def __run( except docker.errors.APIError as e: raise RuntimeError(f"Error while running the container: {e}") - def zip_a_folder_into_a_file(self, folder_path: str, zip_file_path: str): + def zip_a_folder_into_a_file(self, folder_path: str, zip_file_path: str) -> None: """ Zip a folder into a file, use zipfile instead of subprocess """ @@ -392,7 +397,7 @@ def zip_a_folder_into_a_file(self, folder_path: str, zip_file_path: str): for file in files: z.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), folder_path)) - def unzip_a_file_into_a_folder(self, zip_file_path: str, folder_path: str): + def unzip_a_file_into_a_folder(self, zip_file_path: str, folder_path: str) -> None: """ Unzip a file into a folder, use zipfile instead of subprocess """ @@ -402,11 +407,11 @@ def unzip_a_file_into_a_folder(self, zip_file_path: str, folder_path: str): def cached_run( self, entry: str | None = None, - local_path: str | None = None, + local_path: str = ".", env: dict | None = None, running_extra_volume: dict | None = None, remove_timestamp: bool = True, - ): + ) -> str: """ Run the folder under the environment. Will cache the output and the folder diff for next round of running. 
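For reference, the caching strategy described above amounts to a content hash over the workspace's Python sources plus the run parameters, so a re-run with identical inputs can reuse the pickled output and the zipped result folder. A simplified, self-contained sketch, with hashlib.md5 standing in for the project's md5_hash helper and running_extra_volume omitted:

    import hashlib
    import json
    from pathlib import Path

    def run_cache_key(local_path: str, entry: str | None, extra_volumes: dict) -> str:
        # Hash every *.py file (relative path + contents) together with the run parameters.
        files = [
            [str(p.relative_to(local_path)), p.read_text()]
            for p in sorted(Path(local_path).rglob("*.py"))
        ]
        payload = json.dumps(files) + json.dumps({"entry": entry}) + json.dumps({"extra_volumes": extra_volumes})
        return hashlib.md5(payload.encode("utf-8")).hexdigest()

The next hunk switches the hashed structure from a dict comprehension to a list of [relative_path, text] pairs; the information carried by the key is the same either way.
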
@@ -426,22 +431,22 @@ def cached_run( ) if Path(target_folder / f"{key}.pkl").exists() and Path(target_folder / f"{key}.zip").exists(): with open(target_folder / f"{key}.pkl", "rb") as f: - ret = pickle.load(f) - self.unzip_a_file_into_a_folder(target_folder / f"{key}.zip", local_path) + ret: str = pickle.load(f) + self.unzip_a_file_into_a_folder(str(target_folder / f"{key}.zip"), local_path) else: ret = self.__run(entry, local_path, env, running_extra_volume, remove_timestamp) with open(target_folder / f"{key}.pkl", "wb") as f: pickle.dump(ret, f) - self.zip_a_folder_into_a_file(local_path, target_folder / f"{key}.zip") + self.zip_a_folder_into_a_file(local_path, str(target_folder / f"{key}.zip")) return ret def run( self, entry: str | None = None, - local_path: str | None = None, + local_path: str = ".", env: dict | None = None, running_extra_volume: dict | None = None, - ): + ) -> str: if entry is None: entry = self.conf.default_entry entry_add_timeout = ( @@ -461,11 +466,11 @@ def dump_python_code_run_and_get_results( self, code: str, dump_file_names: list[str], - local_path: str | None = None, + local_path: str, env: dict | None = None, running_extra_volume: dict | None = None, code_dump_file_py_name: Optional[str] = None, - ): + ) -> tuple[str, list]: """ Dump the code into the local path and run the code. """ @@ -481,7 +486,7 @@ def dump_python_code_run_and_get_results( results.append(pickle.load(open(os.path.join(local_path, f"{name}"), "rb"))) os.remove(os.path.join(local_path, f"{name}")) else: - return log_output, None + return log_output, [] return log_output, results @@ -491,7 +496,7 @@ class QTDockerEnv(DockerEnv): def __init__(self, conf: DockerConf = QlibDockerConf()): super().__init__(conf) - def prepare(self): + def prepare(self, *args, **kwargs) -> None: # type: ignore[explicit-override, no-untyped-def] """ Download image & data if it doesn't exist """ @@ -511,7 +516,7 @@ class DMDockerEnv(DockerEnv): def __init__(self, conf: DockerConf = DMDockerConf()): super().__init__(conf) - def prepare(self, username: str, password: str): + def prepare(self, username: str, password: str) -> None: """ Download image & data if it doesn't exist """ @@ -530,7 +535,7 @@ def prepare(self, username: str, password: str): class KGDockerEnv(DockerEnv): """Kaggle Competition Docker""" - def __init__(self, competition: str = None, conf: DockerConf = KGDockerConf()): + def __init__(self, competition: str | None = None, conf: DockerConf = KGDockerConf()): super().__init__(conf) From e979d56fe6afb9d2e46334f72b51644c4b13b709 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 13 Jan 2025 16:05:54 +0000 Subject: [PATCH 253/304] use txt to store test script to avoid pytest --- rdagent/components/coder/data_science/feature/eval.py | 2 +- .../feature/eval_tests/{feature_test.py => feature_test.txt} | 0 rdagent/components/coder/data_science/model/eval.py | 2 +- .../model/eval_tests/{model_test.py => model_test.txt} | 0 rdagent/components/coder/data_science/raw_data_loader/eval.py | 2 +- .../eval_tests/{data_loader_test.py => data_loader_test.txt} | 0 6 files changed, 3 insertions(+), 3 deletions(-) rename rdagent/components/coder/data_science/feature/eval_tests/{feature_test.py => feature_test.txt} (100%) rename rdagent/components/coder/data_science/model/eval_tests/{model_test.py => model_test.txt} (100%) rename rdagent/components/coder/data_science/raw_data_loader/eval_tests/{data_loader_test.py => data_loader_test.txt} (100%) diff --git a/rdagent/components/coder/data_science/feature/eval.py 
b/rdagent/components/coder/data_science/feature/eval.py index 7f0111420..5369ad26e 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -51,7 +51,7 @@ def evaluate( # TODO: do we need to clean the generated temporary content? fname = "feature_test.py" - test_code = (DIRNAME / "eval_tests" / "feature_test.py").read_text() + test_code = (DIRNAME / "eval_tests" / "feature_test.txt").read_text() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.py b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt similarity index 100% rename from rdagent/components/coder/data_science/feature/eval_tests/feature_test.py rename to rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 857546cf1..d365d41b8 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -64,7 +64,7 @@ def evaluate( fname = "model_test.py" test_code = ( - (DIRNAME / "eval_tests" / fname).read_text().replace("model01", target_task.name) + (DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name) ) # only check the model changed this time implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.py b/rdagent/components/coder/data_science/model/eval_tests/model_test.txt similarity index 100% rename from rdagent/components/coder/data_science/model/eval_tests/model_test.py rename to rdagent/components/coder/data_science/model/eval_tests/model_test.txt diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index ef4125055..fc79fd80b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -55,7 +55,7 @@ def evaluate( # TODO: do we need to clean the generated temporary content? 
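The `.py` to `.txt` renames in this commit keep pytest from collecting the evaluation scripts as tests; the evaluators still execute them as Python by reading the text and injecting it into the workspace under a `.py` name, as the hunks above show. Below is a self-contained illustration of that pattern, where the paths and the injection step are stand-ins for `DIRNAME / "eval_tests"` and `implementation.inject_files`, not the actual evaluator code.

```python
import subprocess
import sys
import tempfile
from pathlib import Path

template_dir = Path(tempfile.mkdtemp())      # stands in for DIRNAME / "eval_tests"
(template_dir / "feature_test.txt").write_text('print("feature test ran")\n')

workspace = Path(tempfile.mkdtemp())         # stands in for the experiment workspace
fname = "feature_test.py"
test_code = (template_dir / "feature_test.txt").read_text()
(workspace / fname).write_text(test_code)    # analogous to implementation.inject_files(**{fname: test_code})

stdout = subprocess.run([sys.executable, fname], cwd=workspace, capture_output=True, text=True).stdout
print(stdout)                                # the evaluator feeds this stdout into the LLM-based feedback
```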
fname = "data_loader_test.py" - test_code = (DIRNAME / "eval_tests" / "data_loader_test.py").read_text() + test_code = (DIRNAME / "eval_tests" / "data_loader_test.txt").read_text() implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt similarity index 100% rename from rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.py rename to rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt From 016189b0e500c3cc56ba6f5aa913aa3581836b2f Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 13 Jan 2025 16:47:45 +0000 Subject: [PATCH 254/304] remove zipfile in requirements --- requirements.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 413497584..fb9b2e6b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -46,7 +46,4 @@ nbformat # tool seaborn -setuptools-scm - -#git related -zipfile \ No newline at end of file +setuptools-scm \ No newline at end of file From b6e6a0a9e827af04dc8d5795641262333364497d Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 13 Jan 2025 16:51:47 +0000 Subject: [PATCH 255/304] add azure.identity to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index fb9b2e6b9..3320c1d07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ loguru fire fuzzywuzzy openai +azure.identity numpy # we use numpy as default data format. So we have to install numpy pandas # we use pandas as default data format. 
So we have to install pandas From 031cb1d1ab954a3913cf4ddd79305299ea68709b Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 14 Jan 2025 03:00:45 +0000 Subject: [PATCH 256/304] ignore debug web page --- test/utils/test_import.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/utils/test_import.py b/test/utils/test_import.py index 997e86234..82280a44c 100644 --- a/test/utils/test_import.py +++ b/test/utils/test_import.py @@ -23,6 +23,8 @@ def import_all_modules_from_directory(directory): continue if "model_coder" in fstr: continue + if "llm_st" in fstr: + continue if ( fstr.endswith("rdagent/log/ui/app.py") or fstr.endswith("rdagent/app/cli.py") From edfe1798881ff590b820629b676a1e2b2a298e3d Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 14 Jan 2025 06:28:54 +0000 Subject: [PATCH 257/304] component test changes --- .../ensemble/eval_tests/ensemble_test.txt | 15 +++++++++++++-- .../feature/eval_tests/feature_test.txt | 4 +--- .../data_science/model/eval_tests/model_test.txt | 4 ++-- .../eval_tests/data_loader_test.txt | 9 --------- .../coder/data_science/workflow/eval.py | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt index 760d40576..cfd4842f0 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt @@ -31,8 +31,19 @@ val_preds_dict["{{mn}}"], test_preds_dict["{{mn}}"], _ = {{mn}}_workflow( {% endfor %} for key in val_preds_dict.keys(): - print(f"Model {key} validation predictions (val_preds_dict[key]) shape: {val_preds_dict[key].shape if val_preds_dict[key] is not None else 'None'}") - print(f"Model {key} test predictions (test_preds_dict[key]) shape: {test_preds_dict[key].shape if test_preds_dict[key] is not None else 'None'}") + if val_preds_dict[key] is None: + print(f"Model {key} validation predictions (val_preds_dict[key]) is None.") + elif isinstance(val_preds_dict[key], list): + print(f"Model {key} validation predictions (val_preds_dict[key]) (list type) length: {len(val_preds_dict[key])}") + else: + print(f"Model {key} validation predictions (val_preds_dict[key]) shape: {val_preds_dict[key].shape}") + + if test_preds_dict[key] is None: + print(f"Model {key} test predictions (test_preds_dict[key]) is None.") + elif isinstance(test_preds_dict[key], list): + print(f"Model {key} test predictions (test_preds_dict[key]) (list type) length: {len(test_preds_dict[key])}") + else: + print(f"Model {key} test predictions (test_preds_dict[key]) shape: {test_preds_dict[key].shape}") # Run ensemble final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt index 974ccdef6..9a447b39d 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt @@ -16,17 +16,15 @@ from load_data import load_data X, y, X_test, test_ids = load_data() print(f"X.shape: {X.shape}") -print(f"y.shape: {y.shape}") +print(f"y.shape: {y.shape}" if not isinstance(y, list) else f"y(list)'s length: {len(y)}") print(f"X_test.shape: {X_test.shape}") print(f"test_ids length: {len(test_ids)}") X, y, X_test = feat_eng(X, y, X_test) -# Validate the conditions 
mentioned in the docstring def get_length(data): return len(data) if isinstance(data, list) else data.shape[0] - assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.txt b/rdagent/components/coder/data_science/model/eval_tests/model_test.txt index 3aed21580..9ddcd1834 100644 --- a/rdagent/components/coder/data_science/model/eval_tests/model_test.txt +++ b/rdagent/components/coder/data_science/model/eval_tests/model_test.txt @@ -21,9 +21,9 @@ X, y, test_X, test_ids = load_data() X, y, test_X = feat_eng(X, y, test_X) train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.8, random_state=42) print(f"train_X.shape: {train_X.shape}") -print(f"train_y.shape: {train_y.shape}") +print(f"train_y.shape: {train_y.shape}" if not isinstance(train_y, list) else f"train_y(list)'s length: {len(train_y)}") print(f"val_X.shape: {val_X.shape}") -print(f"val_y.shape: {val_y.shape}") +print(f"val_y.shape: {val_y.shape}" if not isinstance(val_y, list) else f"val_y(list)'s length: {len(val_y)}") # First execution print("The first execution begins.\n") diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt index 6c14fe10d..670b0cd9b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt @@ -8,23 +8,14 @@ Please make sure the stdout is rich enough to support informative feedback """ import pickle - from load_data import load_data -# Setup logging - X, y, X_test, test_ids = load_data() - -# Validate the conditions mentioned in the docstring def get_length(data): return len(data) if isinstance(data, list) else data.shape[0] - assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" print("Data loader test passed successfully. Length of test images matches length of test IDs.") - -with open("data.pkl", "wb") as f: - pickle.dump((X, y, X_test, test_ids), f) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 953497909..9b717a2c9 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -76,7 +76,7 @@ def evaluate( for model in model_set_in_folder: if model not in model_set_in_scores: stdout += ( - f"\nModel {model} is not evaluated in the scores.csv. The score.csv has {model_set_in_scores}." + f"\nModel {model} is not evaluated in the scores.csv. The scores.csv has {model_set_in_scores}." 
) # Check submission file From 01874ca6763a882ed9c2163ca25833060fcc7ebc Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Tue, 14 Jan 2025 07:42:17 +0000 Subject: [PATCH 258/304] remove redundent task_desc in model coder --- rdagent/components/coder/data_science/model/__init__.py | 1 - rdagent/components/coder/data_science/model/prompts.yaml | 3 --- 2 files changed, 4 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 1b4ee285f..537d3fab5 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -70,7 +70,6 @@ def implement_one_task( queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], out_spec=BatchEditOut.get_spec(), - task_info=model_information_str, ) # user_prompt = T(".prompts:model_coder.user").r( # model_spec=workspace.file_dict["spec/model.md"], diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 919a6cd79..52215eafc 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -29,9 +29,6 @@ model_coder: Error Learning: If previous failed attempts and their feedback are available, learn from them. Understand what went wrong and avoid repeating similar mistakes in your new implementation. The failure knowledge may include the code unrelated to the model, such as data loading, preprocessing, or feature engineering. Focus only on the model implementation part. - - ---------Model Task Description--------- - {{ task_info }} {% if out_spec %} {{out_spec}} From 6f634a5777e20138a51a6efb012b03fcfed771b5 Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 13 Jan 2025 10:54:24 +0000 Subject: [PATCH 259/304] feat: Add APE module and prompts for automated prompt engineering --- rdagent/app/utils/ape.py | 52 ++++++++++++++ rdagent/app/utils/prompts.yaml | 119 +++++++++++++++++++++++++++++++++ rdagent/log/logger.py | 1 + 3 files changed, 172 insertions(+) create mode 100644 rdagent/app/utils/ape.py create mode 100644 rdagent/app/utils/prompts.yaml diff --git a/rdagent/app/utils/ape.py b/rdagent/app/utils/ape.py new file mode 100644 index 000000000..4a70e1e01 --- /dev/null +++ b/rdagent/app/utils/ape.py @@ -0,0 +1,52 @@ +""" +This is the preliminary version of the APE (Automated Prompt Engineering) +""" + +import pickle +from pathlib import Path + +from rdagent.core.conf import RD_AGENT_SETTINGS + +def get_llm_qa(file_path): + data_flt = [] + with open(file_path, "rb") as f: + data = pickle.load(f) + print(len(data)) + for item in data: + if "debug_llm" in item["tag"]: + data_flt.append(item) + return data_flt + +# Example usage +# use +file_path = Path(RD_AGENT_SETTINGS.log_trace_path) / "debug_llm.pkl" +llm_qa = get_llm_qa(file_path) +print(len(llm_qa)) + +print(llm_qa[0]) + +from rdagent.utils.agent.tpl import T + + + +# Initialize APE backend +from rdagent.oai.llm_utils import APIBackend +api = APIBackend() + +# Analyze test data and generate improved prompts +for qa in llm_qa: + # Generate system prompt for APE + system_prompt = T(".prompts:ape.system").r() + + # Generate user prompt with context from LLM QA + user_prompt = T(".prompts:ape.user").r( + system=qa["obj"].get("system", ""), + user=qa["obj"]["user"], + answer=qa["obj"]["resp"] + ) + analysis_result = 
api.build_messages_and_create_chat_completion( + system_prompt=system_prompt, + user_prompt=user_prompt + ) + print(f"Analysis Result:\n{analysis_result}\n") + yes = input("Do you want to continue? (y/n)") diff --git a/rdagent/app/utils/prompts.yaml b/rdagent/app/utils/prompts.yaml new file mode 100644 index 000000000..3d24df175 --- /dev/null +++ b/rdagent/app/utils/prompts.yaml @@ -0,0 +1,119 @@ +ape: + system: |- + We'll provide you with a pair of Chat QA about data science. + We are creating solutions for a Kaggle Competition based on the answers. + Good questions are crucial for getting good answers. + Please suggest how to improve the question. + You can analyze based on these aspects: + - Is the question complete (is all the information needed to answer the question provided?) + + The conversation will be provided in the following format: + + + + ...text to describe the question... + + + ...text to describe the question... + + + + + ...text to describe the answer. + + + Your response should be very concrete and concise (fewer than 20 words) and focus on the mentioned aspects, like + ``` + Info Missing: the question asks for changing code, but it does not provide the description of the current code. + ``` + Please be very conservative when you propose improvements. Only propose improvements when it becomes impossible to give the answer. + + Don't propose concrete modifications + + user: |- + + + {{system}} + + + {{user}} + + + + + {{answer}} + + + optional: |- + If you want to suggest a modification to the question, please follow the *SEARCH/REPLACE block* Rules!!!! It is optional. + Please make it concise and less than 20 lines!!! + + # *SEARCH/REPLACE block* Rules: + + Every *SEARCH/REPLACE block* must use this format: + 1. The *FULL* file path alone on a line, verbatim. No bold asterisks, no quotes around it, no escaping of characters, etc. + 2. The opening fence and code language, eg: ```python + 3. The start of search block: <<<<<<< SEARCH + 4. A contiguous chunk of lines to search for in the existing source code + 5. The dividing line: ======= + 6. The lines to replace into the source code + 7. The end of the replace block: >>>>>>> REPLACE + 8. The closing fence: ``` + + Use the *FULL* file path, as shown to you by the user. + + Every *SEARCH* section must *EXACTLY MATCH* the existing file content, character for character, including all comments, docstrings, etc. + If the file contains code or other data wrapped/escaped in json/xml/quotes or other containers, you need to propose edits to the literal contents of the file, including the container markup. + + *SEARCH/REPLACE* blocks will *only* replace the first match occurrence. + Include multiple unique *SEARCH/REPLACE* blocks if needed. + Include enough lines in each SEARCH section to uniquely match each set of lines that need to change. + + Keep *SEARCH/REPLACE* blocks concise. + Break large *SEARCH/REPLACE* blocks into a series of smaller blocks that each change a small portion of the file. + Include just the changing lines, and a few surrounding lines if needed for uniqueness. + Do not include long runs of unchanging lines in *SEARCH/REPLACE* blocks. + + Only create *SEARCH/REPLACE* blocks for files that the user has added to the chat! + + To move code within a file, use 2 *SEARCH/REPLACE* blocks: 1 to delete it from its current location, 1 to insert it in the new location. + + Pay attention to which filenames the user wants you to edit, especially if they are asking you to create a new file.
+ + If you want to put code in a new file, use a *SEARCH/REPLACE block* with: + - A new file path, including dir name if needed + - An empty `SEARCH` section + - The new file's contents in the `REPLACE` section + + To rename files which have been added to the chat, use shell commands at the end of your response. + + If the user just says something like "ok" or "go ahead" or "do that" they probably want you to make SEARCH/REPLACE blocks for the code changes you just proposed. + The user will say when they've applied your edits. If they haven't explicitly confirmed the edits have been applied, they probably want proper SEARCH/REPLACE blocks. + + You are diligent and tireless! + You NEVER leave comments describing code without implementing it! + You always COMPLETELY IMPLEMENT the needed code! + + + ONLY EVER RETURN CODE IN A *SEARCH/REPLACE BLOCK*! + Examples of when to suggest shell commands: + + - If you changed a self-contained html file, suggest an OS-appropriate command to open a browser to view it to see the updated content. + - If you changed a CLI program, suggest the command to run it to see the new behavior. + - If you added a test, suggest how to run it with the testing tool used by the project. + - Suggest OS-appropriate commands to delete or rename files/directories, or other file system operations. + - If your code changes add new dependencies, suggest the command to install them. + - Etc. + + Here is a example of SEARCH/REPLACE BLOCK to change a function implementation to import. + + <<<<<<< SEARCH + def hello(): + "print a greeting" + + print("hello") + ======= + from hello import hello + + >>>>>>> REPLACE +# - Is there any ambiguity in the question? diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py index 5b848571c..11905eb40 100644 --- a/rdagent/log/logger.py +++ b/rdagent/log/logger.py @@ -115,6 +115,7 @@ def log_object(self, obj: object, *, tag: str = "") -> None: caller_info = get_caller_info() tag = f"{self._tag}.{tag}.{self.get_pids()}".strip(".") + # FIXME: it looks like a hacking... We should redesign it... 
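For orientation, the FIXME above concerns the tag-sniffing path that feeds `debug_llm.pkl`, which `ape.py` later filters by tag. Below is a small self-contained sketch of that write/filter round trip; the record fields mirror the patch, while the file handling details are assumptions rather than the actual logger code.

```python
import pickle
import tempfile
from pathlib import Path

debug_log_path = Path(tempfile.mkdtemp()) / "debug_llm.pkl"   # stands in for log_trace_path / "debug_llm.pkl"


def log_object(obj: object, tag: str) -> None:
    data = []
    if debug_log_path.exists():
        with debug_log_path.open("rb") as f:
            data = pickle.load(f)
    data.append({"tag": tag, "obj": obj})
    with debug_log_path.open("wb") as f:
        pickle.dump(data, f)


log_object({"user": "u", "resp": "r"}, tag="Loop_0.direct_exp_gen.debug_llm.123")
log_object({"uri": "p:x", "rendered": "..."}, tag="Loop_0.coding.debug_tpl.123")

with debug_log_path.open("rb") as f:
    records = pickle.load(f)
print([d for d in records if "debug_llm" in d["tag"]])        # what ape.get_llm_qa keeps
```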
if "debug_" in tag: debug_log_path = self.log_trace_path / "debug_llm.pkl" debug_data = {"tag": tag, "obj": obj} From 5cff5fca92387769fb0eeca9d1deec45ddf6f4aa Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 14 Jan 2025 07:29:54 +0000 Subject: [PATCH 260/304] fix: Update .gitignore and improve text formatting in eval.py --- .gitignore | 2 +- .../coder/data_science/feature/eval.py | 3 +- .../data_science/raw_data_loader/prompts.yaml | 14 ++++++---- .../data_science/proposal/exp_gen.py | 2 +- rdagent/utils/fmt.py | 28 +++++++++++++++++++ 5 files changed, 41 insertions(+), 8 deletions(-) create mode 100644 rdagent/utils/fmt.py diff --git a/.gitignore b/.gitignore index 95b5ceb13..fa8a46224 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ Pipfile public release-notes.md -typescript +typescript* # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py index 5369ad26e..c4fcdfd26 100644 --- a/rdagent/components/coder/data_science/feature/eval.py +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -11,6 +11,7 @@ from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.utils.fmt import shrink_text DIRNAME = Path(__file__).absolute().resolve().parent @@ -61,7 +62,7 @@ def evaluate( test_code=test_code, code=implementation.file_dict["feature.py"], ) - user_prompt = T(".prompts:feature_eval.user").r(stdout=stdout) + user_prompt = T(".prompts:feature_eval.user").r(stdout=shrink_text(stdout)) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) return FeatureEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index c3b3b78b7..fb0814ce5 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -241,22 +241,26 @@ spec: - Model workflow for training and testing (`model_*.py`). - Ensemble and decision-making (`ensemble.py`). - Treat each component as a modular and callable Python function. + 2. Feature Engineering + - The feature engineering should be called only once. For example: + `X_transformed, y_transformed, X_test_transformed = feat_eng(X, y, test_X)` + - It should be called before dataset splitting. - 2. Dataset Splitting - - The dataset returned by `load_data` is not split into training and testing sets. + 3. Dataset Splitting + - The dataset returned by `load_data` is not split into training and testing sets, so the dataset splitting should happen after calling `feat_eng`. - By default, split the dataset into 80% for training and 20% for testing. - You can also use cross-validation or other splitting methods as you deem more useful and appropriate based on the Competition Information. - 3. Submission File: + 4. Submission File: - Save the final predictions as `submission.csv` in the format required by the competition. - Present the required submission format explicitly and ensure the output adheres to it. - 4. Code Standards: + 5. Code Standards: - Use consistent naming conventions and type annotations. - Document the workflow with clear comments and docstring. - Do not use progress bars (e.g., tqdm) in the code. - 5. Ensemble Strategy: + 6. 
Ensemble Strategy: Put all the model's return into a dict, using the model file name as key, and the return as value. Sample code: {% raw %} diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 87d50cc68..54677b2ec 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -167,7 +167,7 @@ def _handle_missing_component( ): # Assumption: when completing missing component, using component name as task name resp_dict[ "description" - ] += f"You have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" + ] += f"\n\nYou have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" task = task_cls( name=component if component != "Model" else resp_dict.pop("model_name"), diff --git a/rdagent/utils/fmt.py b/rdagent/utils/fmt.py new file mode 100644 index 000000000..639c61bf2 --- /dev/null +++ b/rdagent/utils/fmt.py @@ -0,0 +1,28 @@ +""" +Tools that support generating better formats. +""" + + +def shrink_text(text: str, context_lines: int = 200) -> str: + """ + When the context is too long, hide the part that is not important. + + text before + ... (XXXXX lines are hidden) ... + text after + """ + lines = text.splitlines() + total_lines = len(lines) + + if total_lines <= context_lines: + return text + + # Calculate how many lines to show from start and end + half_lines = context_lines // 2 + start = '\n'.join(lines[:half_lines]) + end = '\n'.join(lines[-half_lines:]) + + # Count the number of lines we're hiding + hidden_lines = total_lines - half_lines * 2 + + return f"{start}\n... ({hidden_lines} lines are hidden) ...\n{end}" From d68bc143a8ee66a06820445e0cb372f0ec572227 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 14 Jan 2025 10:02:13 +0000 Subject: [PATCH 261/304] refactor: Update print output and improve code comments and imports --- rdagent/app/utils/ape.py | 2 +- .../coder/data_science/raw_data_loader/prompts.yaml | 4 ++-- rdagent/scenarios/data_science/dev/runner.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/rdagent/app/utils/ape.py b/rdagent/app/utils/ape.py index 4a70e1e01..d5b19ae03 100644 --- a/rdagent/app/utils/ape.py +++ b/rdagent/app/utils/ape.py @@ -48,5 +48,5 @@ def get_llm_qa(file_path): system_prompt=system_prompt, user_prompt=user_prompt ) - print(f"Analysis Result:\n{analysis_result}\n") + print(f"█" * 60) yes = input("Do you want to continue? (y/n)") diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index fb0814ce5..628ac4bdf 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -243,7 +243,7 @@ spec: - Treat each component as a modular and callable Python function. 2. Feature Engineering - The feature engineering should be called only once. For example: - `X_transformed, y_transformed, X_test_transformed = feat_eng(X, y, test_X)` + `X_transformed, y_transformed, X_test_transformed = feat_eng(X, y, X_test)` - It should be called before dataset splitting. 3. 
Dataset Splitting @@ -352,7 +352,7 @@ data_loader_eval: The main code generation task is as follows: {{task_desc}} - The data loader code is: + The data loader code is in a file named "load_data.py": ```python {{code}} ``` diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py index d403b212c..9016b21d5 100644 --- a/rdagent/scenarios/data_science/dev/runner.py +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -5,6 +5,7 @@ from rdagent.core.exception import RunnerError from rdagent.log import rdagent_logger as logger from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.utils import filter_progress_bar from rdagent.utils.env import DockerEnv, DSDockerConf @@ -17,7 +18,7 @@ def develop(self, exp: DSExperiment) -> DSExperiment: de = DockerEnv(conf=ds_docker_conf) # execute workflow - stdout = exp.experiment_workspace.execute(env=de, entry="python main.py") + stdout = filter_progress_bar(exp.experiment_workspace.execute(env=de, entry="python main.py")) score_fp = exp.experiment_workspace.workspace_path / "scores.csv" if not score_fp.exists(): From 0171b4127792ed4f8f2d95dbb01cc6c5a41429b5 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 14 Jan 2025 10:04:17 +0000 Subject: [PATCH 262/304] style: Fix string formatting and import order in ape.py and fmt.py --- rdagent/app/utils/ape.py | 17 +++++++---------- rdagent/utils/fmt.py | 4 ++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/rdagent/app/utils/ape.py b/rdagent/app/utils/ape.py index d5b19ae03..474e9a2d9 100644 --- a/rdagent/app/utils/ape.py +++ b/rdagent/app/utils/ape.py @@ -7,6 +7,7 @@ from rdagent.core.conf import RD_AGENT_SETTINGS + def get_llm_qa(file_path): data_flt = [] with open(file_path, "rb") as f: @@ -17,20 +18,19 @@ def get_llm_qa(file_path): data_flt.append(item) return data_flt + # Example usage -# use +# use file_path = Path(RD_AGENT_SETTINGS.log_trace_path) / "debug_llm.pkl" llm_qa = get_llm_qa(file_path) print(len(llm_qa)) print(llm_qa[0]) -from rdagent.utils.agent.tpl import T - - - # Initialize APE backend from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + api = APIBackend() # Analyze test data and generate improved prompts @@ -40,13 +40,10 @@ def get_llm_qa(file_path): # Generate user prompt with context from LLM QA user_prompt = T(".prompts:ape.user").r( - system=qa["obj"].get("system", ""), - user=qa["obj"]["user"], - answer=qa["obj"]["resp"] + system=qa["obj"].get("system", ""), user=qa["obj"]["user"], answer=qa["obj"]["resp"] ) analysis_result = api.build_messages_and_create_chat_completion( - system_prompt=system_prompt, - user_prompt=user_prompt + system_prompt=system_prompt, user_prompt=user_prompt ) print(f"█" * 60) yes = input("Do you want to continue? 
(y/n)") diff --git a/rdagent/utils/fmt.py b/rdagent/utils/fmt.py index 639c61bf2..b49c9e71c 100644 --- a/rdagent/utils/fmt.py +++ b/rdagent/utils/fmt.py @@ -19,8 +19,8 @@ def shrink_text(text: str, context_lines: int = 200) -> str: # Calculate how many lines to show from start and end half_lines = context_lines // 2 - start = '\n'.join(lines[:half_lines]) - end = '\n'.join(lines[-half_lines:]) + start = "\n".join(lines[:half_lines]) + end = "\n".join(lines[-half_lines:]) # Count the number of lines we're hiding hidden_lines = total_lines - half_lines * 2 From e5e218f073f1ff609368b57ea3adfee19aa1b397 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 14 Jan 2025 10:04:17 +0000 Subject: [PATCH 263/304] exclude ape --- test/utils/test_import.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/utils/test_import.py b/test/utils/test_import.py index 82280a44c..5a07ebe6b 100644 --- a/test/utils/test_import.py +++ b/test/utils/test_import.py @@ -29,6 +29,7 @@ def import_all_modules_from_directory(directory): fstr.endswith("rdagent/log/ui/app.py") or fstr.endswith("rdagent/app/cli.py") or fstr.endswith("rdagent/app/CI/run.py") + or fstr.endswith("rdagent/app/utils/ape.py") ): # the entrance points continue From 082efafbaa6bd0d260c55cf356fb9487a47f55ad Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 14 Jan 2025 12:54:12 +0000 Subject: [PATCH 264/304] add a data folder notice --- .../coder/data_science/raw_data_loader/prompts.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 628ac4bdf..e742fe61e 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -294,7 +294,9 @@ data_loader_coder: Your task is described below: {{ task_desc }} You should follow the provided specifications to complete this task. - But you need to write the corresponding data loading code based on the information provided in the user's Data Folder Description, rather than relying on any suggestions that might exist in the spec. + You need to write the corresponding data loading code based on the information provided in the user's Data Folder Description, rather than relying on any suggestions that might exist in the spec. + + Notice, the data files are stored in the data folder located at `/kaggle/input/`, and the data folder is structured as described in the Data Folder Description. Please don't load the data from the current directory. Please response the code in the following json format. 
Here is an example structure for the JSON output: { From c8cf3836618fcbda8ba98b72caa0ee1acc87846c Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Tue, 14 Jan 2025 11:59:05 +0000 Subject: [PATCH 265/304] reduce unnecessary output to stdout --- rdagent/utils/env.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index f68bd4d07..d02a33319 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -330,6 +330,7 @@ def __run( if env is None: env = {} env["PYTHONWARNINGS"] = "ignore" + env["TF_CPP_MIN_LOG_LEVEL"] = "2" client = docker.from_env() volumns = {} From 7474c978de4f774a97c806c76e0a1ac53d777576 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Tue, 14 Jan 2025 12:34:29 +0000 Subject: [PATCH 266/304] refine the code of describe_data_folder --- .../scenarios/data_science/scen/__init__.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index e88ba1a64..9d5a6765e 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -46,12 +46,13 @@ def get_dir_snapshot(folder_path): return frozenset(exts) -def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subfolders=3): +def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subfolders=2, is_top_level=True): """ folder_path : Current directory path indent : Current indentation max_files : Maximum number of files of the same type to display partial_expand_subfolders: When all subfolders have the same internal file types, only expand this many subfolders, the rest are omitted + is_top_level : Indicates if the current folder is the top-level folder """ result = [] files_count = {} @@ -69,7 +70,12 @@ def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subf files_count[file_type] = 0 files_details[file_type] = [] files_count[file_type] += 1 - if len(files_details[file_type]) < max_files: + + # At top level, collect all CSV and Markdown files without restrictions + # In deeper levels, follow the max_files restriction + if is_top_level and file_type in ["csv", "md"]: + files_details[file_type].append((file, file_size, file_path)) + elif not is_top_level and len(files_details[file_type]) < max_files: files_details[file_type].append((file, file_size, file_path)) break @@ -95,6 +101,7 @@ def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subf indent=indent + 2, max_files=max_files, partial_expand_subfolders=partial_expand_subfolders, + is_top_level=False, ) ) else: @@ -111,6 +118,7 @@ def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subf indent=indent + 2, max_files=max_files, partial_expand_subfolders=partial_expand_subfolders, + is_top_level=False, ) ) @@ -124,22 +132,24 @@ def describe_data_folder(folder_path, indent=0, max_files=1, partial_expand_subf files_details[file_type] = [] files_count[file_type] += 1 - if len(files_details[file_type]) < max_files: + # At top level, collect all CSV and Markdown files without restrictions + # In deeper levels, follow the max_files restriction + if is_top_level and file_type in ["csv", "md"]: + files_details[file_type].append((file, file_size, file_path)) + elif not is_top_level and len(files_details[file_type]) < max_files: files_details[file_type].append((file, file_size, file_path)) break # Print the folder and its 
contents for file_type, count in files_count.items(): - if count > max_files: + if count > max_files and file_type not in ["csv", "md"]: result.append(" " * indent + f"{count} {file_type}s:") for file, size, path in files_details[file_type]: result.append(" " * (indent + 2) + f"- {file} ({size} bytes)") result.append(" " * (indent + 2) + "... (file limit reached)") else: for file, size, path in files_details[file_type]: - if file_type == "zip": - continue result.append(" " * indent + f"- {file} ({size} bytes)") if file_type == "csv": result.append(" " * (indent + 2) + f"- Head of {file}:") @@ -273,3 +283,7 @@ def rich_style_description(self) -> str: name="Kaggle", competition=f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})", ) + + +if __name__ == "__main__": + print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "aerial-cactus-identification")) \ No newline at end of file From cec7a91e06297623980f4cad53803ea7cc807e36 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Tue, 14 Jan 2025 12:35:18 +0000 Subject: [PATCH 267/304] fix ci --- rdagent/scenarios/data_science/scen/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index 9d5a6765e..e343452f4 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -286,4 +286,4 @@ def rich_style_description(self) -> str: if __name__ == "__main__": - print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "aerial-cactus-identification")) \ No newline at end of file + print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "aerial-cactus-identification")) From 5a5d0cb9611899f07bfc30f4224e391f7d62e876 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 15 Jan 2025 14:56:56 +0800 Subject: [PATCH 268/304] style: streamlit style update (#522) * streamlit style update * fix import * fix format --- rdagent/log/ui/llm_st.py | 313 ++++++++++++++++++++++++--------------- 1 file changed, 193 insertions(+), 120 deletions(-) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index cd5890255..8f3e3d89a 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -2,6 +2,7 @@ import json import pickle import re +import time from pathlib import Path import streamlit as st @@ -9,17 +10,30 @@ st.set_page_config(layout="wide", page_title="debug_llm", page_icon="🎓", initial_sidebar_state="expanded") -# 获取log_path参数 +# 获取 log_path 参数 parser = argparse.ArgumentParser(description="RD-Agent Streamlit App") parser.add_argument("--log_dir", type=str, help="Path to the log directory") args = parser.parse_args() -if args.log_dir: - main_log_path = Path(args.log_dir) - if not main_log_path.exists(): - st.error(f"Log dir `{main_log_path}` does not exist!") - st.stop() -else: - main_log_path = Path("./log") + + +@st.cache_data +def get_folders_sorted(log_path): + """缓存并返回排序后的文件夹列表,并加入进度打印""" + with st.spinner("正在加载文件夹列表..."): + folders = sorted( + (folder for folder in log_path.iterdir() if folder.is_dir() and list(folder.iterdir())), + key=lambda folder: folder.stat().st_mtime, + reverse=True, + ) + st.write(f"找到 {len(folders)} 个文件夹") + return [folder.name for folder in folders] + + +# 设置主日志路径 +main_log_path = Path(args.log_dir) if args.log_dir else Path("./log") +if not main_log_path.exists(): + st.error(f"Log dir {main_log_path} does not exist!") + st.stop() if "data" not in session_state: 
session_state.data = [] @@ -30,31 +44,29 @@ def load_data(): + """加载数据到 session_state 并显示进度""" + log_file = main_log_path / session_state.log_path / "debug_llm.pkl" try: - with open(f"{main_log_path}/{session_state.log_path}/debug_llm.pkl", "rb") as f: - session_state.data = pickle.load(f) + with st.spinner(f"正在加载数据文件 {log_file}..."): + start_time = time.time() + with open(log_file, "rb") as f: + session_state.data = pickle.load(f) + st.success(f"数据加载完成!耗时 {time.time() - start_time:.2f} 秒") + st.session_state["current_loop"] = 1 except Exception as e: session_state.data = [{"error": str(e)}] + st.error(f"加载数据失败: {e}") -# Add a button to refresh the data +# UI - Sidebar with st.sidebar: - lc1, lc2 = st.columns([1, 2], vertical_alignment="center") - with lc1: - st.markdown(":blue[**Log Path**]") - with lc2: - manually = st.toggle("Manual Input") + st.markdown(":blue[**Log Path**]") + manually = st.toggle("Manual Input") if manually: st.text_input("log path", key="log_path", label_visibility="collapsed") else: - folders = sorted( - [folder for folder in main_log_path.iterdir() if folder.is_dir()], - key=lambda folder: folder.stat().st_mtime, - reverse=True, - ) - folders = [folder.name for folder in folders] - - st.selectbox(f"**Select from `{main_log_path.absolute()}`**", folders, key="log_path") + folders = get_folders_sorted(main_log_path) + st.selectbox(f"**Select from {main_log_path.absolute()}**", folders, key="log_path") if st.button("Refresh Data"): load_data() @@ -63,120 +75,181 @@ def load_data(): expand_all = st.toggle("Expand All", key="expand_all") +# Helper functions def show_text(text, lang=None): - if lang is not None: + """显示文本代码块""" + if lang: st.code(text, language=lang, wrap_lines=True) - elif "```py" in text: - st.code(text, language="python", wrap_lines=True, line_numbers=True) + elif "\n" in text: + st.code(text, language="python", wrap_lines=True) else: st.code(text, language="html", wrap_lines=True) def highlight_prompts_uri(uri): + """高亮 URI 的格式""" parts = uri.split(":") return f"**{parts[0]}:**:green[**{parts[1]}**]" def extract_loopid_func_name(tag): + """提取 Loop ID 和函数名称""" match = re.search(r"Loop_(\d+)\.(\w+)\.", tag) - if match: - return match.group(1), match.group(2) - return None, None + return match.groups() if match else (None, None) def extract_evoid(tag): + """提取 EVO ID""" match = re.search(r"\.evo_loop_(\d+)\.", tag) - if match: - return match.group(1) - return None + return match.group(1) if match else None -# Display the data -for d in session_state.data: - tag = d["tag"] - obj = d["obj"] +# Display Data +progress_text = st.empty() +progress_bar = st.progress(0) - loop_id, func_name = extract_loopid_func_name(tag) - evo_id = extract_evoid(tag) - if loop_id: - if f"Loop_{loop_id}" not in tlist: - tlist.append(f"Loop_{loop_id}") - st.header(f"Loop_{loop_id}", anchor=f"Loop_{loop_id}", divider="blue") - if f"loop_{loop_id}.{func_name}" not in tlist: - tlist.append(f"loop_{loop_id}.{func_name}") - st.header(f"in *{func_name}*", anchor=f"loop_{loop_id}.{func_name}", divider="green") - if evo_id and f"loop_{loop_id}.evo_step_{evo_id}" not in tlist: - tlist.append(f"loop_{loop_id}.evo_step_{evo_id}") - st.subheader(f"evo_step_{evo_id}", anchor=f"loop_{loop_id}.evo_step_{evo_id}", divider="orange") - - if "debug_exp_gen" in tag: - with st.expander( - f"Exp in :violet[**{obj.experiment_workspace.workspace_path}**]", expanded=expand_all, icon="🧩" - ): - st.write(obj) - elif "debug_tpl" in tag: - uri = obj["uri"] - tpl = obj["template"] - cxt = obj["context"] - 
rd = obj["rendered"] - - with st.expander(highlight_prompts_uri(uri), expanded=expand_all, icon="⚙️"): - t1, t2, t3 = st.tabs([":green[**Rendered**]", ":blue[**Template**]", ":orange[**Context**]"]) - with t1: - show_text(rd) - with t2: - show_text(tpl, lang="django") - with t3: - st.json(cxt) - elif "debug_llm" in tag: - system = obj.get("system", None) - user = obj["user"] - resp = obj["resp"] - - with st.expander(f"**LLM**", expanded=expand_all, icon="🤖"): - t1, t2, t3 = st.tabs([":green[**Response**]", ":blue[**User**]", ":orange[**System**]"]) - with t3: - if system is None: - st.text("In session, no system prompt") - else: - show_text(system) - with t2: - show_text(user) - with t1: - try: - rdict = json.loads(resp) - if "code" in rdict: - code = rdict["code"] - st.markdown(":red[**Code in response dict:**]") - st.code(code, language="python", wrap_lines=True, line_numbers=True) - rdict.pop("code") - elif "spec" in rdict: - spec = rdict["spec"] - st.markdown(":red[**Spec in response dict:**]") - st.markdown(spec) - rdict.pop("spec") - else: - # show model codes - showed_keys = [] - for k, v in rdict.items(): - if k.startswith("model_") and k.endswith(".py"): - st.markdown(f":red[**{k}**]") - st.code(v, language="python", wrap_lines=True, line_numbers=True) - showed_keys.append(k) - for k in showed_keys: - rdict.pop(k) - st.write(":red[**Other parts (except for the code or spec) in response dict:**]") - st.json(rdict) - except: - st.json(resp) +# 每页展示一个 Loop +LOOPS_PER_PAGE = 1 -with st.sidebar: - et_toc = "" - for t in tlist: - if t.startswith("L"): - et_toc += f"- [{t}](#{t})\n" - elif "evo_step_" in t: - et_toc += f" - [{t.split('.')[1]}](#{t})\n" - else: - et_toc += f" - [{t.split('.')[1]}](#{t})\n" - st.markdown(et_toc, unsafe_allow_html=True) +# 获取所有的 Loop ID +loop_groups = {} +for i, d in enumerate(session_state.data): + tag = d["tag"] + loop_id, _ = extract_loopid_func_name(tag) + if loop_id: + if loop_id not in loop_groups: + loop_groups[loop_id] = [] + loop_groups[loop_id].append(d) + +# 按 Loop ID 排序 +sorted_loop_ids = sorted(loop_groups.keys(), key=int) # 假设 Loop ID 是数字 +total_loops = len(sorted_loop_ids) +total_pages = total_loops # 每页展示一个 Loop + +if total_pages: + # 初始化 current_loop + if "current_loop" not in st.session_state: + st.session_state["current_loop"] = 1 + + # Loop 导航按钮 + col1, col2, col3, col4, col5 = st.sidebar.columns([1.2, 1, 2, 1, 1.2]) + + with col1: + if st.button("|<"): # 首页 + st.session_state["current_loop"] = 1 + with col2: + if st.button("<") and st.session_state["current_loop"] > 1: # 上一页 + st.session_state["current_loop"] -= 1 + with col3: + # 下拉列表显示所有 Loop + st.session_state["current_loop"] = st.selectbox( + "选择 Loop", + options=list(range(1, total_loops + 1)), + index=st.session_state["current_loop"] - 1, # 默认选中当前 Loop + label_visibility="collapsed", # 隐藏标签 + ) + with col4: + if st.button("\>") and st.session_state["current_loop"] < total_loops: # 下一页 + st.session_state["current_loop"] += 1 + with col5: + if st.button("\>|"): # 最后一页 + st.session_state["current_loop"] = total_loops + + # 获取当前 Loop + current_loop = st.session_state["current_loop"] + + # 显示当前 Loop + st.write(f"正在显示Loop {current_loop}/{total_loops} 个 Loop") + + # 渲染当前 Loop 数据 + loop_id = sorted_loop_ids[current_loop - 1] + progress_text = st.empty() + progress_text.text(f"正在处理 Loop {loop_id}...") + + # 渲染 Loop Header + loop_anchor = f"Loop_{loop_id}" + if loop_anchor not in tlist: + tlist.append(loop_anchor) + st.header(loop_anchor, anchor=loop_anchor, divider="blue") + + # 渲染当前 Loop 
的所有数据 + loop_data = loop_groups[loop_id] + for d in loop_data: + tag = d["tag"] + obj = d["obj"] + _, func_name = extract_loopid_func_name(tag) + evo_id = extract_evoid(tag) + + func_anchor = f"loop_{loop_id}.{func_name}" + if func_anchor not in tlist: + tlist.append(func_anchor) + st.header(f"in *{func_name}*", anchor=func_anchor, divider="green") + + evo_anchor = f"loop_{loop_id}.evo_step_{evo_id}" + if evo_id and evo_anchor not in tlist: + tlist.append(evo_anchor) + st.subheader(f"evo_step_{evo_id}", anchor=evo_anchor, divider="orange") + + # 根据 tag 渲染内容 + if "debug_exp_gen" in tag: + with st.expander( + f"Exp in :violet[**{obj.experiment_workspace.workspace_path}**]", expanded=False, icon="🧩" + ): + st.write(obj) + elif "debug_tpl" in tag: + uri = obj["uri"] + tpl = obj["template"] + cxt = obj["context"] + rd = obj["rendered"] + with st.expander(highlight_prompts_uri(uri), expanded=False, icon="⚙️"): + t1, t2, t3 = st.tabs([":green[**Rendered**]", ":blue[**Template**]", ":orange[**Context**]"]) + with t1: + show_text(rd) + with t2: + show_text(tpl, lang="django") + with t3: + st.json(cxt) + elif "debug_llm" in tag: + system = obj.get("system", None) + user = obj["user"] + resp = obj["resp"] + with st.expander(f"**LLM**", expanded=False, icon="🤖"): + t1, t2, t3 = st.tabs([":green[**Response**]", ":blue[**User**]", ":orange[**System**]"]) + with t1: + try: + rdict = json.loads(resp) + if "code" in rdict: + code = rdict["code"] + st.markdown(":red[**Code in response dict:**]") + st.code(code, language="python", wrap_lines=True, line_numbers=True) + rdict.pop("code") + elif "spec" in rdict: + spec = rdict["spec"] + st.markdown(":red[**Spec in response dict:**]") + st.markdown(spec) + rdict.pop("spec") + else: + # show model codes + showed_keys = [] + for k, v in rdict.items(): + if k.startswith("model_") and k.endswith(".py"): + st.markdown(f":red[**{k}**]") + st.code(v, language="python", wrap_lines=True, line_numbers=True) + showed_keys.append(k) + for k in showed_keys: + rdict.pop(k) + st.write(":red[**Other parts (except for the code or spec) in response dict:**]") + st.json(rdict) + except: + st.json(resp) + with t2: + show_text(user) + with t3: + show_text(system or "No system prompt available") + + progress_text.text("当前 Loop 数据处理完成!") + + # Sidebar TOC + with st.sidebar: + toc = "\n".join([f"- [{t}](#{t})" if t.startswith("L") else f" - [{t.split('.')[1]}](#{t})" for t in tlist]) + st.markdown(toc, unsafe_allow_html=True) From 1ede9d6da17a254e573539f346f124c4772cb782 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 07:08:54 +0000 Subject: [PATCH 269/304] fix llm_st loop progress bar --- rdagent/log/ui/llm_st.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index 8f3e3d89a..e29cc3d37 100644 --- a/rdagent/log/ui/llm_st.py +++ b/rdagent/log/ui/llm_st.py @@ -165,6 +165,7 @@ def extract_evoid(tag): loop_id = sorted_loop_ids[current_loop - 1] progress_text = st.empty() progress_text.text(f"正在处理 Loop {loop_id}...") + progress_bar.progress(current_loop / total_loops) # 渲染 Loop Header loop_anchor = f"Loop_{loop_id}" From b68c724a5b95da7fd77e030a4e36014c83fc056f Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 07:21:48 +0000 Subject: [PATCH 270/304] debugapp small change --- rdagent/log/ui/llm_st.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py index e29cc3d37..9c05441c0 100644 --- a/rdagent/log/ui/llm_st.py +++ 
b/rdagent/log/ui/llm_st.py @@ -158,14 +158,11 @@ def extract_evoid(tag): # 获取当前 Loop current_loop = st.session_state["current_loop"] - # 显示当前 Loop - st.write(f"正在显示Loop {current_loop}/{total_loops} 个 Loop") - # 渲染当前 Loop 数据 loop_id = sorted_loop_ids[current_loop - 1] progress_text = st.empty() progress_text.text(f"正在处理 Loop {loop_id}...") - progress_bar.progress(current_loop / total_loops) + progress_bar.progress(current_loop / total_loops, text=f"Loop :green[**{current_loop}**] / {total_loops}") # 渲染 Loop Header loop_anchor = f"Loop_{loop_id}" From a3d3e15090667d793bb43472ff63eca9db8a3506 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 15 Jan 2025 08:11:16 +0000 Subject: [PATCH 271/304] fix model str --- rdagent/scenarios/data_science/proposal/exp_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 54677b2ec..065a1c749 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -292,7 +292,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: metric_name = score_df.columns[0] for fname in last_successful_exp.experiment_workspace.file_dict: if re.match(r"^model_.+\.py", fname): - model_str = f"{fname}:\n{metric_name} on valid: {score_df[metric_name].max()}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" + model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) model_num = len(model_infos) From eff7d22a88ceba7ffd4d890e8fd3b2d476443c74 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 15 Jan 2025 08:12:23 +0000 Subject: [PATCH 272/304] refine some prompts --- .../data_science/proposal/prompts.yaml | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 732df7add..2166d1669 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -3,24 +3,35 @@ hypothesis_gen: The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. The {{targets}} are used in the following scenario: {{scenario}} - The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you. Your task is to check whether a similar hypothesis has already been generated. - If one exists and you agree with it, feel free to use it. If you disagree, please generate an improved version. - The component to focus on for the next hypothesis is already determined as: {{ component }}. + + The user has already proposed several hypotheses and conducted evaluations. This information will be provided to you. Your task is to: + 1. Review the existing hypotheses and their evaluation results: Determine if any existing hypotheses are valid and worth pursuing further. + 2. Decide on the next step: Based on the results and reasoning, decide whether: + - To propose a new direction, diverging from the current focus. + - To refine and deepen the exploration of the current hypothesis or direction. + 3. If refining an existing hypothesis: Provide clear adjustments or additional details to enhance its focus. + 4. 
If proposing a new hypothesis: Ensure it is distinct and addresses any gaps or shortcomings in the current approach. + + The current component to focus on is: {{component}}. {% if hypothesis_specification %} - To assist you in formulating new hypotheses, the user has provided some additional information: {{hypothesis_specification}}. - Important: If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. + To assist in hypothesis formulation, the user has provided additional information: {{hypothesis_specification}}. + Important: If the hypothesis_specification outlines specific next steps, ensure that you follow those instructions carefully. {% endif %} Please generate the output using the following format and specifications: {{ hypothesis_output_format }} user: |- - {% if exp_and_feedback_desc|length == 0 %}It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet. - {% else %}It is not the first round, the user has made several hypothesis on this scenario and did several evaluation on them. - The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree): - {{ exp_and_feedback_desc }} + {% if exp_and_feedback_desc|length == 0 %} + This is the first round of hypothesis generation. The user has not yet proposed any hypotheses for this scenario. + {% else %} + This is not the first round. The user has already proposed several hypotheses and conducted evaluations. + + The previous hypotheses and their corresponding feedback are as follows (focus on the most recent hypothesis, its derived insights, and reasoning): + {{exp_and_feedback_desc}} {% endif %} - Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. - + + In addition, generate relevant reasoning and distilled knowledge keys. + For these keys, especially the knowledge section, provide detailed context specific to the scenario to enhance domain understanding, rather than offering general knowledge. hypothesis_model: system: |- @@ -42,8 +53,8 @@ hypothesis_model: - If you decide to add a new model, specify the type of model you would add and generate a new hypothesis related to the new model. {% endif %} {% if hypothesis_specification %} - To assist you in formulating new hypotheses, the user has provided some additional information: {{hypothesis_specification}}. - Important: If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. + To assist in hypothesis formulation, the user has provided additional information: {{hypothesis_specification}}. + Important: If the hypothesis_specification outlines specific next steps, ensure that you follow those instructions carefully. 
{% endif %} Please generate the output using the following format and specifications: {{ hypothesis_output_format }} From 8b631760ed6286ed38e78c93c0f915638dab7bc1 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 15 Jan 2025 08:38:57 +0000 Subject: [PATCH 273/304] fix model str --- rdagent/scenarios/data_science/proposal/exp_gen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 065a1c749..4a322b940 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -292,6 +292,8 @@ def gen(self, trace: DSTrace) -> DSExperiment: metric_name = score_df.columns[0] for fname in last_successful_exp.experiment_workspace.file_dict: if re.match(r"^model_.+\.py", fname): + if 'test' in fname: + continue model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) @@ -444,4 +446,4 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) return exp - return super().gen(trace) + return super().gen(trace) From 37de5a3643f70e66a6425a85cfe48d7b45803066 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Wed, 15 Jan 2025 09:34:43 +0000 Subject: [PATCH 274/304] fix CI --- rdagent/scenarios/data_science/proposal/exp_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 4a322b940..57f73ae90 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -292,7 +292,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: metric_name = score_df.columns[0] for fname in last_successful_exp.experiment_workspace.file_dict: if re.match(r"^model_.+\.py", fname): - if 'test' in fname: + if "test" in fname: continue model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) @@ -446,4 +446,4 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) return exp - return super().gen(trace) + return super().gen(trace) From 58ecf57c831762c4e90836c781b012e6d1b07cc9 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 15 Jan 2025 10:01:57 +0000 Subject: [PATCH 275/304] refine the logic associated with the data_folder --- .../data_science/raw_data_loader/__init__.py | 2 +- .../scenarios/data_science/scen/__init__.py | 68 +++++++++++++++---- .../scenarios/data_science/scen/prompts.yaml | 10 ++- 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 46dfc19cd..1d37be742 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -55,7 +55,7 @@ def implement_one_task( # return a workspace with "load_data.py", "spec/load_data.md" inside # assign the implemented code to the new workspace. 
competition_info = self.scen.get_scenario_all_desc() - data_folder_info = self.scen.get_data_folder_description() + data_folder_info = self.scen.processed_data_folder_description data_loader_task_info = target_task.get_task_information() queried_similar_successful_knowledge = ( diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index e343452f4..d0ea88991 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -16,13 +16,46 @@ from rdagent.utils.agent.tpl import T -def read_csv_head(file_path, indent, lines=5): +import pandas as pd + +def read_csv_head(file_path, indent=0, lines=5, max_col_width=100): + """ + Reads the first few rows of a CSV file and formats them with indentation and optional truncation. + + Parameters: + file_path (str): Path to the CSV file. + indent (int): Number of spaces to prepend to each line for indentation. + lines (int): Number of rows to read from the CSV file. + max_col_width (int): Maximum width of each column's content. + + Returns: + str: A formatted string of the first few rows of the CSV file. + """ try: + # Read the CSV file with specified rows df = pd.read_csv(file_path, nrows=lines) - df_string_lines = df.to_string(index=False).split("\n") - for i in range(len(df_string_lines)): - df_string_lines[i] = " " * (indent) + df_string_lines[i] - return "\n".join(df_string_lines) + + if df.empty: + return " " * indent + "(No data in the file)" + + # Truncate column contents to a maximum width + truncated_df = df.copy() + for col in truncated_df.columns: + truncated_df[col] = truncated_df[col].astype(str).apply( + lambda x: (x[:max_col_width] + "...") if len(x) > max_col_width else x + ) + + # Convert DataFrame to a string representation + df_string_lines = truncated_df.to_string(index=False).split("\n") + + # Add indentation to each line + indented_lines = [" " * indent + line for line in df_string_lines] + + return "\n".join(indented_lines) + except FileNotFoundError: + return f"Error: File not found at path '{file_path}'." + except pd.errors.EmptyDataError: + return f"Error: The file at '{file_path}' is empty." except Exception as e: return f"Error reading CSV: {e}" @@ -153,15 +186,18 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf result.append(" " * indent + f"- {file} ({size} bytes)") if file_type == "csv": result.append(" " * (indent + 2) + f"- Head of {file}:") - csv_head = read_csv_head(path, indent + 2) - if len(csv_head) > 100: - csv_head = " ".join(csv_head.strip().split()) - csv_head = csv_head[:100] + "\n... (truncated)" + csv_head = read_csv_head(path, indent + 4) + # if len(csv_head) > 300: + # csv_head = " ".join(csv_head.strip().split()) + # csv_head = csv_head[:300] + "\n" + " " * (indent + 4) + "... 
(truncated)" result.append(csv_head) if file_type == "md": result.append(" " * (indent + 2) + f"- Content of {file}:") + if file == "description.md": + result.append(" " * (indent + 4) + f"Please refer to the background of the scenario context.") + continue with open(path, "r", encoding="utf-8") as f: - result.append(f.read()) + result.append(" " * (indent + 4) + f.read()) if file_type == "tif": result.append(" " * (indent + 2) + f"- Metadata of {file}:") with Image.open(path) as img: @@ -178,6 +214,7 @@ class DataScienceScen(Scenario): def __init__(self, competition: str) -> None: self.competition = competition self.raw_description = self._get_description() + self.processed_data_folder_description = self._get_data_folder_description() self._analysis_competition_description() self.metric_direction = self._get_direction() @@ -198,6 +235,7 @@ def _analysis_competition_description(self): sys_prompt = T(".prompts:competition_description_template.system").r() user_prompt = T(".prompts:competition_description_template.user").r( competition_raw_description=self.raw_description, + competition_processed_data_folder_description=self.processed_data_folder_description, ) response_analysis = APIBackend().build_messages_and_create_chat_completion( @@ -210,7 +248,7 @@ def _analysis_competition_description(self): self.task_type = response_json_analysis.get("Task Type", "No type provided") self.data_type = response_json_analysis.get("Data Type", "No data type provided") self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided") - self.data_description = response_json_analysis.get("Data Description", "No data description provided") + self.dataset_description = response_json_analysis.get("Dataset Description", "No dataset description provided") self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided") self.submission_specifications = response_json_analysis.get( "Submission Specifications", "No submission requirements provided" @@ -222,7 +260,7 @@ def get_competition_full_desc(self) -> str: return f"""Task Type: {self.task_type} Data Type: {self.data_type} Brief Description: {self.brief_description} - Data Description: {self.data_description} + Dataset Description: {self.dataset_description} Target Description: {self.target_description} Submission Specifications: {self.submission_specifications} Model Output Channel: {self.model_output_channel} @@ -235,7 +273,7 @@ def background(self) -> str: task_type=self.task_type, data_type=self.data_type, brief_description=self.brief_description, - data_description=self.data_description, + dataset_description=self.dataset_description, target_description=self.target_description, ) return background_prompt @@ -255,10 +293,9 @@ def get_scenario_all_desc(self) -> str: metric_direction=self.metric_direction, ) - def get_data_folder_description(self) -> str: + def _get_data_folder_description(self) -> str: return describe_data_folder(Path(DS_RD_SETTING.local_data_path) / self.competition) - class KaggleScen(DataScienceScen): """Kaggle Scenario It is based on kaggle now. 
@@ -277,6 +314,7 @@ def _get_direction(self): leaderboard = leaderboard_scores(self.competition) return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" + @property def rich_style_description(self) -> str: return T(".prompts:rich_style_description").r( diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index 0648bd5b5..e239d3d6f 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -22,7 +22,7 @@ competition_description_template: "Task Type": "The type of competition task, e.g., 'Classification', 'Regression', 'Clustering', 'Recommendation", "Time-Series Forecasting", "Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'", "Brief Description": "A brief description of the competition", - "Data Description": "A detailed description of the dataset used in the competition, including its source, structure, and any relevant characteristics", + "Dataset Description": "The dataset utilized in the competition is described based on two sources: the Competition Description, which provides contextual details about the original files, and the Processed Data folder description, which outlines the structure of the dataset after processing. While there may be differences—for instance, original files mentioned in the Competition Description (e.g., .zip files) may have been extracted or restructured—your task is to interpret the new file structure accurately (do not contain any file or folder that is not in Processed Data folder description) and reconcile it with the contextual information from the Competition Description to provide a clear and updated explanation.", "Evaluation Description": "A description of the evaluation used in the competition.", "Submission Specifications": "The submission specification & sample submission file descriptions for the model to output." "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1." @@ -32,6 +32,12 @@ competition_description_template: Competition Description: {{ competition_raw_description }} + Processed Data folder description: + {{ competition_processed_data_folder_description }} + + [Note] There may be some discrepancies between the competition description and the processed data folder description. Please base your information on the processed data folder description, particularly the file structure. + + competition_background: |- You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. @@ -40,7 +46,7 @@ competition_background: |- The task type for this competition is {{ task_type }}. The data type used in this competition is {{ data_type }}. Briefly, the competition involves: {{ brief_description }}. - The dataset used in this competition is: {{ data_description }}. + The dataset used in this competition is: {{ dataset_description }}. Your goal in this competition is to: {{target_description }}. 
rich_style_description: |- From 1ccba6153dba7fb056e57cf5cd3728fbf440de8a Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Wed, 15 Jan 2025 10:02:33 +0000 Subject: [PATCH 276/304] fix ci --- rdagent/scenarios/data_science/scen/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index d0ea88991..0705ff913 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -16,18 +16,16 @@ from rdagent.utils.agent.tpl import T -import pandas as pd - def read_csv_head(file_path, indent=0, lines=5, max_col_width=100): """ Reads the first few rows of a CSV file and formats them with indentation and optional truncation. - + Parameters: file_path (str): Path to the CSV file. indent (int): Number of spaces to prepend to each line for indentation. lines (int): Number of rows to read from the CSV file. max_col_width (int): Maximum width of each column's content. - + Returns: str: A formatted string of the first few rows of the CSV file. """ @@ -41,8 +39,10 @@ def read_csv_head(file_path, indent=0, lines=5, max_col_width=100): # Truncate column contents to a maximum width truncated_df = df.copy() for col in truncated_df.columns: - truncated_df[col] = truncated_df[col].astype(str).apply( - lambda x: (x[:max_col_width] + "...") if len(x) > max_col_width else x + truncated_df[col] = ( + truncated_df[col] + .astype(str) + .apply(lambda x: (x[:max_col_width] + "...") if len(x) > max_col_width else x) ) # Convert DataFrame to a string representation @@ -296,6 +296,7 @@ def get_scenario_all_desc(self) -> str: def _get_data_folder_description(self) -> str: return describe_data_folder(Path(DS_RD_SETTING.local_data_path) / self.competition) + class KaggleScen(DataScienceScen): """Kaggle Scenario It is based on kaggle now. 
@@ -314,7 +315,6 @@ def _get_direction(self): leaderboard = leaderboard_scores(self.competition) return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" - @property def rich_style_description(self) -> str: return T(".prompts:rich_style_description").r( From 8968482dc38dbb0ab1abe1527c973c268a4534d6 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 10:02:56 +0000 Subject: [PATCH 277/304] small change --- rdagent/scenarios/data_science/proposal/exp_gen.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 57f73ae90..1801a4ee2 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -291,9 +291,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: ) metric_name = score_df.columns[0] for fname in last_successful_exp.experiment_workspace.file_dict: - if re.match(r"^model_.+\.py", fname): - if "test" in fname: - continue + if re.match(r"^model_(?!test)\w+\.py$", fname): model_str = f"{fname}:\n{metric_name} on valid: {score_df.loc[fname[:-3]]}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" model_infos.append(model_str) From 1547cce7c1e277421218455d46d5dab9361cf690 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Wed, 15 Jan 2025 10:15:55 +0000 Subject: [PATCH 278/304] set filter_progress_bar as default in execute --- rdagent/components/coder/data_science/ensemble/eval.py | 4 +--- rdagent/components/coder/data_science/model/eval.py | 9 ++------- .../coder/data_science/raw_data_loader/eval.py | 3 +-- rdagent/components/coder/data_science/workflow/eval.py | 4 +--- rdagent/core/experiment.py | 3 ++- rdagent/scenarios/data_science/dev/prompts.yaml | 3 ++- rdagent/scenarios/data_science/dev/runner.py | 3 +-- 7 files changed, 10 insertions(+), 19 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py index 6d1116218..f130326e1 100644 --- a/rdagent/components/coder/data_science/ensemble/eval.py +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -1,5 +1,4 @@ import json -from dataclasses import dataclass from pathlib import Path from jinja2 import Environment, StrictUndefined @@ -12,7 +11,6 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend -from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf @@ -64,7 +62,7 @@ def evaluate( ) implementation.inject_files(**{fname: test_code}) - stdout = filter_progress_bar(implementation.execute(env=de, entry=f"python {fname}")) + stdout = implementation.execute(env=de, entry=f"python {fname}") system_prompt = T(".prompts:ensemble_eval.system").r( task_desc=target_task_information, diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index d365d41b8..38c52e5d8 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -4,7 +4,6 @@ """ import json -import re from pathlib import Path from rdagent.app.data_science.conf import DS_RD_SETTING @@ -16,7 +15,6 @@ from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import 
APIBackend -from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf @@ -69,10 +67,7 @@ def evaluate( implementation.inject_files(**{fname: test_code}) stdout = implementation.execute(env=de, entry=f"python {fname}") - # Filter out progress bars from stdout using regex - filtered_stdout = filter_progress_bar(stdout) - - if filtered_stdout is None: + if stdout is None: raise CoderError( "The execution output contains too many progress bars and results in the LLM's token size exceeding the limit." ) @@ -84,7 +79,7 @@ def evaluate( spec=implementation.file_dict["spec/model.md"], ) user_prompt = T(".prompts:model_eval.user").r( - stdout=filtered_stdout, + stdout=stdout, code=implementation.file_dict[f"{target_task.name}.py"], ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py index fc79fd80b..ffbe8b39e 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval.py +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -11,8 +11,7 @@ from rdagent.components.coder.CoSTEER.knowledge_management import ( CoSTEERQueriedKnowledgeV2, ) -from rdagent.core.evaluation import Feedback -from rdagent.core.experiment import FBWorkspace, Task, Workspace +from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 9b717a2c9..f02fbd4a9 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -12,10 +12,8 @@ CoSTEERSingleFeedbackDeprecated, ) from rdagent.core.evolving_framework import QueriedKnowledge -from rdagent.core.exception import CoderError from rdagent.core.experiment import FBWorkspace, Task from rdagent.oai.llm_utils import APIBackend -from rdagent.utils import filter_progress_bar from rdagent.utils.agent.tpl import T from rdagent.utils.env import DockerEnv, DSDockerConf @@ -61,7 +59,7 @@ def evaluate( } de = DockerEnv(conf=ds_docker_conf) fname = "main.py" - stdout = filter_progress_bar(implementation.execute(env=de, entry=f"python {fname}")) + stdout = implementation.execute(env=de, entry=f"python {fname}") # Check score file score_fp = implementation.workspace_path / "scores.csv" diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index b9dc17af9..231ac6740 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -13,6 +13,7 @@ from typing import Any, Generic, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.utils import filter_progress_bar if typing.TYPE_CHECKING: from rdagent.core.proposal import Hypothesis @@ -235,7 +236,7 @@ def execute(self, env: Env | None = None, entry: str | None = None) -> object | self.inject_files(**self.file_dict) # TODO: env should be not None in new design (no code can run without environment) if env is not None and entry is not None: - return env.run(entry, str(self.workspace_path)) + return filter_progress_bar(env.run(entry, str(self.workspace_path))) return None def __str__(self) -> str: diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml 
b/rdagent/scenarios/data_science/dev/prompts.yaml index 37ea21d4c..59c2b0885 100644 --- a/rdagent/scenarios/data_science/dev/prompts.yaml +++ b/rdagent/scenarios/data_science/dev/prompts.yaml @@ -43,7 +43,8 @@ exp_feedback: {{ de }} {% endfor %} - Final results of the current solution: {{ cur_exp.result }} + Final results of the current solution: + {{ cur_exp.result }} ### Complete Code of current solution {{cur_exp.experiment_workspace.all_codes}} diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py index 9016b21d5..d403b212c 100644 --- a/rdagent/scenarios/data_science/dev/runner.py +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -5,7 +5,6 @@ from rdagent.core.exception import RunnerError from rdagent.log import rdagent_logger as logger from rdagent.scenarios.data_science.experiment.experiment import DSExperiment -from rdagent.utils import filter_progress_bar from rdagent.utils.env import DockerEnv, DSDockerConf @@ -18,7 +17,7 @@ def develop(self, exp: DSExperiment) -> DSExperiment: de = DockerEnv(conf=ds_docker_conf) # execute workflow - stdout = filter_progress_bar(exp.experiment_workspace.execute(env=de, entry="python main.py")) + stdout = exp.experiment_workspace.execute(env=de, entry="python main.py") score_fp = exp.experiment_workspace.workspace_path / "scores.csv" if not score_fp.exists(): From 6be3fc7f2c0fa73595a30dea76166a65d282be40 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 15 Jan 2025 10:33:04 +0000 Subject: [PATCH 279/304] model proposal with workflow --- .../coder/data_science/model/__init__.py | 3 ++- .../coder/data_science/model/eval.py | 20 +++++++++++++++++++ .../coder/data_science/model/prompts.yaml | 9 +++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 537d3fab5..aa7d2f45e 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -67,6 +67,7 @@ def implement_one_task( task_desc=model_information_str, data_loader_code=workspace.file_dict.get("load_data.py"), feature_code=workspace.file_dict["feature.py"], + workflow_code=workspace.file_dict.get("main.py"), queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], out_spec=BatchEditOut.get_spec(), @@ -96,7 +97,7 @@ def implement_one_task( # 3. post process to align file name to the task name batch_edit = { - (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" else key): value + (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" and key.startswith("model") else key): value for key, value in batch_edit.items() } diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 38c52e5d8..cda09d2f3 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -5,6 +5,7 @@ import json from pathlib import Path +import pandas as pd from rdagent.app.data_science.conf import DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( @@ -71,6 +72,25 @@ def evaluate( raise CoderError( "The execution output contains too many progress bars and results in the LLM's token size exceeding the limit." ) + fname = "main.py" + if "Model code test passed successfully." 
in stdout and implementation.file_dict.get(fname): + stdout = filter_progress_bar(implementation.execute(env=de, entry=f"python {fname}")) + + # Check score file + score_fp = implementation.workspace_path / "scores.csv" + if not score_fp.exists(): + stdout += "\nMetrics file (scores.csv) is not generated." + else: + score_df = pd.read_csv(score_fp, index_col=0) + model_set_in_scores = set(score_df.index) + model_set_in_folder = set( + f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f) and "test" not in f + ) + for model in model_set_in_folder: + if model not in model_set_in_scores: + stdout += ( + f"\nModel {model} is not evaluated in the scores.csv. The scores.csv has {model_set_in_scores}." + ) system_prompt = T(".prompts:model_eval.system").r( task_desc=target_task.get_task_information(), diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index 52215eafc..ec6cecda2 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -14,6 +14,11 @@ model_coder: --------- Feature Engineering Code: --------- {{feature_code}} + {% if workflow_code %} + --------- Workflow Code (main.py): --------- + {{workflow_code}} + {% endif %} + Instructions for Code Generation: Leveraging User Inputs: The user may provide various forms of additional information to guide you: @@ -33,6 +38,10 @@ model_coder: {% if out_spec %} {{out_spec}} The file name should be the model name described in the model task in the format "{task_name}.py". You should always follow this name format. + {% if workflow_code %} + If the workflow code is provided, you should also consider modify the workflow code in "main.py" if the model implementation is changed. + Please make sure all model files are used in the workflow code in "main.py". + {% endif %} {% else %} Formatting Your Response: Return only the code in a JSON format as shown below. Do not include any explanations or extra text. Example: From c3ddfb0bfc8a9a0f23480256984532a3b260cb49 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 11:53:34 +0000 Subject: [PATCH 280/304] add submission check in workflow eval --- .../coder/data_science/workflow/eval.py | 6 +++++- .../workflow/eval_tests/submission_check.txt | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 rdagent/components/coder/data_science/workflow/eval_tests/submission_check.txt diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index 9b717a2c9..7eb179ece 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -83,6 +83,10 @@ def evaluate( submission_fp = implementation.workspace_path / "submission.csv" if not submission_fp.exists(): stdout += "\nSubmission file (submission.csv) is not generated." 
+ else: + check_code = (DIRNAME / "eval_tests" / "submission_check.txt").read_text() + implementation.inject_files(**{"submission_check.py": check_code}) + stdout += implementation.execute(env=de, entry="python submission_check.py") system_prompt = T(".prompts:workflow_eval.system").r( scenario=self.scen.get_scenario_all_desc(), @@ -90,7 +94,7 @@ def evaluate( spec=implementation.file_dict["spec/workflow.md"], ) user_prompt = T(".prompts:workflow_eval.user").r( - stdout=stdout, + stdout=stdout.strip(), code=implementation.file_dict["main.py"], ) resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) diff --git a/rdagent/components/coder/data_science/workflow/eval_tests/submission_check.txt b/rdagent/components/coder/data_science/workflow/eval_tests/submission_check.txt new file mode 100644 index 000000000..abca1553f --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/eval_tests/submission_check.txt @@ -0,0 +1,18 @@ +import pandas as pd +from pathlib import Path + +# Check if the sample submission file exists +if not Path("/kaggle/input/sample_submission.csv").exists(): + exit(0) + +sample_submission = pd.read_csv('/kaggle/input/sample_submission.csv') +our_submission = pd.read_csv('submission.csv') + +success = True +for col in sample_submission.columns: + if col not in our_submission.columns: + success = False + print(f'Column {col} not found in submission.csv') + +if success: + print('submission.csv is valid.') \ No newline at end of file From 0e88ee0c890c4f1cebac601695ccbdaa05f4efc3 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 12:03:44 +0000 Subject: [PATCH 281/304] fix bug --- rdagent/components/coder/data_science/model/eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index cda09d2f3..a873f21e5 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -4,6 +4,7 @@ """ import json +import re from pathlib import Path import pandas as pd @@ -74,7 +75,7 @@ def evaluate( ) fname = "main.py" if "Model code test passed successfully." 
in stdout and implementation.file_dict.get(fname): - stdout = filter_progress_bar(implementation.execute(env=de, entry=f"python {fname}")) + stdout = implementation.execute(env=de, entry=f"python {fname}") # Check score file score_fp = implementation.workspace_path / "scores.csv" @@ -84,7 +85,7 @@ def evaluate( score_df = pd.read_csv(score_fp, index_col=0) model_set_in_scores = set(score_df.index) model_set_in_folder = set( - f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f) and "test" not in f + f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f) ) for model in model_set_in_folder: if model not in model_set_in_scores: From b8852d1f6557dab29529b7f98344e7ef2b0a8245 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 12:06:33 +0000 Subject: [PATCH 282/304] small change --- rdagent/components/coder/data_science/workflow/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py index f6484c78d..584075ede 100644 --- a/rdagent/components/coder/data_science/workflow/eval.py +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -69,7 +69,7 @@ def evaluate( score_df = pd.read_csv(score_fp, index_col=0) model_set_in_scores = set(score_df.index) model_set_in_folder = set( - f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_.+\.py$", f) and "test" not in f + f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f) ) for model in model_set_in_folder: if model not in model_set_in_scores: From 8086e392d560f24ce53a51d12c38478b2f5fc0ab Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 12:07:50 +0000 Subject: [PATCH 283/304] fix CI --- rdagent/components/coder/data_science/model/eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index a873f21e5..74e304d58 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -6,6 +6,7 @@ import json import re from pathlib import Path + import pandas as pd from rdagent.app.data_science.conf import DS_RD_SETTING From 953a1d79ca6bc4e1368a8c3d454c7db262a7b2ec Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Wed, 15 Jan 2025 12:14:12 +0000 Subject: [PATCH 284/304] fix CI --- rdagent/components/coder/data_science/model/__init__.py | 6 +++++- rdagent/components/coder/data_science/model/eval.py | 4 +--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index aa7d2f45e..b8c335dc9 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -97,7 +97,11 @@ def implement_one_task( # 3. 
post process to align file name to the task name batch_edit = { - (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" and key.startswith("model") else key): value + ( + f"{target_task.name}.py" + if value != "__DEL__" and key != f"{target_task.name}.py" and key.startswith("model") + else key + ): value for key, value in batch_edit.items() } diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 74e304d58..86b09342b 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -90,9 +90,7 @@ def evaluate( ) for model in model_set_in_folder: if model not in model_set_in_scores: - stdout += ( - f"\nModel {model} is not evaluated in the scores.csv. The scores.csv has {model_set_in_scores}." - ) + stdout += f"\nModel {model} is not evaluated in the scores.csv. The scores.csv has {model_set_in_scores}." system_prompt = T(".prompts:model_eval.system").r( task_desc=target_task.get_task_information(), From ad473601f7aa2cc40fe8d1991a92cb6870bb5b3f Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 16 Jan 2025 03:30:58 +0000 Subject: [PATCH 285/304] refactor: Move generate_diff to utils and update DSExpGen logic --- rdagent/app/data_science/conf.py | 1 + .../scenarios/data_science/dev/feedback.py | 57 +--- .../data_science/experiment/experiment.py | 1 + .../data_science/proposal/exp_gen.py | 293 +++++++----------- .../data_science/proposal/prompts.yaml | 99 +++++- rdagent/scenarios/data_science/share.yaml | 4 +- rdagent/utils/repo/diff.py | 56 ++++ 7 files changed, 253 insertions(+), 258 deletions(-) create mode 100644 rdagent/utils/repo/diff.py diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index d31a93fac..25a92f3e3 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -12,6 +12,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): ## proposal exp_gen: str = "rdagent.scenarios.data_science.proposal.exp_gen.DSExpGen" + # exp_gen_init_kwargs: dict = {"max_trace_hist": 3} # TODO: to be configurable # the two below should be used in ExpGen # hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen" diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index 42738c479..d92145d4a 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -1,7 +1,4 @@ -import difflib import json -from pathlib import Path -from typing import List from rdagent.components.knowledge_management.graph import UndirectedNode from rdagent.core.experiment import Experiment @@ -17,59 +14,7 @@ from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace from rdagent.utils import convert2bool, remove_path_info_from_str from rdagent.utils.agent.tpl import T - - -# TODO: find a better place. -def generate_diff(dir1: str, dir2: str) -> List[str]: - """ - Generate a diff between two directories, considering only .py files. - It is mocking `diff -durN dir1 dir2` in linux. - - Args: - dir1 (str): Path to the first directory. - dir2 (str): Path to the second directory. - - Returns: - List[str]: A list of diffs for .py files that are different between the two directories. 
- """ - - diff_files = [] - - dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob("*.py") if f.is_file()} - dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob("*.py") if f.is_file()} - - all_files = dir1_files.union(dir2_files) - - for file in all_files: - file1 = Path(dir1) / file - file2 = Path(dir2) / file - - if file1.exists() and file2.exists(): - with file1.open() as f1, file2.open() as f2: - diff = list( - difflib.unified_diff(f1.readlines(), f2.readlines(), fromfile=str(file1), tofile=str(file2)) - ) - if diff: - diff_files.extend(diff) - else: - if file1.exists(): - with file1.open() as f1: - diff = list( - difflib.unified_diff( - f1.readlines(), [], fromfile=str(file1), tofile=str(file2) + " (empty file)" - ) - ) - diff_files.extend(diff) - elif file2.exists(): - with file2.open() as f2: - diff = list( - difflib.unified_diff( - [], f2.readlines(), fromfile=str(file1) + " (empty file)", tofile=str(file2) - ) - ) - diff_files.extend(diff) - - return diff_files +from rdagent.utils.repo.diff import generate_diff class DSExperiment2Feedback(Experiment2Feedback): diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 565d45931..51ec6c30f 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,4 +1,5 @@ import re +import pandas as pd from typing import Literal from rdagent.core.experiment import Experiment, FBWorkspace, Task diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 1801a4ee2..3e413ea29 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -22,6 +22,7 @@ from rdagent.scenarios.data_science.experiment.experiment import COMPONENT, DSExperiment from rdagent.scenarios.data_science.scen import DataScienceScen from rdagent.utils.agent.tpl import T +from rdagent.utils.repo.diff import generate_diff class DSHypothesis(Hypothesis): @@ -96,7 +97,11 @@ def last_successful_exp(self) -> DSExperiment | None: class DSExpGen(ExpGen): """Data Science Task Generator.""" - def llm_task_gen( + def __init__(self, scen: DataScienceScen, max_trace_hist: int = 3) -> None: + self.max_trace_hist = max_trace_hist # max number of historical trace to know when propose new experiment + super().__init__(scen) + + def _init_task_gen( self, targets: str, scenario_desc: str, @@ -147,7 +152,7 @@ def _handle_missing_component( last_successful_exp: Last successful experiment or None spec_file: Path to specification file if needed """ - resp_dict = self.llm_task_gen( + resp_dict = self._init_task_gen( targets=component, scenario_desc=scenario_desc, spec=last_successful_exp.experiment_workspace.file_dict[spec_file] if spec_file else None, @@ -188,7 +193,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: else: next_missing_component = last_successful_exp.next_component_required() - component_config = { + init_component_config = { "DataLoadSpec": {"task_cls": DataLoaderTask, "spec_file": None, "component_prompt_key": "data_loader"}, "FeatureEng": {"task_cls": FeatureTask, "spec_file": "spec/feature.md", "component_prompt_key": "feature"}, "Model": {"task_cls": ModelTask, "spec_file": "spec/model.md", "component_prompt_key": "model"}, @@ -196,8 +201,10 @@ def gen(self, trace: DSTrace) -> DSExperiment: "Workflow": {"task_cls": WorkflowTask, "spec_file": "spec/workflow.md", "component_prompt_key": 
"workflow"}, } - if next_missing_component in component_config: - config = component_config[next_missing_component] + if next_missing_component in init_component_config: + # TODO: we may merge the if else logic in the future. + # the current + config = init_component_config[next_missing_component] return self._handle_missing_component( component=next_missing_component, task_cls=config["task_cls"], @@ -216,19 +223,17 @@ def gen(self, trace: DSTrace) -> DSExperiment: # - Previous Feedback # - Current sota implementation (encourage change based on it) # - Extra RAG - assert last_successful_exp is not None, "SOTA experiment is not provided." + sota_exp = trace.sota_experiment() + assert sota_exp is not None, "SOTA experiment is not provided." exp_and_feedback = trace.hist[-1] last_exp = exp_and_feedback[0] # Step 1: Generate component # Describe current best solution using shared template - sota_solution = trace.sota_experiment() sota_exp_desc = T("scenarios.data_science.share:describe.exp").r( - exp=last_successful_exp, heading="Best of previous exploration of the scenario" - ) - current_exp_desc = T("scenarios.data_science.share:describe.exp").r( - exp=last_exp, heading="Current exploration of the scenario" + exp=sota_exp, heading="Best of previous exploration of the scenario" ) + last_exp_diff = "\n".join(generate_diff(sota_exp.experiment_workspace.workspace_path, last_exp.experiment_workspace.workspace_path)) exp_and_feedback_desc = T("scenarios.data_science.share:describe.feedback").r( exp_and_feedback=exp_and_feedback ) @@ -237,7 +242,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: component_sys_prompt = T(".prompts:component_gen.system").r( scenario=scenario_desc, sota_exp_desc=sota_exp_desc, - current_exp_desc=current_exp_desc, + last_exp_diff=last_exp_diff, component_output_format=T(".prompts:output_format.component").r(), ) @@ -253,195 +258,111 @@ def gen(self, trace: DSTrace) -> DSExperiment: component = resp_dict_component.get("component", "Component not provided") - # Why we should split component selection and hypothesis generation + # Why we should split component selection and steps after? # - after we know the selected component, we can use RAG. 
- # Step 2: Generate the rest of the hypothesis - if component != "Model": - hypothesis_sys_prompt = T(".prompts:hypothesis_gen.system").r( - targets="data science project", - scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), - hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), + # Step 2: Generate the rest of the hypothesis & task + component_task_mapping = { + # TODO: merge the tow names, DataLoadSpec, data_loader, make all the code easier + "DataLoadSpec": { + "target_name": "Data loader and specification generation", + "spec_file": "spec/data_loader.md", + "task_output_format": T(".prompts:output_format.data_loader").r(), + "task_class": DataLoaderTask, + }, + "FeatureEng": { + "target_name": "Feature engineering", + "spec_file": "spec/feature.md", + "task_output_format": T(".prompts:output_format.feature").r(), + "task_class": FeatureTask, + }, + "Model": { + "target_name": "Building model", + "spec_file": "spec/model.md", + "task_output_format": T(".prompts:output_format.model").r(), + "task_class": ModelTask, + "extra_params": { + "model_type": "Model type not provided", + "architecture": "Model architecture not provided", + "hyperparameters": "Model hyperparameters not provided", + }, + "extra_requirement": T(".prompts:extra_requirement.model").r(), + }, + "Ensemble": { + "target_name": "Ensemble", + "spec_file": "spec/ensemble.md", + "task_output_format": T(".prompts:output_format.ensemble").r(), + "task_class": EnsembleTask, + }, + "Workflow": { + "target_name": "Workflow", + "spec_file": "spec/workflow.md", + "task_output_format": T(".prompts:output_format.workflow").r(), + "task_class": WorkflowTask, + } + } + + component_info = component_task_mapping.get(component) + + if component_info: + system_prompt = T(".prompts:direct_exp_gen.system").r( + targets=component_info["target_name"], component=component, - ) - hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( - targets="data science project", - exp_and_feedback_desc=exp_and_feedback_desc, + scenario=scenario_desc, + hypothesis_output_format=T(".prompts:output_format.hypothesis"), + task_specification=sota_exp.experiment_workspace.file_dict[component_info["spec_file"]], + task_output_format=component_info["task_output_format"], + extra_requirement=component_info.get("extra_requirement"), ) - resp_dict: dict = json.loads( - APIBackend().build_messages_and_create_chat_completion( - hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True - ) - ) - hypothesis = DSHypothesis( - component=resp_dict.get("component", "Component not provided"), - hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), - reason=resp_dict.get("reason", "Reason not provided"), - concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"), - concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"), - concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"), - concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), - ) - else: - model_infos = [] - score_df = pd.read_csv( - last_successful_exp.experiment_workspace.workspace_path / "scores.csv", index_col=0 - ) - metric_name = score_df.columns[0] - for fname in last_successful_exp.experiment_workspace.file_dict: - if re.match(r"^model_(?!test)\w+\.py$", fname): - model_str = f"{fname}:\n{metric_name} on valid: 
{score_df.loc[fname[:-3]]}\n```python\n{last_successful_exp.experiment_workspace.file_dict[fname]}\n```\n" - model_infos.append(model_str) - - model_num = len(model_infos) - models_info_str = ("-" * 20).join(model_infos) - if model_num >= 3: - hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( - targets="data science project", - scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), - hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - model_info=models_info_str, - model_enough=True, - ) - else: - hypothesis_sys_prompt = T(".prompts:hypothesis_model.system").r( - targets="data science project", - scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), - hypothesis_specification=T(".prompts:hypothesis_specification").r(sota_solution=sota_solution), - model_info=models_info_str, - model_enough=False, - ) - hypothesis_user_prompt = T(".prompts:hypothesis_gen.user").r( - targets="data science project", + recent_trace_desc = [] + for i in range(self.max_trace_hist): + if i < len(trace.hist): + eaf = trace.hist[-i - 1] + if eaf[1].decision: + # we only add failed direction incase of trying same invalid direction + break + recent_trace_desc.insert(0, + T("scenarios.data_science.share:describe.feedback").r( + exp_and_feedback=eaf + ) + ) + user_prompt = T(".prompts:direct_exp_gen.user").r( exp_and_feedback_desc=exp_and_feedback_desc, + sota_exp_desc=sota_exp_desc, + last_exp_diff=last_exp_diff, + recent_trace_desc="\n".join(recent_trace_desc), ) - resp_dict: dict = json.loads( + + resp_dict = json.loads( APIBackend().build_messages_and_create_chat_completion( - hypothesis_user_prompt, hypothesis_sys_prompt, json_mode=True + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) ) - hypothesis = DSHypothesis( - component=resp_dict.get("component", "Component not provided"), - hypothesis=resp_dict.get("hypothesis", "Hypothesis not provided"), - reason=resp_dict.get("reason", "Reason not provided"), - concise_reason=resp_dict.get("concise_reason", "Concise reason not provided"), - concise_observation=resp_dict.get("concise_observation", "Concise observation not provided"), - concise_justification=resp_dict.get("concise_justification", "Concise justification not provided"), - concise_knowledge=resp_dict.get("concise_knowledge", "Concise knowledge not provided"), - ) - - # 2. 
gen experiment - if hypothesis.component == "DataLoadSpec": - resp_dict = self.llm_task_gen( - targets="Data loader and specification generation", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/data_loader.md"], - hypothesis=hypothesis, - task_output_format=T(".prompts:output_format.data_loader").r(), - exp_and_feedback_desc=exp_and_feedback_desc, - ) - dt = DataLoaderTask( - name="Data loader and specification generation", - description=resp_dict.get( - "description", "Data loader and specification generation description not provided" - ), - ) - - exp = DSExperiment(sub_tasks=[dt], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder( - last_successful_exp.experiment_workspace.workspace_path - ) - return exp - elif hypothesis.component == "FeatureEng": - # TODO: RAG - resp_dict = self.llm_task_gen( - targets="Feature Engineering", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/feature.md"], - hypothesis=hypothesis, - task_output_format=T(".prompts:output_format.feature").r(), - exp_and_feedback_desc=exp_and_feedback_desc, - ) - - ft = FeatureTask( - name="Feature Engineering", - description=resp_dict.get("description", "Feature description not provided"), - ) - - exp = DSExperiment(sub_tasks=[ft], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder( - last_successful_exp.experiment_workspace.workspace_path - ) - return exp - elif hypothesis.component == "Model": - resp_dict = self.llm_task_gen( - targets="Models", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/model.md"], - hypothesis=hypothesis, - workspace_code=last_successful_exp.experiment_workspace.all_codes, - task_output_format=T(".prompts:output_format.model").r(), - exp_and_feedback_desc=exp_and_feedback_desc, - ) - - mt = ModelTask( - name=resp_dict.get("model_name", "Model name not provided"), - description=resp_dict.get("description", "Model description not provided"), - model_type=resp_dict.get("model_type", "Model type not provided"), - architecture=resp_dict.get("architecture", "Model architecture not provided"), - hyperparameters=resp_dict.get("hyperparameters", "Model hyperparameters not provided"), - base_code="", - ) - - exp = DSExperiment(sub_tasks=[mt], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder( - last_successful_exp.experiment_workspace.workspace_path - ) - return exp - elif hypothesis.component == "Ensemble": - resp_dict = self.llm_task_gen( - targets="Ensemble", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/ensemble.md"], - hypothesis=hypothesis, - task_output_format=T(".prompts:output_format.ensemble").r(), - exp_and_feedback_desc=exp_and_feedback_desc, - ) - - et = EnsembleTask( - name="Ensemble", - description=resp_dict.get("description", "Ensemble description not provided"), - ) - - exp = DSExperiment(sub_tasks=[et], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder( - last_successful_exp.experiment_workspace.workspace_path - ) - return exp - elif hypothesis.component == "Workflow": - resp_dict = self.llm_task_gen( - targets="Workflow", - scenario_desc=scenario_desc, - spec=last_successful_exp.experiment_workspace.file_dict["spec/workflow.md"], - hypothesis=hypothesis, - task_output_format=T(".prompts:output_format.workflow").r(), - exp_and_feedback_desc=exp_and_feedback_desc, + task_class = component_info["task_class"] + 
task_name = resp_dict["model_name"] if component == "Model" else component + description = resp_dict.get("description", f"{component_info['target_name']} description not provided") + hypothesis = DSHypothesis( + component=component, + hypothesis=resp_dict.get("hypothesis", ""), + reason=resp_dict.get("reason", ""), + concise_reason=resp_dict.get("concise_reason", ""), + concise_observation=resp_dict.get("concise_observation", ""), + concise_justification=resp_dict.get("concise_justification", ""), + concise_knowledge=resp_dict.get("concise_knowledge", "") ) - wt = WorkflowTask( - name="Workflow", - description=resp_dict.get("description", "Workflow description not provided"), + task = task_class( + name=task_name, + description=description, + **{k: resp_dict.get(k, v) for k, v in component_info.get("extra_params", {}).items()} ) - exp = DSExperiment(sub_tasks=[wt], hypothesis=hypothesis) + exp = DSExperiment(sub_tasks=[task], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder( - last_successful_exp.experiment_workspace.workspace_path + sota_exp.experiment_workspace.workspace_path ) return exp - - return super().gen(trace) + else: + raise ValueError(f"Unknown component: {component}") diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 2166d1669..8e4e67208 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -1,4 +1,4 @@ -hypothesis_gen: +hypothesis_gen: # It is deprecated now, please refer to direct_exp_gen system: |- The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. The {{targets}} are used in the following scenario: @@ -33,7 +33,7 @@ hypothesis_gen: In addition, generate relevant reasoning and distilled knowledge keys. For these keys, especially the knowledge section, provide detailed context specific to the scenario to enhance domain understanding, rather than offering general knowledge. -hypothesis_model: +hypothesis_model: # It is deprecated now, please refer to direct_exp_gen system: |- The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. The {{targets}} are used in the following scenario: @@ -59,15 +59,7 @@ hypothesis_model: Please generate the output using the following format and specifications: {{ hypothesis_output_format }} -hypothesis_and_feedback: |- - {% for experiment, feedback in hist %} - Hypothesis {{ loop.index }} - Observation on the result with the hypothesis: {{ feedback.observations }} - Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} - Did changing to this hypothesis work? (focus on the change): {{ feedback.decision }} - {% endfor %} - -task_gen: +task_gen: # It is deprecated now, please refer to direct_exp_gen system: |- {% if hypothesis is not none %} The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. @@ -111,7 +103,7 @@ task_gen: Please generate the new {{targets}} task. {% endif %} -task_gen_model: +task_gen_model: # It is deprecated now, please refer to direct_exp_gen system: |- {% if hypothesis is not none %} The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. @@ -143,6 +135,85 @@ task_gen_model: Please generate the new {{targets}} task. 
{% endif %} +direct_exp_gen: + system: |- + You are a data scientist and a top Kaggle competitor. The user is working on creating a solution for a Kaggle competition. Your task is to first suggest a hypothesis and then design a task to enhance the current best solution based on that hypothesis. + + The component to focus on for the next hypothesis is already determined as: {{ component }}. + It will be used in the following scenario: + {{scenario}} + + # Hypothesis Proposal + + The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you later. Your task is to check if a similar hypothesis has already been generated. If one exists and you agree with it, you can use it. If you disagree, please create an improved version. + + To assist you in formulating new hypotheses, the user has provided some additional information: + Hypothesis should avoid being too general and vague, and should be specific and actionable. For example, hypothesis like 'tune a model' is too general, while hypothesis like 'increase the learning rate to 0.1 of the lightgbm model will improve the performance' is specific and actionable. + Your hypothesis should based on current SOTA solution. The user will conduct experiments based on the SOTA solution(current best experiments) to test whether your hypothesis is right on this specific competition. + Important: If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. + + [Partial Response Format 1]Your generated output should contain key-value pairs adhering to the following format and specifications: + {{ hypothesis_output_format }} + Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. + + # Task Design + + The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. + + The scope of the {{targets}} can be described by a interface specification as follows + ```Python + {{task_specification}} + ``` + + The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{targets}} for. + 2. The hypothesis generated in the previous steps and their corresponding feedbacks. + 3. Former proposed {{targets}} on similar hypothesis. + 4. Some additional information to help you generate new {{targets}}. + + [Partial Response Format 2] Your generated output should contain key-value pairs adhering to the following format and specifications: + {{ task_output_format }} + + {% if extra_requirement %} + {{extra_requirement}} + {% endif %} + + # Response Requirement + Your generated output should combine the key-value pairs based on [Partial Response Format 1] and [Partial Response Format 2]. Please output the key-values specified in [Partial Response Format 1] first. + + user: |- + # The detailed description of current best experiments + {{sota_exp_desc}} + + ## The according feedbacks for the best experiments + {{ exp_and_feedback_desc }} + + {% if recent_trace_desc %} + # Several trials after the best experiments + The user has made several hypothesis on this scenario and did several evaluation on them. 
+ The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree): + {{recent_trace_desc}} + + # The difference from the best experiments to the last one + {{last_exp_diff}} + {% endif %} + + +extra_requirement: + model: |- + If there are sufficient models available. Your task is to choose one of the existing models for further tuning or optimization. Based on the model's information: + + If the number of available models is insufficient . Your task is to first decide whether to: + - Tune an existing model: Select one of the current models for further tuning and improvement. + - Add a new model: Introduce a new model to expand the hypothesis space. + + The information of the model is described by the code of workspace. + + Make a decision and proceed accordingly: + - If you decide to tune an existing model, select the existing model file and generate a new hypothesis. + - If you decide to add a new model, specify the type of model you would add and generate a new hypothesis related to the new model. + + component_gen: system: |- You are a Kaggle Grander Master. You are going to provide a solution for a kaggle competition. @@ -155,8 +226,8 @@ component_gen: # Here is the current best version of implementation. {{sota_exp_desc}} - # Here is the latest version of implementation - {{current_exp_desc}} + # Here is the latest version of implementation different from the sota_exp_desc + {{last_exp_diff}} You will be provided the feedback for the latest implementation. diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml index c1435e5cd..36a3e0420 100644 --- a/rdagent/scenarios/data_science/share.yaml +++ b/rdagent/scenarios/data_science/share.yaml @@ -16,7 +16,7 @@ describe: # some template to describe some object {% if exp.result is none %} There are no according evaluation results {% else %} - Evaluated results is: + Evaluated results on validation is: {{ exp.result }} {% endif %} @@ -27,7 +27,7 @@ describe: # some template to describe some object feedback: |- {% if exp_and_feedback and exp_and_feedback|length > 1 %} ## {{heading | default('Previous trial and feedback')}} - Before current trial, previous recent trial is listed below. + Before current trial, a previous recent trial is listed below. {% if exp_and_feedback[0].hypothesis %} the experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }} {% endif %} diff --git a/rdagent/utils/repo/diff.py b/rdagent/utils/repo/diff.py new file mode 100644 index 000000000..b68fd3bca --- /dev/null +++ b/rdagent/utils/repo/diff.py @@ -0,0 +1,56 @@ +import difflib +from pathlib import Path +from typing import List + + +def generate_diff(dir1: str, dir2: str) -> List[str]: + """ + Generate a diff between two directories(from dir1 to dir2), considering only .py files. + It is mocking `diff -durN dir1 dir2` in linux. + + Args: + dir1 (str): Path to the first directory. + dir2 (str): Path to the second directory. + + Returns: + List[str]: A list of diffs for .py files that are different between the two directories. 
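+
+    Example (illustrative usage; the directory names are hypothetical):
+        >>> diff_lines = generate_diff("workspace_old", "workspace_new")
+        >>> print("".join(diff_lines))  # unified diff of every differing .py file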
+ """ + + diff_files = [] + + dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob("*.py") if f.is_file()} + dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob("*.py") if f.is_file()} + + all_files = dir1_files.union(dir2_files) + + for file in all_files: + file1 = Path(dir1) / file + file2 = Path(dir2) / file + + if file1.exists() and file2.exists(): + with file1.open() as f1, file2.open() as f2: + diff = list( + difflib.unified_diff(f1.readlines(), f2.readlines(), fromfile=str(file1), tofile=str(file2)) + ) + if diff: + diff_files.extend(diff) + else: + if file1.exists(): + with file1.open() as f1: + diff = list( + difflib.unified_diff( + f1.readlines(), [], fromfile=str(file1), tofile=str(file2) + " (empty file)" + ) + ) + diff_files.extend(diff) + elif file2.exists(): + with file2.open() as f2: + diff = list( + difflib.unified_diff( + [], f2.readlines(), fromfile=str(file1) + " (empty file)", tofile=str(file2) + ) + ) + diff_files.extend(diff) + + return diff_files + From 534c398fff21c795177812d8cf697ba755f26883 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 16 Jan 2025 04:23:34 +0000 Subject: [PATCH 286/304] more reasonable prompt describing metric direction --- rdagent/scenarios/data_science/scen/prompts.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index e239d3d6f..6957fbb07 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -10,7 +10,10 @@ scenario_description: |- {{evaluation}} {% endif %} - The evaluation metric used is directed as: {{metric_direction}}. + The evaluation metrics used is directed as: + { % if metric_direction %}The metric is better when it is bigger. + { % else %}The metric is better when it is smaller. + { % endif %} competition_description_template: system: |- From 97c95026ae3d8624699f4b6f22d322817e27bb57 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 16 Jan 2025 04:56:16 +0000 Subject: [PATCH 287/304] fix a minor jinja2 bug --- rdagent/scenarios/data_science/scen/prompts.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml index 6957fbb07..40fa4b0d0 100644 --- a/rdagent/scenarios/data_science/scen/prompts.yaml +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -11,9 +11,9 @@ scenario_description: |- {% endif %} The evaluation metrics used is directed as: - { % if metric_direction %}The metric is better when it is bigger. - { % else %}The metric is better when it is smaller. - { % endif %} + {% if metric_direction %}The metric is better when it is bigger. + {% else %}The metric is better when it is smaller. 
+ {% endif %} competition_description_template: system: |- From bd402fac0a0cf955c7a6da8f85e123eb321f0b4d Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 16 Jan 2025 06:59:32 +0000 Subject: [PATCH 288/304] quick fix exp_gen bugs --- rdagent/app/data_science/loop.py | 1 + .../data_science/proposal/exp_gen.py | 28 +++++++++---------- .../data_science/proposal/prompts.yaml | 15 ++++++++-- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 314eda89f..73b97062c 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -108,6 +108,7 @@ def record(self, prev_out: dict[str, Any]): ExperimentFeedback.from_exception(e), ) ) + logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment") def main(path=None, step_n=None, competition="bms-molecular-translation"): diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 3e413ea29..3d2bfd655 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -43,7 +43,7 @@ def __init__( def __str__(self) -> str: if self.hypothesis == "": - return f"Chosen Component: {self.component}" + return f"No hypothesis available. Trying to construct the first runnable {self.component} component." return f"""Chosen Component: {self.component} Hypothesis: {self.hypothesis} Reason: {self.reason} @@ -176,7 +176,7 @@ def _handle_missing_component( task = task_cls( name=component if component != "Model" else resp_dict.pop("model_name"), - **resp_dict, + **resp_dict.get("extra_params", {}), ) exp = DSExperiment(sub_tasks=[task], hypothesis=DSHypothesis(component)) @@ -233,7 +233,11 @@ def gen(self, trace: DSTrace) -> DSExperiment: sota_exp_desc = T("scenarios.data_science.share:describe.exp").r( exp=sota_exp, heading="Best of previous exploration of the scenario" ) - last_exp_diff = "\n".join(generate_diff(sota_exp.experiment_workspace.workspace_path, last_exp.experiment_workspace.workspace_path)) + last_exp_diff = "\n".join( + generate_diff( + sota_exp.experiment_workspace.workspace_path, last_exp.experiment_workspace.workspace_path + ) + ) exp_and_feedback_desc = T("scenarios.data_science.share:describe.feedback").r( exp_and_feedback=exp_and_feedback ) @@ -299,7 +303,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: "spec_file": "spec/workflow.md", "task_output_format": T(".prompts:output_format.workflow").r(), "task_class": WorkflowTask, - } + }, } component_info = component_task_mapping.get(component) @@ -309,7 +313,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: targets=component_info["target_name"], component=component, scenario=scenario_desc, - hypothesis_output_format=T(".prompts:output_format.hypothesis"), + hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), task_specification=sota_exp.experiment_workspace.file_dict[component_info["spec_file"]], task_output_format=component_info["task_output_format"], extra_requirement=component_info.get("extra_requirement"), @@ -322,10 +326,8 @@ def gen(self, trace: DSTrace) -> DSExperiment: if eaf[1].decision: # we only add failed direction incase of trying same invalid direction break - recent_trace_desc.insert(0, - T("scenarios.data_science.share:describe.feedback").r( - exp_and_feedback=eaf - ) + recent_trace_desc.insert( + 0, T("scenarios.data_science.share:describe.feedback").r(exp_and_feedback=eaf) ) user_prompt = T(".prompts:direct_exp_gen.user").r( 
exp_and_feedback_desc=exp_and_feedback_desc, @@ -350,19 +352,17 @@ def gen(self, trace: DSTrace) -> DSExperiment: concise_reason=resp_dict.get("concise_reason", ""), concise_observation=resp_dict.get("concise_observation", ""), concise_justification=resp_dict.get("concise_justification", ""), - concise_knowledge=resp_dict.get("concise_knowledge", "") + concise_knowledge=resp_dict.get("concise_knowledge", ""), ) task = task_class( name=task_name, description=description, - **{k: resp_dict.get(k, v) for k, v in component_info.get("extra_params", {}).items()} + **{k: resp_dict.get(k, v) for k, v in component_info.get("extra_params", {}).items()}, ) exp = DSExperiment(sub_tasks=[task], hypothesis=hypothesis) - exp.experiment_workspace.inject_code_from_folder( - sota_exp.experiment_workspace.workspace_path - ) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) return exp else: raise ValueError(f"Unknown component: {component}") diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 8e4e67208..a9ffa9e79 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -174,6 +174,12 @@ direct_exp_gen: [Partial Response Format 2] Your generated output should contain key-value pairs adhering to the following format and specifications: {{ task_output_format }} + Your response should contain two parts: the hypothesis proposal and the task design. Please follow the format and specifications provided below: + { + "hypothesis_proposal": [Partial Response Format 1], + "task_design": [Partial Response Format 2] + } + {% if extra_requirement %} {{extra_requirement}} {% endif %} @@ -293,12 +299,15 @@ output_format: { "model_name": "model name, must start with 'model_' and only contain letters, numbers, and underscores", "description": "A detailed description of the model", - "model_type": "The type of the model, e.g., neural network, tree-based model, etc.", - "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", - "hyperparameters": { + "extra_params": + { + "model_type": "The type of the model, e.g., neural network, tree-based model, etc.", + "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", + "hyperparameters": { "hyperparameter_name_1": "value of hyperparameter 1", "hyperparameter_name_2": "value of hyperparameter 2", "hyperparameter_name_3": "value of hyperparameter 3" + }, }, } Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. 
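For reference, a response that satisfies the model output format above would be shaped roughly as in the sketch below. The literal values are hypothetical, and the default-filling only mirrors the fallback strings declared for the Model component elsewhere in this series; it is not the exact consuming code.

```python
# Illustrative example of a response matching output_format.model above.
# The concrete values are hypothetical; only the key layout matters.
resp_dict = {
    "model_name": "model_lgbm_baseline",
    "description": "A LightGBM baseline with early stopping",
    "extra_params": {
        "model_type": "tree-based model",
        "architecture": "gradient-boosted decision trees",
        "hyperparameters": {"learning_rate": "0.05", "num_leaves": "63"},
    },
}

# Fallback strings used when the LLM omits a field (matching the defaults
# declared for the Model task in this patch series).
defaults = {
    "model_type": "Model type not provided",
    "architecture": "Model architecture not provided",
    "hyperparameters": "Model hyperparameters not provided",
}

extra = resp_dict.get("extra_params", {})
task_kwargs = {k: extra.get(k, default) for k, default in defaults.items()}
print(task_kwargs)  # every key present, either from the response or from the default
```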
From 7f91db3dded28d69551d55dea4d6ae7a67a03251 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 16 Jan 2025 08:12:35 +0000 Subject: [PATCH 289/304] fix the following bug --- rdagent/scenarios/data_science/proposal/exp_gen.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 3d2bfd655..e0a31235c 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -176,6 +176,7 @@ def _handle_missing_component( task = task_cls( name=component if component != "Model" else resp_dict.pop("model_name"), + description=resp_dict.get("description", f"{component} description not provided"), **resp_dict.get("extra_params", {}), ) From 432e2b572025feb07068fc9c6d67bcf387f50c80 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 16 Jan 2025 07:08:13 +0000 Subject: [PATCH 290/304] fix --- .../data_science/proposal/exp_gen.py | 41 +++++++++- .../data_science/proposal/prompts.yaml | 78 ++++++++++--------- 2 files changed, 80 insertions(+), 39 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index e0a31235c..533540089 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -52,7 +52,45 @@ def __str__(self) -> str: Concise Justification: {self.concise_justification} Concise Knowledge: {self.concise_knowledge} """ - +COMPONENT_TASK_MAPPING = { + # TODO: merge the tow names, DataLoadSpec, data_loader, make all the code easier + "DataLoadSpec": { + "target_name": "Data loader and specification generation", + "spec_file": "spec/data_loader.md", + "task_output_format": T(".prompts:output_format.data_loader").r(), + "task_class": DataLoaderTask, + }, + "FeatureEng": { + "target_name": "Feature engineering", + "spec_file": "spec/feature.md", + "task_output_format": T(".prompts:output_format.feature").r(), + "task_class": FeatureTask, + }, + "Model": { + "target_name": "Building model", + "spec_file": "spec/model.md", + "task_output_format": T(".prompts:output_format.model").r(), + "task_class": ModelTask, + "extra_params": { + "model_type": "Model type not provided", + "architecture": "Model architecture not provided", + "hyperparameters": "Model hyperparameters not provided", + }, + "extra_requirement": T(".prompts:extra_requirement.model").r(), + }, + "Ensemble": { + "target_name": "Ensemble", + "spec_file": "spec/ensemble.md", + "task_output_format": T(".prompts:output_format.ensemble").r(), + "task_class": EnsembleTask, + }, + "Workflow": { + "target_name": "Workflow", + "spec_file": "spec/workflow.md", + "task_output_format": T(".prompts:output_format.workflow").r(), + "task_class": WorkflowTask, + } +} class DSTrace(Trace[DataScienceScen, KnowledgeBase]): def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = None) -> None: @@ -267,6 +305,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: # - after we know the selected component, we can use RAG. # Step 2: Generate the rest of the hypothesis & task + # TODO: Currently, all components are rendered each time. This can be optimized later. 
component_task_mapping = { # TODO: merge the tow names, DataLoadSpec, data_loader, make all the code easier "DataLoadSpec": { diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index a9ffa9e79..72719839a 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -1,8 +1,8 @@ hypothesis_gen: # It is deprecated now, please refer to direct_exp_gen system: |- - The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. - The {{targets}} are used in the following scenario: - {{scenario}} + The user is working on generating new hypotheses for the {{ targets }} in a data-driven research and development process. + The {{ targets }} are used in the following scenario: + {{ scenario }} The user has already proposed several hypotheses and conducted evaluations. This information will be provided to you. Your task is to: 1. Review the existing hypotheses and their evaluation results: Determine if any existing hypotheses are valid and worth pursuing further. @@ -12,7 +12,7 @@ hypothesis_gen: # It is deprecated now, please refer to direct_exp_gen 3. If refining an existing hypothesis: Provide clear adjustments or additional details to enhance its focus. 4. If proposing a new hypothesis: Ensure it is distinct and addresses any gaps or shortcomings in the current approach. - The current component to focus on is: {{component}}. + The current component to focus on is: {{ component }}. {% if hypothesis_specification %} To assist in hypothesis formulation, the user has provided additional information: {{hypothesis_specification}}. Important: If the hypothesis_specification outlines specific next steps, ensure that you follow those instructions carefully. @@ -35,9 +35,9 @@ hypothesis_gen: # It is deprecated now, please refer to direct_exp_gen hypothesis_model: # It is deprecated now, please refer to direct_exp_gen system: |- - The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. - The {{targets}} are used in the following scenario: - {{scenario}} + The user is working on generating new hypotheses for the {{ targets }} in a data-driven research and development process. + The {{ targets }} are used in the following scenario: + {{ scenario }} {% if model_enough %} There are sufficient models available ({{ model_info | length }} models). Your task is to choose one of the existing models for further tuning or optimization. Based on the model's information: {{ model_info }} @@ -62,25 +62,25 @@ hypothesis_model: # It is deprecated now, please refer to direct_exp_gen task_gen: # It is deprecated now, please refer to direct_exp_gen system: |- {% if hypothesis is not none %} - The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. + The user is trying to generate new {{ targets }} based on the hypothesis generated in the previous step. {% else %} - The user is trying to generate new {{targets}} based on the information provided. + The user is trying to generate new {{ targets }} based on the information provided. {% endif %} - The {{targets}} are used in certain scenario, the scenario is as follows: + The {{ targets }} are used in certain scenario, the scenario is as follows: {{ scenario }} {% if task_specification is not none %} - The user has wrote some specification for the {{targets}}. 
The specification is as follows: + The user has wrote some specification for the {{ targets }}. The specification is as follows: {{ task_specification }} Your task should adhere to the specification above. {% endif %} {% if hypothesis is not none %} - The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: - 1. The target hypothesis you are targeting to generate {{targets}} for. + The user will use the {{ targets }} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{ targets }} for. 2. The hypothesis generated in the previous steps and their corresponding feedbacks. - 3. Former proposed {{targets}} on similar hypothesis. - 4. Some additional information to help you generate new {{targets}}. + 3. Former proposed {{ targets }} on similar hypothesis. + 4. Some additional information to help you generate new {{ targets }}. {% endif %} Please generate the output following the format below: @@ -94,31 +94,31 @@ task_gen: # It is deprecated now, please refer to direct_exp_gen {% if hypothesis is not none %} The user has made several hypothesis on this scenario and did several evaluation on them. - The target hypothesis you are targeting to generate {{targets}} for is as follows: + The target hypothesis you are targeting to generate {{ targets }} for is as follows: {{ hypothesis }} The former hypothesis and the corresponding feedbacks are as follows: {{ exp_and_feedback_desc }} - Please generate the new {{targets}} based on the information above. + Please generate the new {{ targets }} based on the information above. {% else %} - Please generate the new {{targets}} task. + Please generate the new {{ targets }} task. {% endif %} task_gen_model: # It is deprecated now, please refer to direct_exp_gen system: |- {% if hypothesis is not none %} - The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. + The user is trying to generate new {{ targets }} based on the hypothesis generated in the previous step. {% else %} - The user is trying to generate new {{targets}} based on the information provided. + The user is trying to generate new {{ targets }} based on the information provided. {% endif %} - The {{targets}} are used in certain scenario, the scenario is as follows: + The {{ targets }} are used in certain scenario, the scenario is as follows: {{ scenario }} {% if hypothesis is not none %} - The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: - 1. The target hypothesis you are targeting to generate {{targets}} for. + The user will use the {{ targets }} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{ targets }} for. 2. The hypothesis generated in the previous steps and their corresponding feedbacks. - 3. Former proposed {{targets}} on similar hypothesis. - 4. Some additional information to help you generate new {{targets}}. + 3. Former proposed {{ targets }} on similar hypothesis. + 4. Some additional information to help you generate new {{ targets }}. 
{% endif %} Please generate the output following the format below: {{ task_output_format }} @@ -126,13 +126,13 @@ task_gen_model: # It is deprecated now, please refer to direct_exp_gen user: |- {% if hypothesis is not none %} The user has made several hypothesis on this scenario and did several evaluation on them. - The target hypothesis you are targeting to generate {{targets}} for is as follows: + The target hypothesis you are targeting to generate {{ targets }} for is as follows: {{ hypothesis }} The former hypothesis and the corresponding feedbacks are as follows: {{ exp_and_feedback_desc }} - Please generate the new {{targets}} based on the information above. + Please generate the new {{ targets }} based on the information above. {% else %} - Please generate the new {{targets}} task. + Please generate the new {{ targets }} task. {% endif %} direct_exp_gen: @@ -141,7 +141,7 @@ direct_exp_gen: The component to focus on for the next hypothesis is already determined as: {{ component }}. It will be used in the following scenario: - {{scenario}} + {{ scenario }} # Hypothesis Proposal @@ -158,18 +158,18 @@ direct_exp_gen: # Task Design - The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step. + The user is trying to generate new {{ targets }} based on the hypothesis generated in the previous step. - The scope of the {{targets}} can be described by a interface specification as follows + The scope of the {{ targets }} can be described by a interface specification as follows ```Python {{task_specification}} ``` - The user will use the {{targets}} generated to do some experiments. The user will provide this information to you: - 1. The target hypothesis you are targeting to generate {{targets}} for. + The user will use the {{ targets }} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{ targets }} for. 2. The hypothesis generated in the previous steps and their corresponding feedbacks. - 3. Former proposed {{targets}} on similar hypothesis. - 4. Some additional information to help you generate new {{targets}}. + 3. Former proposed {{ targets }} on similar hypothesis. + 4. Some additional information to help you generate new {{ targets }}. [Partial Response Format 2] Your generated output should contain key-value pairs adhering to the following format and specifications: {{ task_output_format }} @@ -226,14 +226,16 @@ component_gen: Here is the description of the competition scenario ``` - {{scenario}} + {{ scenario }} ``` # Here is the current best version of implementation. {{sota_exp_desc}} + {% if last_exp_diff %} # Here is the latest version of implementation different from the sota_exp_desc - {{last_exp_diff}} + {{ last_exp_diff }} + {% endif %} You will be provided the feedback for the latest implementation. 
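A quick standalone check of the `{% if last_exp_diff %}` guard introduced above (assumes jinja2 is installed; the snippet is illustrative and not part of the patch itself):

```python
from jinja2 import Environment, StrictUndefined

tpl = Environment(undefined=StrictUndefined).from_string(
    "{% if last_exp_diff %}"
    "# Here is the latest version of implementation different from the sota_exp_desc\n"
    "{{ last_exp_diff }}\n"
    "{% endif %}"
)

print(repr(tpl.render(last_exp_diff="")))   # empty diff -> block is skipped entirely
print(tpl.render(last_exp_diff="+ x = 1"))  # non-empty diff -> block is rendered
```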
@@ -243,7 +245,7 @@ component_gen: {{ component_output_format }} user: |- - {{exp_and_feedback_desc}} + {{ exp_and_feedback_desc }} exp_and_feedback: |- From ba7db06fd662634e9fcc2c6676bc9e6def04ecb3 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 16 Jan 2025 08:51:11 +0000 Subject: [PATCH 291/304] fix some bugs --- .../data_science/experiment/experiment.py | 3 +- .../data_science/proposal/exp_gen.py | 49 ++----------------- rdagent/utils/repo/diff.py | 1 - 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 51ec6c30f..9f48154f9 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -1,7 +1,8 @@ import re -import pandas as pd from typing import Literal +import pandas as pd + from rdagent.core.experiment import Experiment, FBWorkspace, Task COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 533540089..1816e907c 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -52,8 +52,9 @@ def __str__(self) -> str: Concise Justification: {self.concise_justification} Concise Knowledge: {self.concise_knowledge} """ + + COMPONENT_TASK_MAPPING = { - # TODO: merge the tow names, DataLoadSpec, data_loader, make all the code easier "DataLoadSpec": { "target_name": "Data loader and specification generation", "spec_file": "spec/data_loader.md", @@ -89,9 +90,10 @@ def __str__(self) -> str: "spec_file": "spec/workflow.md", "task_output_format": T(".prompts:output_format.workflow").r(), "task_class": WorkflowTask, - } + }, } + class DSTrace(Trace[DataScienceScen, KnowledgeBase]): def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = None) -> None: self.scen: DataScienceScen = scen @@ -305,48 +307,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: # - after we know the selected component, we can use RAG. # Step 2: Generate the rest of the hypothesis & task - # TODO: Currently, all components are rendered each time. This can be optimized later. 
- component_task_mapping = { - # TODO: merge the tow names, DataLoadSpec, data_loader, make all the code easier - "DataLoadSpec": { - "target_name": "Data loader and specification generation", - "spec_file": "spec/data_loader.md", - "task_output_format": T(".prompts:output_format.data_loader").r(), - "task_class": DataLoaderTask, - }, - "FeatureEng": { - "target_name": "Feature engineering", - "spec_file": "spec/feature.md", - "task_output_format": T(".prompts:output_format.feature").r(), - "task_class": FeatureTask, - }, - "Model": { - "target_name": "Building model", - "spec_file": "spec/model.md", - "task_output_format": T(".prompts:output_format.model").r(), - "task_class": ModelTask, - "extra_params": { - "model_type": "Model type not provided", - "architecture": "Model architecture not provided", - "hyperparameters": "Model hyperparameters not provided", - }, - "extra_requirement": T(".prompts:extra_requirement.model").r(), - }, - "Ensemble": { - "target_name": "Ensemble", - "spec_file": "spec/ensemble.md", - "task_output_format": T(".prompts:output_format.ensemble").r(), - "task_class": EnsembleTask, - }, - "Workflow": { - "target_name": "Workflow", - "spec_file": "spec/workflow.md", - "task_output_format": T(".prompts:output_format.workflow").r(), - "task_class": WorkflowTask, - }, - } - - component_info = component_task_mapping.get(component) + component_info = COMPONENT_TASK_MAPPING.get(component) if component_info: system_prompt = T(".prompts:direct_exp_gen.system").r( diff --git a/rdagent/utils/repo/diff.py b/rdagent/utils/repo/diff.py index b68fd3bca..d3acf2e0d 100644 --- a/rdagent/utils/repo/diff.py +++ b/rdagent/utils/repo/diff.py @@ -53,4 +53,3 @@ def generate_diff(dir1: str, dir2: str) -> List[str]: diff_files.extend(diff) return diff_files - From 7a6669f25e7236c501d1d9d7100dc784304d4bc5 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 Jan 2025 09:46:29 +0000 Subject: [PATCH 292/304] remove workflow from model --- .../coder/data_science/model/__init__.py | 3 +-- .../coder/data_science/model/eval.py | 19 ------------------- .../coder/data_science/model/prompts.yaml | 9 --------- 3 files changed, 1 insertion(+), 30 deletions(-) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index b8c335dc9..93113bb95 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -67,7 +67,6 @@ def implement_one_task( task_desc=model_information_str, data_loader_code=workspace.file_dict.get("load_data.py"), feature_code=workspace.file_dict["feature.py"], - workflow_code=workspace.file_dict.get("main.py"), queried_similar_successful_knowledge=queried_similar_successful_knowledge, queried_former_failed_knowledge=queried_former_failed_knowledge[0], out_spec=BatchEditOut.get_spec(), @@ -99,7 +98,7 @@ def implement_one_task( batch_edit = { ( f"{target_task.name}.py" - if value != "__DEL__" and key != f"{target_task.name}.py" and key.startswith("model") + if value != "__DEL__" and key != f"{target_task.name}.py" else key ): value for key, value in batch_edit.items() diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py index 86b09342b..9331e001b 100644 --- a/rdagent/components/coder/data_science/model/eval.py +++ b/rdagent/components/coder/data_science/model/eval.py @@ -7,8 +7,6 @@ import re from pathlib import Path -import pandas as pd - from rdagent.app.data_science.conf import 
DS_RD_SETTING from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, @@ -74,23 +72,6 @@ def evaluate( raise CoderError( "The execution output contains too many progress bars and results in the LLM's token size exceeding the limit." ) - fname = "main.py" - if "Model code test passed successfully." in stdout and implementation.file_dict.get(fname): - stdout = implementation.execute(env=de, entry=f"python {fname}") - - # Check score file - score_fp = implementation.workspace_path / "scores.csv" - if not score_fp.exists(): - stdout += "\nMetrics file (scores.csv) is not generated." - else: - score_df = pd.read_csv(score_fp, index_col=0) - model_set_in_scores = set(score_df.index) - model_set_in_folder = set( - f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f) - ) - for model in model_set_in_folder: - if model not in model_set_in_scores: - stdout += f"\nModel {model} is not evaluated in the scores.csv. The scores.csv has {model_set_in_scores}." system_prompt = T(".prompts:model_eval.system").r( task_desc=target_task.get_task_information(), diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml index ec6cecda2..ab51831aa 100644 --- a/rdagent/components/coder/data_science/model/prompts.yaml +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -13,11 +13,6 @@ model_coder: {{data_loader_code}} --------- Feature Engineering Code: --------- {{feature_code}} - - {% if workflow_code %} - --------- Workflow Code (main.py): --------- - {{workflow_code}} - {% endif %} Instructions for Code Generation: Leveraging User Inputs: @@ -38,10 +33,6 @@ model_coder: {% if out_spec %} {{out_spec}} The file name should be the model name described in the model task in the format "{task_name}.py". You should always follow this name format. - {% if workflow_code %} - If the workflow code is provided, you should also consider modify the workflow code in "main.py" if the model implementation is changed. - Please make sure all model files are used in the workflow code in "main.py". - {% endif %} {% else %} Formatting Your Response: Return only the code in a JSON format as shown below. Do not include any explanations or extra text. 
Example: From f0059383e57630980ffd126aa66ca6a162c08fb1 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 16 Jan 2025 10:18:46 +0000 Subject: [PATCH 293/304] add pending_tasks_list in data science to enable coding model and workflow --- rdagent/app/data_science/loop.py | 29 +++++++------ .../scenarios/data_science/dev/feedback.py | 2 +- .../scenarios/data_science/dev/prompts.yaml | 2 +- .../data_science/experiment/experiment.py | 5 ++- .../data_science/proposal/exp_gen.py | 42 ++++++++++++------- .../data_science/proposal/prompts.yaml | 14 ++++--- rdagent/scenarios/data_science/share.yaml | 2 +- 7 files changed, 59 insertions(+), 37 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 73b97062c..16658359b 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -63,19 +63,22 @@ def direct_exp_gen(self, prev_out: dict[str, Any]): return exp def coding(self, prev_out: dict[str, Any]): - exp: DSExperiment = prev_out["direct_exp_gen"] - if exp.hypothesis.component == "DataLoadSpec": - exp = self.data_loader_coder.develop(exp) - elif exp.hypothesis.component == "FeatureEng": - exp = self.feature_coder.develop(exp) - elif exp.hypothesis.component == "Model": - exp = self.model_coder.develop(exp) - elif exp.hypothesis.component == "Ensemble": - exp = self.ensemble_coder.develop(exp) - elif exp.hypothesis.component == "Workflow": - exp = self.workflow_coder.develop(exp) - else: - raise NotImplementedError(f"Unsupported component in DataScienceRDLoop: {exp.hypothesis.component}") + exp = prev_out["direct_exp_gen"] + for tasks in exp.pending_tasks_list: + exp.sub_tasks = tasks + if exp.hypothesis.component == "DataLoadSpec": + exp = self.data_loader_coder.develop(exp) + elif exp.hypothesis.component == "FeatureEng": + exp = self.feature_coder.develop(exp) + elif exp.hypothesis.component == "Model": + exp = self.model_coder.develop(exp) + elif exp.hypothesis.component == "Ensemble": + exp = self.ensemble_coder.develop(exp) + elif exp.hypothesis.component == "Workflow": + exp = self.workflow_coder.develop(exp) + else: + raise NotImplementedError(f"Unsupported component in DataScienceRDLoop: {exp.hypothesis.component}") + exp.sub_tasks = [] return exp diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py index d92145d4a..7e794eb56 100644 --- a/rdagent/scenarios/data_science/dev/feedback.py +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -20,7 +20,7 @@ class DSExperiment2Feedback(Experiment2Feedback): def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeedback: # 用哪些信息来生成feedback - # 1. sub_tasks[0] 任务的描述 + # 1. pending_tasks_list[0][0] 任务的描述 # 2. hypothesis 任务的假设 # 3. 相对sota_exp的改动 # 4. 
result 任务的结果 diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml index 59c2b0885..bbc9ac2f6 100644 --- a/rdagent/scenarios/data_science/dev/prompts.yaml +++ b/rdagent/scenarios/data_science/dev/prompts.yaml @@ -30,7 +30,7 @@ exp_feedback: Current solution to be evaluated: ### Task of Current solution - {{cur_exp.sub_tasks[0].get_task_information()}} + {{cur_exp.pending_tasks_list[0][0].get_task_information()}} {% if cur_exp.hypothesis %} the experiment is designed based on hypothesis: {{ cur_exp.hypothesis }} diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 9f48154f9..b468970e8 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -9,9 +9,10 @@ class DSExperiment(Experiment[Task, FBWorkspace, FBWorkspace]): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, pending_tasks_list: list, *args, **kwargs) -> None: + super().__init__(sub_tasks=[], *args, **kwargs) self.experiment_workspace = FBWorkspace() + self.pending_tasks_list = pending_tasks_list def next_component_required(self) -> COMPONENT | None: files = list(self.experiment_workspace.file_dict.keys()) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py index 1816e907c..59d9c4d61 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -205,8 +205,8 @@ def _handle_missing_component( exp_and_feedback and exp_and_feedback[1].exception is not None and ( - exp_and_feedback[0].sub_tasks[0].name == component - or exp_and_feedback[0].sub_tasks[0].name.startswith("model_") + exp_and_feedback[0].pending_tasks_list[0][0].name == component + or exp_and_feedback[0].pending_tasks_list[0][0].name.startswith("model_") and component == "Model" ) ): # Assumption: when completing missing component, using component name as task name @@ -220,7 +220,7 @@ def _handle_missing_component( **resp_dict.get("extra_params", {}), ) - exp = DSExperiment(sub_tasks=[task], hypothesis=DSHypothesis(component)) + exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=DSHypothesis(component)) if last_successful_exp: exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) return exp @@ -318,6 +318,7 @@ def gen(self, trace: DSTrace) -> DSExperiment: task_specification=sota_exp.experiment_workspace.file_dict[component_info["spec_file"]], task_output_format=component_info["task_output_format"], extra_requirement=component_info.get("extra_requirement"), + workflow_check=(not component == "Workflow"), ) recent_trace_desc = [] @@ -342,28 +343,41 @@ def gen(self, trace: DSTrace) -> DSExperiment: user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True ) ) - + assert "hypothesis_proposal" in resp_dict, "Hypothesis proposal not provided." + assert "task_design" in resp_dict, "Task design not provided." 
task_class = component_info["task_class"] - task_name = resp_dict["model_name"] if component == "Model" else component - description = resp_dict.get("description", f"{component_info['target_name']} description not provided") + hypothesis_proposal = resp_dict.get("hypothesis_proposal", {}) hypothesis = DSHypothesis( component=component, - hypothesis=resp_dict.get("hypothesis", ""), - reason=resp_dict.get("reason", ""), - concise_reason=resp_dict.get("concise_reason", ""), - concise_observation=resp_dict.get("concise_observation", ""), - concise_justification=resp_dict.get("concise_justification", ""), - concise_knowledge=resp_dict.get("concise_knowledge", ""), + hypothesis=hypothesis_proposal.get("hypothesis", ""), + reason=hypothesis_proposal.get("reason", ""), + concise_reason=hypothesis_proposal.get("concise_reason", ""), + concise_observation=hypothesis_proposal.get("concise_observation", ""), + concise_justification=hypothesis_proposal.get("concise_justification", ""), + concise_knowledge=hypothesis_proposal.get("concise_knowledge", ""), ) + task_design = resp_dict.get("task_design", {}) + task_name = task_design["model_name"] if component == "Model" else component + description = task_design.get( + "description", f"{component_info['target_name']} description not provided" + ) task = task_class( name=task_name, description=description, - **{k: resp_dict.get(k, v) for k, v in component_info.get("extra_params", {}).items()}, + **{k: task_design.get(k, v) for k, v in component_info.get("extra_params", {}).items()}, ) - exp = DSExperiment(sub_tasks=[task], hypothesis=hypothesis) + exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=hypothesis) exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + + new_workflow_desc = resp_dict.get("workflow_update", "No update needed") + if new_workflow_desc != "No update needed": + workflow_task = WorkflowTask( + name="Workflow", + description=new_workflow_desc, + ) + exp.pending_tasks_list.append([workflow_task]) return exp else: raise ValueError(f"Unknown component: {component}") diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index 72719839a..beb186e02 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -174,19 +174,23 @@ direct_exp_gen: [Partial Response Format 2] Your generated output should contain key-value pairs adhering to the following format and specifications: {{ task_output_format }} + {% if workflow_check %} + # Workflow update + Since components have dependencies, the workflow should be updated to reflect the changes made to the target component. Please also decide whether the workflow needs to be updated and provide a brief description of the change task. + [Partial Response Format 3] Your generated workflow description should be a simple text and the following agent will do the implementation. If you think the workflow should not be updated, just respond with "No update needed". + {% endif %} + Your response should contain two parts: the hypothesis proposal and the task design. 
Please follow the format and specifications provided below: { "hypothesis_proposal": [Partial Response Format 1], - "task_design": [Partial Response Format 2] + "task_design": [Partial Response Format 2], + {% if workflow_check %}"workflow_update": [Partial Response Format 3], {% endif %} } {% if extra_requirement %} {{extra_requirement}} {% endif %} - # Response Requirement - Your generated output should combine the key-value pairs based on [Partial Response Format 1] and [Partial Response Format 2]. Please output the key-values specified in [Partial Response Format 1] first. - user: |- # The detailed description of current best experiments {{sota_exp_desc}} @@ -251,7 +255,7 @@ component_gen: exp_and_feedback: |- {% for experiment, feedback in trace.hist[-10:] %} ## Experiment {{ loop.index }} - Experiment are focusing on task: {{experiment.sub_tasks[0]}} + Experiment are focusing on task: {{experiment.pending_tasks_list[0][0]}} {% if experiment.hypothesis %} The experiment is design driven by hypothesis : {{ experiment.hypothesis }} Observation on the result with the hypothesis: {{ feedback.observations }} diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml index 36a3e0420..773eec221 100644 --- a/rdagent/scenarios/data_science/share.yaml +++ b/rdagent/scenarios/data_science/share.yaml @@ -32,7 +32,7 @@ describe: # some template to describe some object the experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }} {% endif %} ### Task of previous trial - {{ exp_and_feedback[0].sub_tasks[0].get_task_information() }} + {{ exp_and_feedback[0].pending_tasks_list[0][0].get_task_information() }} feedback decision: {{ exp_and_feedback[1].decision }} reason: {{ exp_and_feedback[1].reason }} {% endif %} From 8516e486349e1b2d78ef7e825d283d9699987d61 Mon Sep 17 00:00:00 2001 From: yuanteli <1957922024@qq.com> Date: Thu, 16 Jan 2025 10:21:09 +0000 Subject: [PATCH 294/304] refine the code for handling JSON-formatted data descriptions --- rdagent/scenarios/data_science/scen/__init__.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py index 0705ff913..9c6d7cbf9 100644 --- a/rdagent/scenarios/data_science/scen/__init__.py +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -95,6 +95,7 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf dirs.sort() if not dirs: for file in files: + print(file) file_path = os.path.join(root, file) file_type = os.path.splitext(file)[1][1:] file_size = os.path.getsize(file_path) @@ -108,7 +109,7 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf # In deeper levels, follow the max_files restriction if is_top_level and file_type in ["csv", "md"]: files_details[file_type].append((file, file_size, file_path)) - elif not is_top_level and len(files_details[file_type]) < max_files: + elif len(files_details[file_type]) < max_files: files_details[file_type].append((file, file_size, file_path)) break @@ -169,7 +170,7 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf # In deeper levels, follow the max_files restriction if is_top_level and file_type in ["csv", "md"]: files_details[file_type].append((file, file_size, file_path)) - elif not is_top_level and len(files_details[file_type]) < max_files: + elif not is_top_level and len(files_details[file_type]) <= max_files: 
files_details[file_type].append((file, file_size, file_path)) break @@ -204,6 +205,16 @@ def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subf for tag, value in img.tag_v2.items(): tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}") result.append(" " * (indent + 4) + f"{tag_name}: {value}") + if file_type == "json": + result.append(" " * (indent + 2) + f"- Content of {file}:") + with open(path, "r", encoding="utf-8") as f: + for i, line in enumerate(f): + if i < 2: + result.append( + " " * (indent + 4) + line.strip()[:100] + ("..." if len(line.strip()) > 100 else "") + ) + else: + break return "\n".join(result) + "\n" @@ -324,4 +335,4 @@ def rich_style_description(self) -> str: if __name__ == "__main__": - print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "aerial-cactus-identification")) + print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "stanford-covid-vaccine")) From d2959b017f9a5740071ce547c1cdf6e4beed51d5 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 Jan 2025 10:51:04 +0000 Subject: [PATCH 295/304] assert with information --- .../data_science/ensemble/eval_tests/ensemble_test.txt | 8 ++++++-- .../data_science/feature/eval_tests/feature_test.txt | 8 ++++++-- .../raw_data_loader/eval_tests/data_loader_test.txt | 8 ++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt index cfd4842f0..5a716731d 100644 --- a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt @@ -50,9 +50,13 @@ final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) # Check shape if isinstance(final_pred, list): - assert len(final_pred) == len(test_X), "Wrong output sample size" + assert len(final_pred) == len(test_X), ( + f"Wrong output sample size: len(final_pred) ({len(final_pred)}) and len(test_X) ({len(test_X)})" + ) else: - assert final_pred.shape[0] == test_X.shape[0], "Wrong output sample size" + assert final_pred.shape[0] == test_X.shape[0], ( + f"Wrong output sample size: final_pred.shape[0] ({final_pred.shape[0]}) and test_X.shape[0] ({test_X.shape[0]})" + ) # check if scores.csv is generated assert Path("scores.csv").exists(), "scores.csv is not generated" diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt index 9a447b39d..a731c0409 100644 --- a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt @@ -25,7 +25,11 @@ X, y, X_test = feat_eng(X, y, X_test) def get_length(data): return len(data) if isinstance(data, list) else data.shape[0] -assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" -assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" +assert get_length(X_test) == get_length(test_ids), ( + f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})" +) +assert get_length(X) == get_length(y), ( + f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})" +) print("Feature Engineering test passed successfully. 
Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt index 670b0cd9b..0da4e61fe 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt @@ -15,7 +15,11 @@ X, y, X_test, test_ids = load_data() def get_length(data): return len(data) if isinstance(data, list) else data.shape[0] -assert get_length(X_test) == get_length(test_ids), "Mismatch in length of test images and test IDs" -assert get_length(X) == get_length(y), "Mismatch in length of training images and labels" +assert get_length(X_test) == get_length(test_ids), ( + f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})" +) +assert get_length(X) == get_length(y), ( + f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})" +) print("Data loader test passed successfully. Length of test images matches length of test IDs.") From ea2ff15fda9585d274a5409741e546d486670404 Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 Jan 2025 11:01:00 +0000 Subject: [PATCH 296/304] ensure correct csv file name --- .../components/coder/data_science/raw_data_loader/prompts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index e742fe61e..0d74a2e2b 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -196,7 +196,7 @@ spec: - Ensure all predictions in `test_preds_dict` and `val_preds_dict` have consistent shapes and dimensions. - Verify that `val_label` is provided and matches the length of `val_preds_dict` predictions. - Handle empty or invalid inputs gracefully with appropriate error messages. 
- - You should calculate the metric for each model and ensemble strategy, and save the results in a CSV file, e.g.: + - You should calculate the metric for each model and ensemble strategy, and save the results in `scores.csv`, e.g.: ```python scores = {} for model_name, val_pred in val_preds_dict.items(): From 2741121b9c342c789fd54c0f361d7f8add4d902e Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 17 Jan 2025 08:42:52 +0000 Subject: [PATCH 297/304] add logging to help record the output --- rdagent/app/data_science/loop.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 16658359b..99e916fa3 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -59,7 +59,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): def direct_exp_gen(self, prev_out: dict[str, Any]): exp = self.exp_gen.gen(self.trace) - logger.log_object(exp, tag="debug_exp_gen") + logger.log_object(exp, tag="direct_exp_gen") return exp def coding(self, prev_out: dict[str, Any]): @@ -79,13 +79,15 @@ def coding(self, prev_out: dict[str, Any]): else: raise NotImplementedError(f"Unsupported component in DataScienceRDLoop: {exp.hypothesis.component}") exp.sub_tasks = [] - + logger.log_object(exp, tag="coding") return exp def running(self, prev_out: dict[str, Any]): exp: DSExperiment = prev_out["coding"] if exp.next_component_required() is None: - return self.runner.develop(exp) + new_exp = self.runner.run(exp) + logger.log_object(new_exp, tag="running") + return new_exp else: return exp @@ -98,6 +100,7 @@ def feedback(self, prev_out: dict[str, Any]) -> ExperimentFeedback: reason=f"{exp.hypothesis.component} is completed.", decision=True, ) + logger.log_object(feedback, tag="feedback") return feedback def record(self, prev_out: dict[str, Any]): @@ -111,6 +114,7 @@ def record(self, prev_out: dict[str, Any]): ExperimentFeedback.from_exception(e), ) ) + logger.log_object(self.trace, tag="trace") logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment") From b816570eec61a06b4b144b522760432788dd3b14 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 17 Jan 2025 08:48:57 +0000 Subject: [PATCH 298/304] log competition --- rdagent/app/data_science/loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 99e916fa3..2980e9e02 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -27,6 +27,7 @@ class DataScienceRDLoop(RDLoop): skip_loop_error = (CoderError, RunnerError) def __init__(self, PROP_SETTING: BasePropSetting): + logger.log_object(PROP_SETTING.competition, tag="competition") scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) ### shared components in the workflow # TODO: check if From e572aa1252baabaf74c9c89606c3a88563f31503 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 17 Jan 2025 09:07:44 +0000 Subject: [PATCH 299/304] add log tag for debug llm app --- rdagent/app/data_science/loop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 2980e9e02..30a33de29 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -61,6 +61,9 @@ def __init__(self, PROP_SETTING: BasePropSetting): def direct_exp_gen(self, prev_out: dict[str, Any]): exp = self.exp_gen.gen(self.trace) logger.log_object(exp, tag="direct_exp_gen") + + # FIXME: this is for LLM debug 
webapp, remove this when the debugging is done. + logger.log_object(exp, tag="debug_exp_gen") return exp def coding(self, prev_out: dict[str, Any]): From ae0ec76067b1b23d7f4d3bd087fd412fed79a8ff Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:40:00 +0800 Subject: [PATCH 300/304] test: Test ds refactor ll (#523) * fix bugs to former scenario * fix a bug because coding in rdloop changed * fix the bug when feedback gets no hypothesis * fix trace structure * change all trace hist when merging hypothesis to experiments * ignore some error in ruff * fix kaggle scenario bugs * refine one line * another bug * another small bug * fix ui bugs * chage kaggle train.py path --------- Co-authored-by: Xu Yang --- pyproject.toml | 3 +- rdagent/app/kaggle/loop.py | 4 +- .../app/qlib_rd_loop/factor_from_report.py | 11 ++- .../coder/factor_coder/eva_utils.py | 2 +- rdagent/components/workflow/rd_loop.py | 22 +++-- rdagent/core/evolving_agent.py | 86 +++++++++---------- rdagent/log/storage.py | 2 + rdagent/log/ui/app.py | 2 +- .../data_mining/proposal/model_proposal.py | 4 +- .../data_science/proposal/prompts.yaml | 9 ++ .../scenarios/kaggle/developer/feedback.py | 4 +- .../scenarios/kaggle/experiment/scenario.py | 2 +- rdagent/scenarios/kaggle/prompts.yaml | 4 +- rdagent/scenarios/kaggle/proposal/proposal.py | 12 +-- rdagent/scenarios/qlib/prompts.yaml | 4 +- .../qlib/proposal/factor_proposal.py | 4 +- .../scenarios/qlib/proposal/model_proposal.py | 4 +- rdagent/utils/env.py | 11 ++- rdagent/utils/workflow.py | 46 +++++----- 19 files changed, 134 insertions(+), 102 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3f7300489..9a8aa5375 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,6 @@ src = ["rdagent"] [tool.ruff.lint] ignore = [ # https://docs.astral.sh/ruff/rules/#pydocstyle-d - "ANN101", "ANN401", "D", "ERA001", @@ -92,7 +91,7 @@ ignore = [ "S101", "S301", "T20", - "TCH003", + "TC003", "TD", ] select = ["ALL"] diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index 8325938b0..9f4cae811 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -7,7 +7,7 @@ from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import RDLoop from rdagent.core.developer import Developer -from rdagent.core.exception import FactorEmptyError, ModelEmptyError +from rdagent.core.exception import CoderError, FactorEmptyError, ModelEmptyError from rdagent.core.proposal import ( Experiment2Feedback, Hypothesis2Experiment, @@ -115,7 +115,7 @@ def running(self, prev_out: dict[str, Any]): return exp - skip_loop_error = (ModelEmptyError, FactorEmptyError) + skip_loop_error = (ModelEmptyError, FactorEmptyError, CoderError) def main(path=None, step_n=None, competition=None): diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py index d98e0b13a..b89bb88af 100644 --- a/rdagent/app/qlib_rd_loop/factor_from_report.py +++ b/rdagent/app/qlib_rd_loop/factor_from_report.py @@ -97,6 +97,7 @@ def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[Qlib report_content = "\n".join(docs_dict.values()) hypothesis = generate_hypothesis(factor_result, report_content) + exp.hypothesis = hypothesis return exp, hypothesis @@ -128,7 +129,9 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]): if exp is None: continue self.valid_pdf_file_count += 1 - exp.based_experiments = 
[QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]] + exp.based_experiments = [QlibFactorExperiment(sub_tasks=[], hypothesis=hypothesis)] + [ + t[0] for t in self.trace.hist if t[1] + ] exp.sub_workspace_list = exp.sub_workspace_list[: FACTOR_FROM_REPORT_PROP_SETTING.max_factors_per_exp] exp.sub_tasks = exp.sub_tasks[: FACTOR_FROM_REPORT_PROP_SETTING.max_factors_per_exp] logger.log_object(hypothesis, tag="hypothesis generation") @@ -143,6 +146,12 @@ def propose(self, prev_out: dict[str, Any]): def exp_gen(self, prev_out: dict[str, Any]): return self.current_loop_exp + def coding(self, prev_out: dict[str, Any]): + with logger.tag("d"): # develop + exp = self.coder.develop(prev_out["exp_gen"]) + logger.log_object(exp.sub_workspace_list, tag="coder result") + return exp + def main(report_folder=None, path=None, step_n=None): """ diff --git a/rdagent/components/coder/factor_coder/eva_utils.py b/rdagent/components/coder/factor_coder/eva_utils.py index 48d7fe4f7..40f85bc7a 100644 --- a/rdagent/components/coder/factor_coder/eva_utils.py +++ b/rdagent/components/coder/factor_coder/eva_utils.py @@ -79,7 +79,7 @@ def evaluate( **kwargs, ): factor_information = target_task.get_task_information() - code = implementation.code + code = implementation.all_codes system_prompt = ( Environment(undefined=StrictUndefined) diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py index d1b3dcb9f..11ef2ba4b 100644 --- a/rdagent/components/workflow/rd_loop.py +++ b/rdagent/components/workflow/rd_loop.py @@ -11,6 +11,7 @@ Experiment2Feedback, Hypothesis, Hypothesis2Experiment, + HypothesisFeedback, HypothesisGen, Trace, ) @@ -74,9 +75,18 @@ def running(self, prev_out: dict[str, Any]): return exp def feedback(self, prev_out: dict[str, Any]): - feedback = self.summarizer.generate_feedback( - prev_out["running"], prev_out["direct_exp_gen"]["propose"], self.trace - ) - with logger.tag("ef"): # evaluate and feedback - logger.log_object(feedback, tag="feedback") - self.trace.hist.append((prev_out["direct_exp_gen"]["propose"], prev_out["running"], feedback)) + e = prev_out.get(self.EXCEPTION_KEY, None) + if e is not None: + feedback = HypothesisFeedback( + observations="Error occurred in loop, skip this loop", + hypothesis_evaluation="", + new_hypothesis="", + reason="", + decision=False, + ) + self.trace.hist.append((prev_out["direct_exp_gen"]["exp_gen"], feedback)) + else: + feedback = self.summarizer.generate_feedback(prev_out["running"], self.trace) + with logger.tag("ef"): # evaluate and feedback + logger.log_object(feedback, tag="feedback") + self.trace.hist.append((prev_out["running"], feedback)) diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index 4d7e8e5c4..d2c36a4e0 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -59,50 +59,50 @@ def multistep_evolve( filter_final_evo: bool = False, ) -> EvolvableSubjects: for evo_loop_id in tqdm(range(self.max_loop), "Implementing"): - with logger.tag(f"evo_loop_{evo_loop_id}"): - # 1. knowledge self-evolving - if self.knowledge_self_gen and self.rag is not None: - self.rag.generate_knowledge(self.evolving_trace) - # 2. RAG - queried_knowledge = None - if self.with_knowledge and self.rag is not None: - # TODO: Putting the evolving trace in here doesn't actually work - queried_knowledge = self.rag.query(evo, self.evolving_trace) - - # 3. 
evolve - evo = self.evolving_strategy.evolve( - evo=evo, - evolving_trace=self.evolving_trace, - queried_knowledge=queried_knowledge, + # with logger.tag(f"evo_loop_{evo_loop_id}"): + # 1. knowledge self-evolving + if self.knowledge_self_gen and self.rag is not None: + self.rag.generate_knowledge(self.evolving_trace) + # 2. RAG + queried_knowledge = None + if self.with_knowledge and self.rag is not None: + # TODO: Putting the evolving trace in here doesn't actually work + queried_knowledge = self.rag.query(evo, self.evolving_trace) + + # 3. evolve + evo = self.evolving_strategy.evolve( + evo=evo, + evolving_trace=self.evolving_trace, + queried_knowledge=queried_knowledge, + ) + # TODO: Due to design issues, we have chosen to ignore this mypy error. + logger.log_object(evo.sub_workspace_list, tag="evolving code") # type: ignore[attr-defined] + for sw in evo.sub_workspace_list: # type: ignore[attr-defined] + logger.info(f"evolving code workspace: {sw}") + + # 4. Pack evolve results + es = EvoStep(evo, queried_knowledge) + + # 5. Evaluation + if self.with_feedback: + es.feedback = ( + # TODO: Due to the irregular design of rdagent.core.evaluation.Evaluator, + # it fails mypy's test here, so we'll ignore this error for now. + eva + if isinstance(eva, Feedback) + else eva.evaluate(evo, queried_knowledge=queried_knowledge) # type: ignore[arg-type, call-arg] ) - # TODO: Due to design issues, we have chosen to ignore this mypy error. - logger.log_object(evo.sub_workspace_list, tag="evolving code") # type: ignore[attr-defined] - for sw in evo.sub_workspace_list: # type: ignore[attr-defined] - logger.info(f"evolving code workspace: {sw}") - - # 4. Pack evolve results - es = EvoStep(evo, queried_knowledge) - - # 5. Evaluation - if self.with_feedback: - es.feedback = ( - # TODO: Due to the irregular design of rdagent.core.evaluation.Evaluator, - # it fails mypy's test here, so we'll ignore this error for now. - eva - if isinstance(eva, Feedback) - else eva.evaluate(evo, queried_knowledge=queried_knowledge) # type: ignore[arg-type, call-arg] - ) - logger.log_object(es.feedback, tag="evolving feedback") - - # 6. update trace - self.evolving_trace.append(es) - - # 7. check if all tasks are completed - if self.with_feedback: - all_completed = all(es.feedback) if isinstance(es.feedback, list) else es.feedback - if all_completed: - logger.info("All tasks in evolving subject have been completed.") - break + logger.log_object(es.feedback, tag="evolving feedback") + + # 6. update trace + self.evolving_trace.append(es) + + # 7. 
check if all tasks are completed + if self.with_feedback: + all_completed = all(es.feedback) if isinstance(es.feedback, list) else es.feedback + if all_completed: + logger.info("All tasks in evolving subject have been completed.") + break if self.with_feedback and filter_final_evo: evo = self.filter_evolvable_subjects_by_feedback(evo, self.evolving_trace[-1].feedback) diff --git a/rdagent/log/storage.py b/rdagent/log/storage.py index 0411a5bad..acf87606b 100644 --- a/rdagent/log/storage.py +++ b/rdagent/log/storage.py @@ -100,6 +100,8 @@ def iter_msg(self, watch: bool = False) -> Generator[Message, None, None]: msg_l.append(m) for file in self.path.glob("**/*.pkl"): + if file.name == "debug_llm.pkl": + continue tag = ".".join(file.relative_to(self.path).as_posix().replace("/", ".").split(".")[:-3]) pid = file.parent.name diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py index f760fbb0e..04f49ed24 100644 --- a/rdagent/log/ui/app.py +++ b/rdagent/log/ui/app.py @@ -357,7 +357,7 @@ def hypothesis_hover_text(h: Hypothesis, d: bool = False): hover_texts = [ hypothesis_hover_text(state.hypotheses[int(i[6:])], state.h_decisions[int(i[6:])]) for i in df.index - if i != "alpha158" + if i != "alpha158" and i != "Baseline" ] if state.alpha158_metrics is not None: hover_texts = ["Baseline: alpha158"] + hover_texts diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py index 547978edc..a2a6354d2 100644 --- a/rdagent/scenarios/data_mining/proposal/model_proposal.py +++ b/rdagent/scenarios/data_mining/proposal/model_proposal.py @@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b else "No previous hypothesis and feedback available since it's the first round." ) - experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] + experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist] model_list = [] for experiment in experiment_list: @@ -117,5 +117,5 @@ def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) ) ) exp = DMModelExperiment(tasks, hypothesis=hypothesis) - exp.based_experiments = [t[1] for t in trace.hist if t[2]] + exp.based_experiments = [t[0] for t in trace.hist if t[1]] return exp diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml index beb186e02..03989e6d9 100644 --- a/rdagent/scenarios/data_science/proposal/prompts.yaml +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -59,6 +59,15 @@ hypothesis_model: # It is deprecated now, please refer to direct_exp_gen Please generate the output using the following format and specifications: {{ hypothesis_output_format }} +hypothesis_and_feedback: |- + {% for experiment, feedback in hist %} + Hypothesis {{ loop.index }} + The experiment is design driven by hypothesis : {{ experiment.hypothesis }} + Observation on the result with the hypothesis: {{ feedback.observations }} + Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} + Did changing to this hypothesis work? 
(focus on the change): {{ feedback.decision }} + {% endfor %} + task_gen: # It is deprecated now, please refer to direct_exp_gen system: |- {% if hypothesis is not none %} diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index d97ff9820..fe4effa0a 100644 --- a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -111,7 +111,7 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback ] else: current_sub_exps_to_code = { - sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list + sub_ws.target_task.get_task_information(): sub_ws.all_codes for sub_ws in exp.sub_workspace_list } current_sub_exps_to_code_str = json.dumps(current_sub_exps_to_code, indent=2) current_result = exp.result @@ -119,7 +119,7 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback last_hypothesis_and_feedback = None if trace.hist and len(trace.hist) > 0: - last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) + last_hypothesis_and_feedback = (trace.hist[-1][0].hypothesis, trace.hist[-1][1]) # Prepare render dictionary render_dict = { diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index b6ef2f6bb..2816d78af 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -125,7 +125,7 @@ def background(self) -> str: background_template = prompt_dict["kg_background"] train_script = ( - Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py" + Path(__file__).parent / "templates" / KAGGLE_IMPLEMENT_SETTING.competition / "train.py" ).read_text() background_prompt = ( diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index 79ee84b31..b7a7e489c 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -25,8 +25,8 @@ KG_hypothesis_gen_RAG: |- {% endif %} hypothesis_and_feedback: |- - {% for hypothesis, experiment, feedback in trace.hist[-10:] %} - Hypothesis {{ loop.index }}: {{ hypothesis }} + {% for experiment, feedback in trace.hist[-10:] %} + Hypothesis {{ loop.index }}: {{ experiment.hypothesis }} Observation on the result with the hypothesis: {{ feedback.observations }} Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} Did changing to this hypothesis work? (focus on the change): {{ feedback.decision }} diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index d1072d6dc..63636ee5c 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -276,11 +276,11 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: hypothesis_specification = f"Hypothesis should avoid being too general and vague, and should be specific and actionable. For example, hypothesis like 'tune a model' is too general, while hypothesis like 'increase the learning rate to 0.1 of the lightgbm model will improve the performance' is specific and actionable." 
if len(trace.hist) > 0: - sota_features = str(trace.hist[-1][1].based_experiments[-1].experiment_workspace.data_description) + sota_features = str(trace.hist[-1][0].based_experiments[-1].experiment_workspace.data_description) sota_models = json.dumps( - trace.hist[-1][1].based_experiments[-1].experiment_workspace.model_description, indent=2 + trace.hist[-1][0].based_experiments[-1].experiment_workspace.model_description, indent=2 ) - sota_result = trace.hist[-1][1].based_experiments[-1].result + sota_result = trace.hist[-1][0].based_experiments[-1].result hypothesis_specification += f"\nYour hypothesis should based on current SOTA solution. The user will conduct experiments based on the SOTA solution to test whether your hypothesis is right on this specific ecompetition. \n\nSOTA Features: {sota_features}\n\nSOTA Models: {sota_models}\n\nSOTA Result: {sota_result}" if self.scen.if_action_choosing_based_on_UCB: hypothesis_specification += ( @@ -340,7 +340,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b else "No previous hypothesis and feedback available since it's the first round." ) - experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] + experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist] model_list = [] for experiment in experiment_list: @@ -384,7 +384,7 @@ def convert_feature_experiment(self, response: str, hypothesis: Hypothesis, trac sub_tasks=tasks, based_experiments=( [KGFactorExperiment(sub_tasks=[], source_feature_size=trace.scen.input_shape[-1])] - + [t[1] for t in trace.hist if t[2]] + + [t[0] for t in trace.hist if t[1]] ), hypothesis=hypothesis, ) @@ -400,7 +400,7 @@ def convert_model_experiment(self, response: str, hypothesis: Hypothesis, trace: ) based_experiments = [KGModelExperiment(sub_tasks=[], source_feature_size=trace.scen.input_shape[-1])] + [ - t[1] for t in trace.hist if t[2] + t[0] for t in trace.hist if t[1] ] model_type = response_dict.get("model_type", "Model type not provided") if model_type in KG_MODEL_MAPPING: diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml index 206d4d64e..b1ea9e520 100644 --- a/rdagent/scenarios/qlib/prompts.yaml +++ b/rdagent/scenarios/qlib/prompts.yaml @@ -1,6 +1,6 @@ hypothesis_and_feedback: |- - {% for hypothesis, experiment, feedback in trace.hist[-10:] %} - Hypothesis {{ loop.index }}: {{ hypothesis }} + {% for experiment, feedback in trace.hist[-10:] %} + Hypothesis {{ loop.index }}: {{ experiment.hypothesis }} Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].file_dict.get("model.py")}} Observation on the result with the hypothesis: {{ feedback.observations }} Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py index a1928ccd8..80df3eba0 100644 --- a/rdagent/scenarios/qlib/proposal/factor_proposal.py +++ b/rdagent/scenarios/qlib/proposal/factor_proposal.py @@ -65,7 +65,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict | else "No previous hypothesis and feedback available since it's the first round." 
) - experiment_list: List[FactorExperiment] = [t[1] for t in trace.hist] + experiment_list: List[FactorExperiment] = [t[0] for t in trace.hist] factor_list = [] for experiment in experiment_list: @@ -98,7 +98,7 @@ def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) ) exp = QlibFactorExperiment(tasks, hypothesis=hypothesis) - exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]] + exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[0] for t in trace.hist if t[1]] unique_tasks = [] diff --git a/rdagent/scenarios/qlib/proposal/model_proposal.py b/rdagent/scenarios/qlib/proposal/model_proposal.py index c519fddb3..81ee91ad4 100644 --- a/rdagent/scenarios/qlib/proposal/model_proposal.py +++ b/rdagent/scenarios/qlib/proposal/model_proposal.py @@ -65,7 +65,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b else "No previous hypothesis and feedback available since it's the first round." ) - experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] + experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist] model_list = [] for experiment in experiment_list: @@ -102,5 +102,5 @@ def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) ) ) exp = QlibModelExperiment(tasks, hypothesis=hypothesis) - exp.based_experiments = [t[1] for t in trace.hist if t[2]] + exp.based_experiments = [t[0] for t in trace.hist if t[1]] return exp diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index d02a33319..811aef8ad 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -153,7 +153,7 @@ class QlibDockerConf(DockerConf): image: str = "local_qlib:latest" mount_path: str = "/workspace/qlib_workspace/" default_entry: str = "qrun conf.yaml" - extra_volumes: dict = {Path("~/.qlib/").expanduser().resolve(): "/root/.qlib/"} + extra_volumes: dict = {str(Path("~/.qlib/").expanduser().resolve().absolute()): "/root/.qlib/"} shm_size: str | None = "16g" enable_gpu: bool = True @@ -167,9 +167,12 @@ class DMDockerConf(DockerConf): mount_path: str = "/workspace/dm_workspace/" default_entry: str = "python train.py" extra_volumes: dict = { - Path("~/.rdagent/.data/physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/FIDDLE_mimic3/") - .expanduser() - .resolve(): "/root/.data/" + str( + Path("~/.rdagent/.data/physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/FIDDLE_mimic3/") + .expanduser() + .resolve() + .absolute() + ): "/root/.data/" } shm_size: str | None = "16g" diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 11305916d..e2e83b7a6 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -111,29 +111,29 @@ def run(self, step_n: int | None = None): li, si = self.loop_idx, self.step_idx name = self.steps[si] - with logger.tag(f"Loop_{li}.{name}"): - start = datetime.datetime.now(datetime.timezone.utc) - func = getattr(self, name) - try: - self.loop_prev_out[name] = func(self.loop_prev_out) - # TODO: Fix the error logger.exception(f"Skip loop {li} due to {e}") - except self.skip_loop_error as e: - # FIXME: This does not support previous demo (due to their last step is not for recording) - logger.warning(f"Skip loop {li} due to {e}") - # NOTE: strong assumption! The last step is responsible for recording information - self.step_idx = len(self.steps) - 1 # directly jump to the last step. 
- self.loop_prev_out[self.EXCEPTION_KEY] = e - continue - finally: - # make sure failure steps are displayed correclty - end = datetime.datetime.now(datetime.timezone.utc) - self.loop_trace[li].append(LoopTrace(start, end, step_idx=si)) - - # Update tqdm progress bar directly to step_idx - pbar.n = si + 1 - pbar.set_postfix( - loop_index=li, step_index=si + 1, step_name=name - ) # step_name indicate last finished step_name + # with logger.tag(f"Loop_{li}.{name}"): + start = datetime.datetime.now(datetime.timezone.utc) + func = getattr(self, name) + try: + self.loop_prev_out[name] = func(self.loop_prev_out) + # TODO: Fix the error logger.exception(f"Skip loop {li} due to {e}") + except self.skip_loop_error as e: + # FIXME: This does not support previous demo (due to their last step is not for recording) + logger.warning(f"Skip loop {li} due to {e}") + # NOTE: strong assumption! The last step is responsible for recording information + self.step_idx = len(self.steps) - 1 # directly jump to the last step. + self.loop_prev_out[self.EXCEPTION_KEY] = e + continue + finally: + # make sure failure steps are displayed correclty + end = datetime.datetime.now(datetime.timezone.utc) + self.loop_trace[li].append(LoopTrace(start, end, step_idx=si)) + + # Update tqdm progress bar directly to step_idx + pbar.n = si + 1 + pbar.set_postfix( + loop_index=li, step_index=si + 1, step_name=name + ) # step_name indicate last finished step_name # index increase and save session self.step_idx = (self.step_idx + 1) % len(self.steps) From 626296c47df3ff72b7b78611365af4fcb4b45454 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 17 Jan 2025 09:46:48 +0000 Subject: [PATCH 301/304] fix CI --- rdagent/app/data_science/loop.py | 2 +- rdagent/components/coder/data_science/model/__init__.py | 6 +----- rdagent/core/evolving_agent.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index 30a33de29..f6d9306fc 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -61,7 +61,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): def direct_exp_gen(self, prev_out: dict[str, Any]): exp = self.exp_gen.gen(self.trace) logger.log_object(exp, tag="direct_exp_gen") - + # FIXME: this is for LLM debug webapp, remove this when the debugging is done. logger.log_object(exp, tag="debug_exp_gen") return exp diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 93113bb95..537d3fab5 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -96,11 +96,7 @@ def implement_one_task( # 3. post process to align file name to the task name batch_edit = { - ( - f"{target_task.name}.py" - if value != "__DEL__" and key != f"{target_task.name}.py" - else key - ): value + (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" else key): value for key, value in batch_edit.items() } diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index d2c36a4e0..eded60b36 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -58,7 +58,7 @@ def multistep_evolve( eva: Evaluator | Feedback, filter_final_evo: bool = False, ) -> EvolvableSubjects: - for evo_loop_id in tqdm(range(self.max_loop), "Implementing"): + for _ in tqdm(range(self.max_loop), "Implementing"): # with logger.tag(f"evo_loop_{evo_loop_id}"): # 1. 
knowledge self-evolving if self.knowledge_self_gen and self.rag is not None: From a826c2b016d0ffb33d40469447045f2968b830bd Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 17 Jan 2025 18:19:16 +0800 Subject: [PATCH 302/304] Update rdagent/app/data_science/loop.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- rdagent/app/data_science/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py index f6d9306fc..930e3a3a5 100644 --- a/rdagent/app/data_science/loop.py +++ b/rdagent/app/data_science/loop.py @@ -134,7 +134,7 @@ def main(path=None, step_n=None, competition="bms-molecular-translation"): competition : - Auto R&D Evolving loop for models in a kaggle{} scenario. + Auto R&D Evolving loop for models in a Kaggle scenario. You can continue running session by .. code-block:: bash dotenv run -- python rdagent/app/data_science/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional parameter From 734700f238df5b40302538be59a6285d8772e8b0 Mon Sep 17 00:00:00 2001 From: TPLin22 Date: Fri, 17 Jan 2025 11:15:57 +0000 Subject: [PATCH 303/304] add samplecsv into spec prompts --- .../coder/data_science/raw_data_loader/__init__.py | 3 ++- .../coder/data_science/raw_data_loader/prompts.yaml | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 1d37be742..07d537e18 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -90,9 +90,10 @@ def implement_one_task( system_prompt = T(".prompts:spec.system").r( task_desc=data_loader_task_info, competition_info=competition_info, + folder_spec=data_folder_info, ) data_loader_prompt = T(".prompts:spec.user.data_loader").r( - latest_spec=workspace.file_dict.get("spec/data_loader.md") + latest_spec=workspace.file_dict.get("spec/data_loader.md") ) feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.file_dict.get("spec/feature.md")) model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.file_dict.get("spec/model.md")) diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml index 0d74a2e2b..b77e64028 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -27,6 +27,10 @@ spec: -----------Competition Information----------- {{ competition_info }} + -----------Folder Description---------(All path are relative to the data folder) + - Ensure that all columns in sample_submission can be generated. + {{ folder_spec }} + user: data_loader: |- Data loader specification text should follow these detailed requirements: @@ -61,13 +65,15 @@ spec: 4. Notes: - Update `DT` (data type) based on the specific competition dataset. This can include `pd.DataFrame`, `np.array`, `torch.Tensor`, etc. - Extend domain-specific handling steps based on the competition information. - + {% if latest_spec %} 5. Former Specification: {{ latest_spec }} You should follow the provided specifications to improve this task. 
{% endif %} + + Please respond with a JSON structure as follows: { "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." From 792660b845b431ada014fbf302ee831fb6b957f0 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 17 Jan 2025 14:46:15 +0000 Subject: [PATCH 304/304] fix CI --- .../components/coder/data_science/raw_data_loader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 07d537e18..73a7a349d 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -93,7 +93,7 @@ def implement_one_task( folder_spec=data_folder_info, ) data_loader_prompt = T(".prompts:spec.user.data_loader").r( - latest_spec=workspace.file_dict.get("spec/data_loader.md") + latest_spec=workspace.file_dict.get("spec/data_loader.md") ) feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.file_dict.get("spec/feature.md")) model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.file_dict.get("spec/model.md"))
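The change that recurs throughout PATCH 300 — indexing `trace.hist` entries as `t[0]`/`t[1]` instead of `t[1]`/`t[2]`, and reading the hypothesis via `experiment.hypothesis` — corresponds to shrinking each history entry from a `(hypothesis, experiment, feedback)` triple to an `(experiment, feedback)` pair, with the hypothesis now carried on the experiment object. Below is a minimal sketch of that convention; the dataclasses are simplified stand-ins rather than the real `Trace`, `Experiment`, or `HypothesisFeedback` classes, and the truthiness of the feedback (relied on by the `if t[1]` filters in the patched proposal modules) is assumed to follow its `decision` flag.

```python
# Minimal sketch (not taken from the patches) of the trace.hist layout after
# PATCH 300: each entry is (experiment, feedback), and the hypothesis travels
# on the experiment. The classes below are simplified stand-ins for the real
# rdagent Trace / Experiment / HypothesisFeedback definitions.
from dataclasses import dataclass, field


@dataclass
class Hypothesis:
    text: str


@dataclass
class Experiment:
    hypothesis: Hypothesis            # hypothesis merged into the experiment
    result: float | None = None
    based_experiments: list["Experiment"] = field(default_factory=list)


@dataclass
class Feedback:
    observations: str
    decision: bool                    # assumed: True means the result is accepted

    def __bool__(self) -> bool:       # lets `if t[1]` filter on the decision
        return self.decision


@dataclass
class Trace:
    hist: list[tuple[Experiment, Feedback]] = field(default_factory=list)


trace = Trace()
exp = Experiment(hypothesis=Hypothesis("use a deeper model"), result=0.81)
trace.hist.append((exp, Feedback(observations="score improved", decision=True)))

# The indexing patterns the patches switch to:
accepted = [t[0] for t in trace.hist if t[1]]                  # was: [t[1] ... if t[2]]
last_hypothesis_and_feedback = (trace.hist[-1][0].hypothesis,  # was: trace.hist[-1][0]
                                trace.hist[-1][1])             #      trace.hist[-1][2]
print(accepted, last_hypothesis_and_feedback)
```

Under this assumption, `trace.hist[-1][0].hypothesis` replaces the old `trace.hist[-1][0]` for retrieving the latest hypothesis, which is exactly the substitution made in `rdagent/scenarios/kaggle/developer/feedback.py` above, and filters such as `[t[0] for t in trace.hist if t[1]]` keep only experiments whose feedback was accepted.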