diff --git a/.gitignore b/.gitignore index 400cf7d8e..fa8a46224 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ Pipfile public release-notes.md +typescript* # Byte-compiled / optimized / DLL files __pycache__/ @@ -170,3 +171,4 @@ mlruns/ # shell script *.out *.sh +.aider* diff --git a/pyproject.toml b/pyproject.toml index 5a3fbdb4a..9a8aa5375 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,10 @@ explicit_package_bases = true warn_return_any = true warn_unused_ignores = true +[[tool.mypy.overrides]] +ignore_missing_imports = true +module = "llama" + [tool.pytest.ini_options] addopts = "-l -s --durations=0" log_cli = true @@ -77,7 +81,6 @@ src = ["rdagent"] [tool.ruff.lint] ignore = [ # https://docs.astral.sh/ruff/rules/#pydocstyle-d - "ANN101", "ANN401", "D", "ERA001", @@ -88,7 +91,7 @@ ignore = [ "S101", "S301", "T20", - "TCH003", + "TC003", "TD", ] select = ["ALL"] diff --git a/rdagent/app/data_mining/conf.py b/rdagent/app/data_mining/conf.py index 45b1ef355..e8b6ab8f9 100644 --- a/rdagent/app/data_mining/conf.py +++ b/rdagent/app/data_mining/conf.py @@ -23,7 +23,7 @@ class MedBasePropSetting(BasePropSetting): runner: str = "rdagent.scenarios.data_mining.developer.model_runner.DMModelRunner" """Runner class""" - summarizer: str = "rdagent.scenarios.data_mining.developer.feedback.DMModelHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.data_mining.developer.feedback.DMModelExperiment2Feedback" """Summarizer class""" evolving_n: int = 10 diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py new file mode 100644 index 000000000..25a92f3e3 --- /dev/null +++ b/rdagent/app/data_science/conf.py @@ -0,0 +1,49 @@ +from rdagent.app.kaggle.conf import KaggleBasePropSetting +from rdagent.core.conf import ExtendedSettingsConfigDict + + +class DataScienceBasePropSetting(KaggleBasePropSetting): + model_config = ExtendedSettingsConfigDict(env_prefix="DS_", protected_namespaces=()) + + # Main components + ## Scen + scen: str = "rdagent.scenarios.data_science.scen.KaggleScen" + """Scenario class for data mining model""" + + ## proposal + exp_gen: str = "rdagent.scenarios.data_science.proposal.exp_gen.DSExpGen" + # exp_gen_init_kwargs: dict = {"max_trace_hist": 3} # TODO: to be configurable + + # the two below should be used in ExpGen + # hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen" + # """Hypothesis generation class""" + # + # hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment" + # """Hypothesis to experiment class""" + + ## dev/coder + data_loader_coder: str = "rdagent.components.coder.data_science.raw_data_loader.DataLoaderCoSTEER" + """Data Loader CoSTEER""" + + # feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER" + # """Feature Coder class""" + + # model_feature_selection_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelFeatureSelectionCoder" + # """Model Feature Selection Coder class""" + + # model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER" + # """Model Coder class""" + + ## dev/runner + feature_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGFactorRunner" + """Feature Runner class""" + + model_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGModelRunner" + """Model Runner class""" + + ## feedback + summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGExperiment2Feedback" + """Summarizer class""" + + +DS_RD_SETTING = DataScienceBasePropSetting() 
diff --git a/rdagent/app/data_science/debug.py b/rdagent/app/data_science/debug.py new file mode 100644 index 000000000..e5ea7da7b --- /dev/null +++ b/rdagent/app/data_science/debug.py @@ -0,0 +1,6 @@ +import fire + +from rdagent.scenarios.data_science.debug.data import create_debug_data + +if __name__ == "__main__": + fire.Fire(create_debug_data) diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py new file mode 100644 index 000000000..930e3a3a5 --- /dev/null +++ b/rdagent/app/data_science/loop.py @@ -0,0 +1,163 @@ +from pathlib import Path +from typing import Any + +import fire + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER +from rdagent.components.coder.data_science.feature import FeatureCoSTEER +from rdagent.components.coder.data_science.model import ModelCoSTEER +from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER +from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER +from rdagent.components.workflow.conf import BasePropSetting +from rdagent.components.workflow.rd_loop import RDLoop +from rdagent.core.exception import CoderError, RunnerError +from rdagent.core.proposal import ExperimentFeedback, HypothesisFeedback +from rdagent.core.scenario import Scenario +from rdagent.core.utils import import_class +from rdagent.log import rdagent_logger as logger +from rdagent.scenarios.data_science.dev.feedback import DSExperiment2Feedback +from rdagent.scenarios.data_science.dev.runner import DSRunner +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace +from rdagent.scenarios.kaggle.kaggle_crawler import download_data + + +class DataScienceRDLoop(RDLoop): + skip_loop_error = (CoderError, RunnerError) + + def __init__(self, PROP_SETTING: BasePropSetting): + logger.log_object(PROP_SETTING.competition, tag="competition") + scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) + + ### shared components in the workflow # TODO: check if + knowledge_base = ( + import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen) + if PROP_SETTING.knowledge_base != "" + else None + ) + + # 1) task generation from scratch + # self.scratch_gen: tuple[HypothesisGen, Hypothesis2Experiment] = DummyHypothesisGen(scen), + + # 2) task generation from a complete solution + # self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) + self.exp_gen = DSExpGen(scen) + self.data_loader_coder = DataLoaderCoSTEER(scen) + self.feature_coder = FeatureCoSTEER(scen) + self.model_coder = ModelCoSTEER(scen) + self.ensemble_coder = EnsembleCoSTEER(scen) + self.workflow_coder = WorkflowCoSTEER(scen) + + self.runner = DSRunner(scen) + # self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + # logger.log_object(self.summarizer, tag="summarizer") + + # self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) + self.trace = DSTrace(scen=scen) + self.summarizer = DSExperiment2Feedback(scen) + super(RDLoop, self).__init__() + + def direct_exp_gen(self, prev_out: dict[str, Any]): + exp = self.exp_gen.gen(self.trace) + logger.log_object(exp, tag="direct_exp_gen") + + # FIXME: this is for LLM debug webapp, remove this when the debugging is done. 
+ logger.log_object(exp, tag="debug_exp_gen") + return exp + + def coding(self, prev_out: dict[str, Any]): + exp = prev_out["direct_exp_gen"] + for tasks in exp.pending_tasks_list: + exp.sub_tasks = tasks + if exp.hypothesis.component == "DataLoadSpec": + exp = self.data_loader_coder.develop(exp) + elif exp.hypothesis.component == "FeatureEng": + exp = self.feature_coder.develop(exp) + elif exp.hypothesis.component == "Model": + exp = self.model_coder.develop(exp) + elif exp.hypothesis.component == "Ensemble": + exp = self.ensemble_coder.develop(exp) + elif exp.hypothesis.component == "Workflow": + exp = self.workflow_coder.develop(exp) + else: + raise NotImplementedError(f"Unsupported component in DataScienceRDLoop: {exp.hypothesis.component}") + exp.sub_tasks = [] + logger.log_object(exp, tag="coding") + return exp + + def running(self, prev_out: dict[str, Any]): + exp: DSExperiment = prev_out["coding"] + if exp.next_component_required() is None: + new_exp = self.runner.run(exp) + logger.log_object(new_exp, tag="running") + return new_exp + else: + return exp + + def feedback(self, prev_out: dict[str, Any]) -> ExperimentFeedback: + exp: DSExperiment = prev_out["running"] + if exp.next_component_required() is None: + feedback = self.summarizer.generate_feedback(exp, self.trace) + else: + feedback = ExperimentFeedback( + reason=f"{exp.hypothesis.component} is completed.", + decision=True, + ) + logger.log_object(feedback, tag="feedback") + return feedback + + def record(self, prev_out: dict[str, Any]): + e = prev_out.get(self.EXCEPTION_KEY, None) + if e is None: + self.trace.hist.append((prev_out["running"], prev_out["feedback"])) + else: + self.trace.hist.append( + ( + prev_out["direct_exp_gen"] if isinstance(e, CoderError) else prev_out["coding"], + ExperimentFeedback.from_exception(e), + ) + ) + logger.log_object(self.trace, tag="trace") + logger.log_object(self.trace.sota_experiment(), tag="SOTA experiment") + + +def main(path=None, step_n=None, competition="bms-molecular-translation"): + """ + + Parameters + ---------- + path : + path like `$LOG_PATH/__session__/1/0_propose`. It indicates that we restore the state reached after finishing step 0 in loop 1. + step_n : + How many steps to run; if None, it will run forever until an error or KeyboardInterrupt occurs. + competition : + + + Auto R&D Evolving loop for models in a Kaggle scenario. + You can continue a running session by + .. code-block:: bash + dotenv run -- python rdagent/app/data_science/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is an optional parameter + rdagent kaggle --competition playground-series-s4e8 # You are encouraged to use this one.
+ """ + if competition is not None: + DS_RD_SETTING.competition = competition + + if DS_RD_SETTING.competition: + if DS_RD_SETTING.scen.endswith("KaggleScen"): + download_data(competition=DS_RD_SETTING.competition, settings=DS_RD_SETTING) + else: + if not Path(f"{DS_RD_SETTING.local_data_path}/{competition}").exists(): + logger.error(f"Please prepare data for competition {competition} first.") + return + else: + logger.error("Please specify competition name.") + if path is None: + kaggle_loop = DataScienceRDLoop(DS_RD_SETTING) + else: + kaggle_loop = DataScienceRDLoop.load(path) + kaggle_loop.run(step_n=step_n) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py index b52047308..b0c9e7165 100644 --- a/rdagent/app/kaggle/conf.py +++ b/rdagent/app/kaggle/conf.py @@ -1,8 +1,7 @@ -from rdagent.components.workflow.conf import BasePropSetting -from rdagent.core.conf import ExtendedSettingsConfigDict +from rdagent.core.conf import ExtendedBaseSettings, ExtendedSettingsConfigDict -class KaggleBasePropSetting(BasePropSetting): +class KaggleBasePropSetting(ExtendedBaseSettings): model_config = ExtendedSettingsConfigDict(env_prefix="KG_", protected_namespaces=()) # 1) overriding the default @@ -30,7 +29,7 @@ class KaggleBasePropSetting(BasePropSetting): model_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGModelRunner" """Model Runner class""" - summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGExperiment2Feedback" """Summarizer class""" evolving_n: int = 10 @@ -45,12 +44,21 @@ class KaggleBasePropSetting(BasePropSetting): local_data_path: str = "" """Folder storing Kaggle competition data""" + if_using_mle_data: bool = False + auto_submit: bool = False + """Automatically upload and submit each experiment result to Kaggle platform""" + # Conditionally set the knowledge_base based on the use of graph RAG + knowledge_base: str = "" + """Knowledge base class, uses 'KGKnowledgeGraph' when advanced graph-based RAG is enabled, otherwise empty.""" if_action_choosing_based_on_UCB: bool = False """Enable decision mechanism based on UCB algorithm""" domain_knowledge_path: str = "/data/userdata/share/kaggle/domain_knowledge" """Folder storing domain knowledge files in .case format""" + knowledge_base_path: str = "kg_graph.pkl" + """Advanced version of graph-based RAG""" + rag_path: str = "git_ignore_folder/kaggle_vector_base.pkl" """Base version of vector-based RAG""" @@ -60,20 +68,8 @@ class KaggleBasePropSetting(BasePropSetting): if_using_graph_rag: bool = False """Enable advanced graph-based RAG""" - # Conditionally set the knowledge_base based on the use of graph RAG - knowledge_base: str = "" - """Knowledge base class, uses 'KGKnowledgeGraph' when advanced graph-based RAG is enabled, otherwise empty.""" - - knowledge_base_path: str = "kg_graph.pkl" - """Advanced version of graph-based RAG""" - - auto_submit: bool = False - """Automatically upload and submit each experiment result to Kaggle platform""" - mini_case: bool = False """Enable mini-case study for experiments""" - if_using_mle_data: bool = False - KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting() diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index 2c66d668a..9f4cae811 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -7,16 +7,15 @@ from rdagent.components.workflow.conf import BasePropSetting from 
rdagent.components.workflow.rd_loop import RDLoop from rdagent.core.developer import Developer -from rdagent.core.exception import FactorEmptyError, ModelEmptyError +from rdagent.core.exception import CoderError, FactorEmptyError, ModelEmptyError from rdagent.core.proposal import ( + Experiment2Feedback, Hypothesis2Experiment, - HypothesisExperiment2Feedback, HypothesisGen, ) from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time from rdagent.scenarios.kaggle.experiment.scenario import ( KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING, @@ -28,7 +27,6 @@ class KaggleRDLoop(RDLoop): - @measure_time def __init__(self, PROP_SETTING: BasePropSetting): with logger.tag("init"): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) @@ -55,27 +53,31 @@ def __init__(self, PROP_SETTING: BasePropSetting): logger.log_object(self.feature_runner, tag="feature runner") self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) logger.log_object(self.model_runner, tag="model runner") - self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) logger.log_object(self.summarizer, tag="summarizer") self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) super(RDLoop, self).__init__() - @measure_time def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: - exp = self.feature_coder.develop(prev_out["exp_gen"]) - elif prev_out["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: - exp = self.model_feature_selection_coder.develop(prev_out["exp_gen"]) + if prev_out["direct_exp_gen"]["propose"].action in [ + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + ]: + exp = self.feature_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) + elif prev_out["direct_exp_gen"]["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION: + exp = self.model_feature_selection_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) else: - exp = self.model_coder.develop(prev_out["exp_gen"]) + exp = self.model_coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) logger.log_object(exp.sub_workspace_list, tag="coder result") return exp - @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback - if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: + if prev_out["direct_exp_gen"]["propose"].action in [ + KG_ACTION_FEATURE_ENGINEERING, + KG_ACTION_FEATURE_PROCESSING, + ]: exp = self.feature_runner.develop(prev_out["coding"]) else: exp = self.model_runner.develop(prev_out["coding"]) @@ -113,7 +115,7 @@ def running(self, prev_out: dict[str, Any]): return exp - skip_loop_error = (ModelEmptyError, FactorEmptyError) + skip_loop_error = (ModelEmptyError, FactorEmptyError, CoderError) def main(path=None, step_n=None, competition=None): @@ -126,7 +128,7 @@ def main(path=None, step_n=None, competition=None): """ if competition: KAGGLE_IMPLEMENT_SETTING.competition = competition - download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path) + download_data(competition=competition, settings=KAGGLE_IMPLEMENT_SETTING) if KAGGLE_IMPLEMENT_SETTING.if_using_graph_rag: KAGGLE_IMPLEMENT_SETTING.knowledge_base 
= ( "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph" diff --git a/rdagent/app/qlib_rd_loop/conf.py b/rdagent/app/qlib_rd_loop/conf.py index e6a91351a..da1a98c56 100644 --- a/rdagent/app/qlib_rd_loop/conf.py +++ b/rdagent/app/qlib_rd_loop/conf.py @@ -21,7 +21,7 @@ class ModelBasePropSetting(BasePropSetting): runner: str = "rdagent.scenarios.qlib.developer.model_runner.QlibModelRunner" """Runner class""" - summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibModelHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibModelExperiment2Feedback" """Summarizer class""" evolving_n: int = 10 @@ -47,7 +47,7 @@ class FactorBasePropSetting(BasePropSetting): runner: str = "rdagent.scenarios.qlib.developer.factor_runner.QlibFactorRunner" """Runner class""" - summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibFactorHypothesisExperiment2Feedback" + summarizer: str = "rdagent.scenarios.qlib.developer.feedback.QlibFactorExperiment2Feedback" """Summarizer class""" evolving_n: int = 10 diff --git a/rdagent/app/qlib_rd_loop/factor.py b/rdagent/app/qlib_rd_loop/factor.py index a27bf59ec..8379b0f08 100755 --- a/rdagent/app/qlib_rd_loop/factor.py +++ b/rdagent/app/qlib_rd_loop/factor.py @@ -10,13 +10,11 @@ from rdagent.components.workflow.rd_loop import RDLoop from rdagent.core.exception import FactorEmptyError from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time class FactorRDLoop(RDLoop): skip_loop_error = (FactorEmptyError,) - @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback exp = self.runner.develop(prev_out["coding"]) diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py index 2acab1012..b89bb88af 100644 --- a/rdagent/app/qlib_rd_loop/factor_from_report.py +++ b/rdagent/app/qlib_rd_loop/factor_from_report.py @@ -14,7 +14,6 @@ from rdagent.core.prompts import Prompts from rdagent.core.proposal import Hypothesis from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorExperiment from rdagent.scenarios.qlib.factor_experiment_loader.pdf_loader import ( @@ -98,11 +97,11 @@ def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[Qlib report_content = "\n".join(docs_dict.values()) hypothesis = generate_hypothesis(factor_result, report_content) + exp.hypothesis = hypothesis return exp, hypothesis class FactorReportLoop(FactorRDLoop, metaclass=LoopMeta): - @measure_time def __init__(self, report_folder: str = None): super().__init__(PROP_SETTING=FACTOR_FROM_REPORT_PROP_SETTING) if report_folder is None: @@ -118,7 +117,6 @@ def __init__(self, report_folder: str = None): self.current_loop_exp = None self.steps = ["propose_hypo_exp", "propose", "exp_gen", "coding", "running", "feedback"] - @measure_time def propose_hypo_exp(self, prev_out: dict[str, Any]): with logger.tag("r"): while True: @@ -131,7 +129,9 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]): if exp is None: continue self.valid_pdf_file_count += 1 - exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]] + exp.based_experiments = [QlibFactorExperiment(sub_tasks=[], hypothesis=hypothesis)] + [ + t[0] for t in self.trace.hist if t[1] + ] exp.sub_workspace_list = exp.sub_workspace_list[: 
FACTOR_FROM_REPORT_PROP_SETTING.max_factors_per_exp] exp.sub_tasks = exp.sub_tasks[: FACTOR_FROM_REPORT_PROP_SETTING.max_factors_per_exp] logger.log_object(hypothesis, tag="hypothesis generation") @@ -140,14 +140,18 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]): self.current_loop_exp = exp return None - @measure_time def propose(self, prev_out: dict[str, Any]): return self.current_loop_hypothesis - @measure_time def exp_gen(self, prev_out: dict[str, Any]): return self.current_loop_exp + def coding(self, prev_out: dict[str, Any]): + with logger.tag("d"): # develop + exp = self.coder.develop(prev_out["exp_gen"]) + logger.log_object(exp.sub_workspace_list, tag="coder result") + return exp + def main(report_folder=None, path=None, step_n=None): """ diff --git a/rdagent/app/utils/ape.py b/rdagent/app/utils/ape.py new file mode 100644 index 000000000..474e9a2d9 --- /dev/null +++ b/rdagent/app/utils/ape.py @@ -0,0 +1,49 @@ +""" +This is the preliminary version of the APE (Automated Prompt Engineering) +""" + +import pickle +from pathlib import Path + +from rdagent.core.conf import RD_AGENT_SETTINGS + + +def get_llm_qa(file_path): + data_flt = [] + with open(file_path, "rb") as f: + data = pickle.load(f) + print(len(data)) + for item in data: + if "debug_llm" in item["tag"]: + data_flt.append(item) + return data_flt + + +# Example usage +# use +file_path = Path(RD_AGENT_SETTINGS.log_trace_path) / "debug_llm.pkl" +llm_qa = get_llm_qa(file_path) +print(len(llm_qa)) + +print(llm_qa[0]) + +# Initialize APE backend +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + +api = APIBackend() + +# Analyze test data and generate improved prompts +for qa in llm_qa: + # Generate system prompt for APE + system_prompt = T(".prompts:ape.system").r() + + # Generate user prompt with context from LLM QA + user_prompt = T(".prompts:ape.user").r( + system=qa["obj"].get("system", ""), user=qa["obj"]["user"], answer=qa["obj"]["resp"] + ) + analysis_result = api.build_messages_and_create_chat_completion( + system_prompt=system_prompt, user_prompt=user_prompt + ) + print(f"█" * 60) + yes = input("Do you want to continue? (y/n)") diff --git a/rdagent/app/utils/prompts.yaml b/rdagent/app/utils/prompts.yaml new file mode 100644 index 000000000..3d24df175 --- /dev/null +++ b/rdagent/app/utils/prompts.yaml @@ -0,0 +1,119 @@ +ape: + system: |- + We'll provide you with a pair of Chat QA about data science. + We are creating solutions for a Kaggle Competition based on the answers. + Good questions are crucial for getting good answers. + Please suggest how to improve the question. + You can analyze based on these aspects: + - Is the question complete (is all the information needed to answer the question provided?) + + The conversation will be provided in the following format: + + + + ...text to describe the question... + + + ...text to describe the question... + + + + + ...text to describe the answer. + + + Your response should be very concrete and concise (less than 20 words) and focus on the mentioned aspects, like + ``` + Info Missing: the question asks for changing code, but it does not provide the description of the current code. + ``` + Please be very conservative when you propose improvements. Only propose improvements when it becomes impossible to give the answer. + + Don't propose concrete modifications + + user: |- + + + {{system}} + + + {{user}} + + + + + {{answer}} + + + optional: |- + If you want to suggest modifications to the question.
Please follow the *SEARCH/REPLACE block* Rules!!!! It is optional. + Please make it concise and less than 20 lines!!! + + # *SEARCH/REPLACE block* Rules: + + Every *SEARCH/REPLACE block* must use this format: + 1. The *FULL* file path alone on a line, verbatim. No bold asterisks, no quotes around it, no escaping of characters, etc. + 2. The opening fence and code language, eg: ```python + 3. The start of search block: <<<<<<< SEARCH + 4. A contiguous chunk of lines to search for in the existing source code + 5. The dividing line: ======= + 6. The lines to replace into the source code + 7. The end of the replace block: >>>>>>> REPLACE + 8. The closing fence: ``` + + Use the *FULL* file path, as shown to you by the user. + + Every *SEARCH* section must *EXACTLY MATCH* the existing file content, character for character, including all comments, docstrings, etc. + If the file contains code or other data wrapped/escaped in json/xml/quotes or other containers, you need to propose edits to the literal contents of the file, including the container markup. + + *SEARCH/REPLACE* blocks will *only* replace the first match occurrence. + Including multiple unique *SEARCH/REPLACE* blocks if needed. + Include enough lines in each SEARCH section to uniquely match each set of lines that need to change. + + Keep *SEARCH/REPLACE* blocks concise. + Break large *SEARCH/REPLACE* blocks into a series of smaller blocks that each change a small portion of the file. + Include just the changing lines, and a few surrounding lines if needed for uniqueness. + Do not include long runs of unchanging lines in *SEARCH/REPLACE* blocks. + + Only create *SEARCH/REPLACE* blocks for files that the user has added to the chat! + + To move code within a file, use 2 *SEARCH/REPLACE* blocks: 1 to delete it from its current location, 1 to insert it in the new location. + + Pay attention to which filenames the user wants you to edit, especially if they are asking you to create a new file. + + If you want to put code in a new file, use a *SEARCH/REPLACE block* with: + - A new file path, including dir name if needed + - An empty `SEARCH` section + - The new file's contents in the `REPLACE` section + + To rename files which have been added to the chat, use shell commands at the end of your response. + + If the user just says something like "ok" or "go ahead" or "do that" they probably want you to make SEARCH/REPLACE blocks for the code changes you just proposed. + The user will say when they've applied your edits. If they haven't explicitly confirmed the edits have been applied, they probably want proper SEARCH/REPLACE blocks. + + You are diligent and tireless! + You NEVER leave comments describing code without implementing it! + You always COMPLETELY IMPLEMENT the needed code! + + + ONLY EVER RETURN CODE IN A *SEARCH/REPLACE BLOCK*! + Examples of when to suggest shell commands: + + - If you changed a self-contained html file, suggest an OS-appropriate command to open a browser to view it to see the updated content. + - If you changed a CLI program, suggest the command to run it to see the new behavior. + - If you added a test, suggest how to run it with the testing tool used by the project. + - Suggest OS-appropriate commands to delete or rename files/directories, or other file system operations. + - If your code changes add new dependencies, suggest the command to install them. + - Etc. + + Here is a example of SEARCH/REPLACE BLOCK to change a function implementation to import. 
+ + <<<<<<< SEARCH + def hello(): + "print a greeting" + + print("hello") + ======= + from hello import hello + + >>>>>>> REPLACE +# - Is there any ambiguity in the question? diff --git a/rdagent/components/coder/CoSTEER/__init__.py b/rdagent/components/coder/CoSTEER/__init__.py index 7dddbfe2d..a6c53a65d 100644 --- a/rdagent/components/coder/CoSTEER/__init__.py +++ b/rdagent/components/coder/CoSTEER/__init__.py @@ -105,4 +105,5 @@ def develop(self, exp: Experiment) -> Experiment: pickle.dump(self.knowledge_base, open(self.new_knowledge_base_path, "wb")) logger.info(f"New knowledge base saved to {self.new_knowledge_base_path}") exp.sub_workspace_list = experiment.sub_workspace_list + exp.experiment_workspace = experiment.experiment_workspace return exp diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py index 37f4d1ca0..4a329cada 100644 --- a/rdagent/components/coder/CoSTEER/evaluators.py +++ b/rdagent/components/coder/CoSTEER/evaluators.py @@ -1,4 +1,5 @@ from abc import abstractmethod +from dataclasses import dataclass from typing import List from rdagent.components.coder.CoSTEER.evolvable_subjects import EvolvingItem @@ -10,8 +11,52 @@ from rdagent.core.utils import multiprocessing_wrapper from rdagent.log import rdagent_logger as logger +# TODO: +# 1. It seems logically sound, but we currently lack a scenario to apply it. +# 2. If it proves to be useful, relocate it to a more general location. +# +# class FBWorkspaceExeFeedback(Feedback): +# """ +# It pairs with FBWorkspace in the abstract level. +# """ +# # ws: FBWorkspace # potential +# stdout: str + +@dataclass class CoSTEERSingleFeedback(Feedback): + # TODO: (xiao) + # it should be a more general class for FBWorkspaceExeFeedback + # A better name for it may be NormalFeedback + # TODO: It should be a general feedback for CoSTEER + """ + The feedback for the code implementation evaluation. + It is designed to align with the phases of the implemented code + - Execution -> Return Value -> Code -> Final Decision + """ + execution: str + # execution_feedback + return_checking: str | None # including every check in the testing (constraints about the generated value) + # value_feedback, shape_feedback, value_generated_flag + code: str + final_decision: bool + + def __str__(self) -> str: + return f"""------------------Execution------------------ +{self.execution} +------------------Return Checking------------------ +{self.return_checking if self.return_checking is not None else 'No return checking'} +------------------Code------------------ +{self.code} +------------------Final Decision------------------ +This implementation is {'SUCCESS' if self.final_decision else 'FAIL'}. +""" + + def __bool__(self): + return self.final_decision + + +class CoSTEERSingleFeedbackDeprecated(CoSTEERSingleFeedback): """This class is a base class for all code generator feedback on a single implementation""" def __init__( @@ -26,7 +71,6 @@ def __init__( final_decision_based_on_gt: bool = None, ) -> None: self.execution_feedback = execution_feedback - self.shape_feedback = shape_feedback self.code_feedback = code_feedback self.value_feedback = value_feedback self.final_decision = final_decision @@ -34,6 +78,26 @@ def __init__( self.value_generated_flag = value_generated_flag self.final_decision_based_on_gt = final_decision_based_on_gt + # TODO: + # Not general enough, so we should not put them in the general CoSTEER feedback. + # Instead, we should create a subclass for it.
+ self.shape_feedback = shape_feedback # Not general enough. + + # TODO: @property + @property + def execution(self): + return self.execution_feedback + + @property + def return_checking(self): + if self.value_generated_flag: + return f"value feedback: {self.value_feedback}\n\nshape feedback: {self.shape_feedback}" + return None + + @property + def code(self): + return self.code_feedback + def __str__(self) -> str: return f"""------------------Execution Feedback------------------ {self.execution_feedback if self.execution_feedback is not None else 'No execution feedback'} @@ -73,6 +137,8 @@ def evaluate( class CoSTEERMultiEvaluator(Evaluator): + """This is for the evaluation of an experiment. Since we have multiple tasks, we will return a list of evaluation feedbacks""" + def __init__(self, single_evaluator: CoSTEEREvaluator, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.single_evaluator = single_evaluator diff --git a/rdagent/components/coder/CoSTEER/evolving_agent.py b/rdagent/components/coder/CoSTEER/evolving_agent.py index 70c097290..ef610a213 100644 --- a/rdagent/components/coder/CoSTEER/evolving_agent.py +++ b/rdagent/components/coder/CoSTEER/evolving_agent.py @@ -1,18 +1,32 @@ -from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedback +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedbackDeprecated from rdagent.components.coder.CoSTEER.evolvable_subjects import EvolvingItem from rdagent.core.evolving_agent import RAGEvoAgent from rdagent.core.evolving_framework import EvolvableSubjects +from rdagent.core.exception import CoderError class FilterFailedRAGEvoAgent(RAGEvoAgent): + def filter_evolvable_subjects_by_feedback( - self, evo: EvolvableSubjects, feedback: CoSTEERSingleFeedback + self, evo: EvolvableSubjects, feedback: CoSTEERSingleFeedbackDeprecated ) -> EvolvableSubjects: assert isinstance(evo, EvolvingItem) + # FIXME: the list does not align with the annotation; it should be MultipleFeedback instead of a list of feedbacks assert isinstance(feedback, list) assert len(evo.sub_workspace_list) == len(feedback) for index in range(len(evo.sub_workspace_list)): - if evo.sub_workspace_list[index] is not None and feedback[index] and not feedback[index].final_decision: + if evo.sub_workspace_list[index] is not None and feedback[index] is not None and not feedback[index]: evo.sub_workspace_list[index].clear() + + failed_feedbacks = [ + f"- feedback{index + 1:02d}:\n - execution: {f.execution}\n - return_checking: {f.return_checking}\n - code: {f.code}" + for index, f in enumerate(feedback) + if f is not None and not f.final_decision + ] + + if len(failed_feedbacks) == len(feedback): + feedback_summary = "\n".join(failed_feedbacks) + raise CoderError(f"All tasks have failed:\n{feedback_summary}") + return evo diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py index c7126e7ff..7275e2a9d 100644 --- a/rdagent/components/coder/CoSTEER/evolving_strategy.py +++ b/rdagent/components/coder/CoSTEER/evolving_strategy.py @@ -12,7 +12,7 @@ from rdagent.core.conf import RD_AGENT_SETTINGS from rdagent.core.evaluation import Scenario from rdagent.core.evolving_framework import EvolvingStrategy, QueriedKnowledge -from rdagent.core.experiment import Workspace +from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.core.scenario import Task from rdagent.core.utils import multiprocessing_wrapper @@ -30,7 +30,17 @@ def
implement_one_task( self, target_task: Task, queried_knowledge: QueriedKnowledge = None, - ) -> Workspace: + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: # FIXME: fix the interface of previous implementations + """ + This method takes the task & current workspace as input, + and outputs the modifications to be applied to the workspace + (i.e. replace the content of <file name> with <new content>). + + Return + ------ + The new files {<file name>: <new content>} to update the workspace. + """ raise NotImplementedError def select_one_round_tasks( @@ -86,7 +96,10 @@ def evolve( result = multiprocessing_wrapper( [ - (self.implement_one_task, (evo.sub_tasks[target_index], queried_knowledge)) + ( + self.implement_one_task, + (evo.sub_tasks[target_index], queried_knowledge, evo.experiment_workspace), + ) for target_index in to_be_finished_task_index ], n=RD_AGENT_SETTINGS.multi_proc_n, diff --git a/rdagent/components/coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/CoSTEER/knowledge_management.py index d67215e56..5f3bff064 100644 --- a/rdagent/components/coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/CoSTEER/knowledge_management.py @@ -48,7 +48,7 @@ def __init__( def get_implementation_and_feedback_str(self) -> str: return f"""------------------implementation code:------------------ -{self.implementation.code} +{self.implementation.all_codes} ------------------implementation feedback:------------------ {self.feedback!s} """ @@ -269,15 +269,15 @@ def generate_knowledge( else: # generate error node and store into knowledge base error_analysis_result = [] - if not single_feedback.value_generated_flag: + if single_feedback.return_checking: error_analysis_result = self.analyze_error( - single_feedback.execution_feedback, - feedback_type="execution", + single_feedback.return_checking, + feedback_type="value", ) else: error_analysis_result = self.analyze_error( - single_feedback.value_feedback, - feedback_type="value", + single_feedback.execution, + feedback_type="execution", ) self.knowledgebase.working_trace_error_analysis.setdefault( target_task_information, @@ -425,8 +425,8 @@ def former_trace_query( current_index = 1 while current_index < len(former_trace_knowledge): if ( - not former_trace_knowledge[current_index].feedback.value_generated_flag - and former_trace_knowledge[current_index - 1].feedback.value_generated_flag + not former_trace_knowledge[current_index].feedback.return_checking + and former_trace_knowledge[current_index - 1].feedback.return_checking ): former_trace_knowledge.pop(current_index) else: @@ -718,7 +718,7 @@ def __init__(self, init_component_list=None, path: str | Path = None) -> None: Load knowledge, offer brief information of knowledge and common handle interfaces """ self.graph: UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl") - logger.info(f"Knowledge Graph loaded, size={self.graph.size()}") + logger.info(f"CoSTEER Knowledge Graph loaded, size={self.graph.size()}") if init_component_list: for component in init_component_list: diff --git a/rdagent/components/coder/CoSTEER/task.py b/rdagent/components/coder/CoSTEER/task.py index aaa38a4f1..5bc898994 100644 --- a/rdagent/components/coder/CoSTEER/task.py +++ b/rdagent/components/coder/CoSTEER/task.py @@ -4,4 +4,6 @@ class CoSTEERTask(Task): def __init__(self, base_code: str = None, *args, **kwargs) -> None: super().__init__(*args, **kwargs) + # TODO: we may upgrade the base_code into a workspace-like thing to know previous. + # NOTE: (xiao) think we don't need the base_code anymore.
The information should be retrieved from the workspace. self.base_code = base_code diff --git a/rdagent/components/coder/data_science/ensemble/__init__.py b/rdagent/components/coder/data_science/ensemble/__init__.py index 40d21d3e5..e50e46fbb 100644 --- a/rdagent/components/coder/data_science/ensemble/__init__.py +++ b/rdagent/components/coder/data_science/ensemble/__init__.py @@ -1,19 +1,131 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class ModelEnsembleCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# ModelEnsembleCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = ModelEnsembleMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) +""" +File structure +- ___init__.py: the entrance/agent of coder +- evaluator.py +- conf.py +- exp.py: everything under the experiment, e.g. + - Task + - Experiment + - Workspace +- test.py + - Each coder could be tested. +""" + +import json + +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) +from rdagent.components.coder.data_science.ensemble.eval import EnsembleCoSTEEREvaluator +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.core.exception import CoderError +from rdagent.core.experiment import FBWorkspace +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + + +class EnsembleMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: EnsembleTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + # Get task information for knowledge querying + ensemble_information_str = target_task.get_task_information() + + # Query knowledge + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[ensemble_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[ensemble_information_str] + if queried_knowledge is not None + else [] + ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("ensemble.py") is not None + and knowledge.implementation.file_dict.get("ensemble.py") == workspace.file_dict.get("ensemble.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("ensemble.py") != workspace.file_dict.get("ensemble.py") + ], + queried_former_failed_knowledge[1], + ) + + # Generate code with knowledge integration + 
competition_info = self.scen.get_scenario_all_desc() + system_prompt = T(".prompts:ensemble_coder.system").r( + task_desc=ensemble_information_str, + competition_info=competition_info, + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=( + queried_former_failed_knowledge[0] if queried_former_failed_knowledge else None + ), + ) + user_prompt = T(".prompts:ensemble_coder.user").r( + ensemble_spec=workspace.file_dict["spec/ensemble.md"], + latest_code=workspace.file_dict.get("ensemble.py"), + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, + ) + + for _ in range(5): + ensemble_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if ensemble_code != workspace.file_dict.get("ensemble.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new ensemble code.") + + return { + "ensemble.py": ensemble_code, + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. + """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_files(**code_list[index]) + return evo + + +class EnsembleCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator(EnsembleCoSTEEREvaluator(scen=scen), scen=scen) + es = EnsembleMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/ensemble/conf.py b/rdagent/components/coder/data_science/ensemble/conf.py new file mode 100644 index 000000000..b6c859788 --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/conf.py @@ -0,0 +1,2 @@ +# Configuration file for ensemble component +# Currently empty as no specific configuration is needed diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py new file mode 100644 index 000000000..f130326e1 --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/eval.py @@ -0,0 +1,75 @@ +import json +from pathlib import Path + +from jinja2 import Environment, StrictUndefined + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERSingleFeedback, +) +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf + +DIRNAME = Path(__file__).absolute().resolve().parent + +EnsembleEvalFeedback = CoSTEERSingleFeedback + + +class EnsembleCoSTEEREvaluator(CoSTEEREvaluator): + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: 
QueriedKnowledge = None, + **kwargs, + ) -> EnsembleEvalFeedback: + + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return EnsembleEvalFeedback( + execution="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + final_decision=False, + ) + + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } + de = DockerEnv(conf=ds_docker_conf) + + fname = "ensemble_test.txt" + test_code = (DIRNAME / "eval_tests" / "ensemble_test.txt").read_text() + test_code = ( + Environment(undefined=StrictUndefined) + .from_string(test_code) + .render( + model_names=[ + fn[:-3] for fn in implementation.file_dict.keys() if fn.startswith("model_") and "test" not in fn + ] + ) + ) + + implementation.inject_files(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") + + system_prompt = T(".prompts:ensemble_eval.system").r( + task_desc=target_task_information, + test_code=test_code, + code=implementation.file_dict["ensemble.py"], + ) + user_prompt = T(".prompts:ensemble_eval.user").r(stdout=stdout) + + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return EnsembleEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt new file mode 100644 index 000000000..5a716731d --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/eval_tests/ensemble_test.txt @@ -0,0 +1,65 @@ +""" +A qualified ensemble implementation should: +- Successfully run +- Return predictions +- Have correct shapes for inputs and outputs +- Use validation data appropriately +""" + +import numpy as np +from pathlib import Path +from sklearn.model_selection import train_test_split +from load_data import load_data +from feature import feat_eng +from ensemble import ens_and_decision + +X, y, test_X, test_ids = load_data() +X, y, test_X = feat_eng(X, y, test_X) +train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42) + +test_preds_dict = {} +val_preds_dict = {} +{% for mn in model_names %} +from {{mn}} import model_workflow as {{mn}}_workflow +val_preds_dict["{{mn}}"], test_preds_dict["{{mn}}"], _ = {{mn}}_workflow( + X=train_X, + y=train_y, + val_X=val_X, + val_y=val_y, + test_X=test_X +) +{% endfor %} + +for key in val_preds_dict.keys(): + if val_preds_dict[key] is None: + print(f"Model {key} validation predictions (val_preds_dict[key]) is None.") + elif isinstance(val_preds_dict[key], list): + print(f"Model {key} validation predictions (val_preds_dict[key]) (list type) length: {len(val_preds_dict[key])}") + else: + print(f"Model {key} validation predictions (val_preds_dict[key]) shape: {val_preds_dict[key].shape}") + + if test_preds_dict[key] is None: + print(f"Model {key} test predictions (test_preds_dict[key]) is None.") + elif isinstance(test_preds_dict[key], list): + print(f"Model {key} 
test predictions (test_preds_dict[key]) (list type) length: {len(test_preds_dict[key])}") + else: + print(f"Model {key} test predictions (test_preds_dict[key]) shape: {test_preds_dict[key].shape}") + +# Run ensemble +final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) + +# Check shape +if isinstance(final_pred, list): + assert len(final_pred) == len(test_X), ( + f"Wrong output sample size: len(final_pred) ({len(final_pred)}) and len(test_X) ({len(test_X)})" + ) +else: + assert final_pred.shape[0] == test_X.shape[0], ( + f"Wrong output sample size: final_pred.shape[0] ({final_pred.shape[0]}) and test_X.shape[0] ({test_X.shape[0]})" + ) + +# check if scores.csv is generated +assert Path("scores.csv").exists(), "scores.csv is not generated" + +print("Ensemble test passed successfully.") +print(f"Output shape: {final_pred.shape}") diff --git a/rdagent/components/coder/data_science/ensemble/exp.py b/rdagent/components/coder/data_science/ensemble/exp.py new file mode 100644 index 000000000..2850ca0f0 --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/exp.py @@ -0,0 +1,10 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.utils import cache_with_pickle + +EnsembleTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml new file mode 100644 index 000000000..6fde9d6ce --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml @@ -0,0 +1,79 @@ +ensemble_coder: + system: |- + You are a Python data scientist working on model ensemble implementation. Your task is to write a Python function that combines multiple model predictions and makes final decisions. + + Your specific task as follows: + {{task_desc}} + + You should follow the provided specifications to complete this task. + + -----------Competition Information----------- + {{ competition_info }} + + Please respond with the code in the following json format: + { + "code": "The Python code as a string." 
+ } + + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} + -----------Here is the relevant information for this task----------- + {% endif %} + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.file_dict["ensemble.py"] }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.file_dict["ensemble.py"] }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + + user: |- + Please implement an ensemble function with the following specification: + -----------Ensemble Specification----------- + {{ ensemble_spec }} + + {% if latest_code %} + ---------Former code--------- + {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. + {% endif %} +ensemble_eval: + system: |- + You are a data scientist evaluating an ensemble implementation. + The main code generation task is as follows: + {{task_desc}} + + The ensemble code is: + ```python + {{code}} + ``` + + You are testing the ensemble with the following code: + ```python + {{test_code}} + ``` + + You'll be given the stdout of your testing scripts. + Please respond with your feedback in the following JSON format: + { + "execution": "Describe how well the ensemble executed, including any errors or issues encountered. Please keep the error message and tracking information", + "return_checking": "Detail the checks performed on the ensemble results, including shape and value validation.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", + "final_decision": + } + user: |- + {{stdout}} diff --git a/rdagent/components/coder/data_science/ensemble/test.py b/rdagent/components/coder/data_science/ensemble/test.py new file mode 100644 index 000000000..1a7a3d08b --- /dev/null +++ b/rdagent/components/coder/data_science/ensemble/test.py @@ -0,0 +1,58 @@ +""" +Helper functions for testing the ensemble coder(CoSTEER-based) component. 
+""" + +import sys +from pathlib import Path + +from rdagent.components.coder.data_science.ensemble import EnsembleCoSTEER +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.scen import KaggleScen + +# Add the competition folder to path +COMPETITION_PATH = ( + Path(__file__).parent.parent.parent.parent.parent + / "scenarios" + / "kaggle" + / "tpl_ex" + / "aerial-cactus-identification" +) +sys.path.append(str(COMPETITION_PATH)) + +EnsembleExperiment = DSExperiment + + +def load_ensemble_spec(): + spec_path = COMPETITION_PATH / "spec" / "ensemble.md" + with open(spec_path, "r") as f: + return f.read() + + +def develop_one_competition(competition: str): + # Initialize scenario and coder + scen = KaggleScen(competition=competition) + ensemble_coder = EnsembleCoSTEER(scen) + # Load ensemble specification + ensemble_spec = load_ensemble_spec() + + # Create the ensemble task with actual data context and specification + task = EnsembleTask( + name="EnsembleTask", + description=""" + Implement ensemble and decision making for model predictions. + """, + ) + + exp = EnsembleExperiment(sub_tasks=[task]) + + # Injecting the corresponding specification + exp.experiment_workspace.inject_files(**{"spec/ensemble.md": ensemble_spec}) + + # Develop the experiment + exp = ensemble_coder.develop(exp) + return exp + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") diff --git a/rdagent/components/coder/data_science/feature/__init__.py b/rdagent/components/coder/data_science/feature/__init__.py new file mode 100644 index 000000000..6020acf72 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/__init__.py @@ -0,0 +1,118 @@ +import json + +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) +from rdagent.components.coder.data_science.feature.eval import FeatureCoSTEEREvaluator +from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.core.exception import CoderError +from rdagent.core.experiment import FBWorkspace +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + + +class FeatureMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: FeatureTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + # return a workspace with "load_data.py", "spec/load_data.md" inside + # assign the implemented code to the new workspace. + feature_information_str = target_task.get_task_information() + + # 1. 
query + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[feature_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[feature_information_str] + if queried_knowledge is not None + else [] + ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("feature.py") is not None + and knowledge.implementation.file_dict.get("feature.py") == workspace.file_dict.get("feature.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("feature.py") != workspace.file_dict.get("feature.py") + ], + queried_former_failed_knowledge[1], + ) + + # 2. code + system_prompt = T(".prompts:feature.system").r( + task_desc=feature_information_str, + data_loader_code=workspace.file_dict.get("load_data.py"), + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) + user_prompt = T(".prompts:feature.user").r( + feature_spec=workspace.file_dict["spec/feature.md"], + latest_code=workspace.file_dict.get("feature.py"), + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, + ) + + for _ in range(5): + feature_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if feature_code != workspace.file_dict.get("feature.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new feature code.") + + return { + "feature.py": feature_code, + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. 
+ """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_files(**code_list[index]) + return evo + + +class FeatureCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + FeatureCoSTEEREvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + es = FeatureMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py new file mode 100644 index 000000000..c4fcdfd26 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/eval.py @@ -0,0 +1,68 @@ +import json +from pathlib import Path + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERSingleFeedback, +) +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf +from rdagent.utils.fmt import shrink_text + +DIRNAME = Path(__file__).absolute().resolve().parent + +FeatureEvalFeedback = CoSTEERSingleFeedback + + +class FeatureCoSTEEREvaluator(CoSTEEREvaluator): + + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> FeatureEvalFeedback: + + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return FeatureEvalFeedback( + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", + final_decision=False, + ) + + ds_docker_conf = DSDockerConf() + # TODO: we should /= 20 for the timeout period on debug component + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } + de = DockerEnv(conf=ds_docker_conf) + + # TODO: do we need to clean the generated temporary content? 
+ fname = "feature_test.py" + test_code = (DIRNAME / "eval_tests" / "feature_test.txt").read_text() + implementation.inject_files(**{fname: test_code}) + + stdout = implementation.execute(env=de, entry=f"python {fname}") + + system_prompt = T(".prompts:feature_eval.system").r( + task_desc=target_task.get_task_information(), + test_code=test_code, + code=implementation.file_dict["feature.py"], + ) + user_prompt = T(".prompts:feature_eval.user").r(stdout=shrink_text(stdout)) + + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return FeatureEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt new file mode 100644 index 000000000..a731c0409 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt @@ -0,0 +1,35 @@ +""" +A qualified data loader should support following features +- successfully run +- len(test) == len(test_ids) == submission length +- len(train) == len(y) + +Please make sure the stdout is rich enough to support informative feedback +""" + +import pickle + +import numpy as np +import pandas as pd +from feature import feat_eng +from load_data import load_data + +X, y, X_test, test_ids = load_data() +print(f"X.shape: {X.shape}") +print(f"y.shape: {y.shape}" if not isinstance(y, list) else f"y(list)'s length: {len(y)}") +print(f"X_test.shape: {X_test.shape}") +print(f"test_ids length: {len(test_ids)}") +X, y, X_test = feat_eng(X, y, X_test) + + +def get_length(data): + return len(data) if isinstance(data, list) else data.shape[0] + +assert get_length(X_test) == get_length(test_ids), ( + f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})" +) +assert get_length(X) == get_length(y), ( + f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})" +) + +print("Feature Engineering test passed successfully. Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/feature/exp.py b/rdagent/components/coder/data_science/feature/exp.py new file mode 100644 index 000000000..e6658050d --- /dev/null +++ b/rdagent/components/coder/data_science/feature/exp.py @@ -0,0 +1,10 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.utils import cache_with_pickle + +FeatureTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml new file mode 100644 index 000000000..b49eae8b9 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/prompts.yaml @@ -0,0 +1,91 @@ +feature: + system: |- + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + + Your task is as follows: + {{task_desc}} + + This project involves implementing feature engineering techniques to prepare data for machine learning models, and this project code will be written by GPT. + Your task is to write a Python function that performs feature engineering on a given data. 
+ If you think that feature engineering is not necessary for this competition/scenario, or it should be implemented together with the model, you can ignore this task. + You should follow the provided specifications to complete this task. + + Your function input is the output of a data loading function, the data loader function code is as follows: + ```python + {{data_loader_code}} + ``` + Please understand the code and try to implement the feature engineering function based on the data loader output. + + Please response the code in the following json format. Here is an example structure for the JSON output: + { + "code": "The Python code as a string." + } + + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} + -----------Here is the relevant information for this task----------- + {% endif %} + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.file_dict["feature.py"] }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.file_dict["feature.py"] }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + + user: |- + ---------Feature Processing Specification--------- + {{ feature_spec }} + + {% if latest_code %} + ---------Former code--------- + {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. + {% endif %} + + +feature_eval: + system: |- + You are data scientist whose job is to evaluate the feature processing code generation. + The main code generation task is as follows: + {{task_desc}} + + The feature code is: + ```python + {{code}} + ``` + + You are testing the feature processing code with the following code + ```python + {{test_code}} + ``` + + You'll be given the stdout of your testing scripts. + Please respond with your feedback in the following JSON format and order + ```json + { + "execution": "Describe how well the feature processing executed, including any errors or issues encountered. Please keep the error message and tracking information", + "return_checking": "Detail the checks performed on the data after feature processing, including data integrity and correctness.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. 
Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", + "final_decision": + } + ``` + user: |- + ``` + {{stdout}} + ``` diff --git a/rdagent/components/coder/data_science/feature/test.py b/rdagent/components/coder/data_science/feature/test.py new file mode 100644 index 000000000..74732e756 --- /dev/null +++ b/rdagent/components/coder/data_science/feature/test.py @@ -0,0 +1,37 @@ +""" +Helper functions for testing the feature coder(CoSTEER-based) component. +- Does the developer loop work correctly + +It is NOT: +- it is not interface unittest(i.e. workspace evaluator in the CoSTEER Loop) +""" + +from rdagent.components.coder.data_science.feature import FeatureCoSTEER +from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.scen import KaggleScen + + +def develop_one_competition(competition: str): # -> experiment + scen = KaggleScen(competition=competition) + feature_coder = FeatureCoSTEER(scen) + + with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md", "r") as file: + feat_spec = file.read() + + # Create the experiment + ft = FeatureTask(name="FeatureTask", description=scen.get_competition_full_desc()) + exp = DSExperiment( + sub_tasks=[ft], + ) + + with open("./rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py", "r") as file: + load_data_code = file.read() + exp.experiment_workspace.inject_files(**{"load_data.py": load_data_code, "spec/feature.md": feat_spec}) + + # Develop the experiment + exp = feature_coder.develop(exp) + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") diff --git a/rdagent/components/coder/data_science/feature_process/__init__.py b/rdagent/components/coder/data_science/feature_process/__init__.py deleted file mode 100644 index 68a1ee6b7..000000000 --- a/rdagent/components/coder/data_science/feature_process/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class FeatureCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# FeatureCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = FeatureMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/model/__init__.py b/rdagent/components/coder/data_science/model/__init__.py index 7d4020cfa..537d3fab5 100644 --- a/rdagent/components/coder/data_science/model/__init__.py +++ b/rdagent/components/coder/data_science/model/__init__.py @@ -1,19 +1,144 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class ModelCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# 
ModelCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) +import json +from pathlib import Path + +from jinja2 import Environment, StrictUndefined + +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) +from rdagent.components.coder.data_science.model.eval import ( + ModelGeneralCaseSpecEvaluator, +) +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.core.exception import CoderError +from rdagent.core.experiment import FBWorkspace +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.ret import BatchEditOut +from rdagent.utils.agent.tpl import T + + +class ModelMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: ModelTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + model_information_str = target_task.get_task_information() + + # 1. query + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[model_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[model_information_str] + if queried_knowledge is not None + else [] + ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get(f"{target_task.name}.py") is not None + and knowledge.implementation.file_dict.get(f"{target_task.name}.py") + == workspace.file_dict.get(f"{target_task.name}.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get(f"{target_task.name}.py") + != workspace.file_dict.get(f"{target_task.name}.py") + ], + queried_former_failed_knowledge[1], + ) + + # 2. code + system_prompt = T(".prompts:model_coder.system").r( + task_desc=model_information_str, + data_loader_code=workspace.file_dict.get("load_data.py"), + feature_code=workspace.file_dict["feature.py"], + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + out_spec=BatchEditOut.get_spec(), + ) + # user_prompt = T(".prompts:model_coder.user").r( + # model_spec=workspace.file_dict["spec/model.md"], + # feature_code=workspace.file_dict["feature.py"], + # latest_code=workspace.file_dict.get(f"{target_task.name}.py", None), + # ) + # We want to use a simpler way to + user_prompt = T(".prompts:model_coder.user_general").r( + model_spec=workspace.file_dict["spec/model.md"], + workspace_code=workspace.get_codes( + r"^model_(?!test)\w+\.py$" + ), # TODO: If we have high failure rate here, we should clean this step with less information. 
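+            # The regex above keeps every implemented model_*.py file in the prompt
+            # while excluding the injected model_test.py evaluation script.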
+ latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, + ) + + for _ in range(5): + batch_edit = BatchEditOut.extract_output( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=BatchEditOut.json_mode, + ) + ) + + # 3. post process to align file name to the task name + batch_edit = { + (f"{target_task.name}.py" if value != "__DEL__" and key != f"{target_task.name}.py" else key): value + for key, value in batch_edit.items() + } + + if batch_edit[f"{target_task.name}.py"] != "__DEL__" and batch_edit[ + f"{target_task.name}.py" + ] != workspace.file_dict.get(f"{target_task.name}.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new model code.") + + return batch_edit + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. + """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_files(**code_list[index]) + return evo + + +class ModelCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + ModelGeneralCaseSpecEvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + # eva = ModelGeneralCaseSpecEvaluator(scen=scen) + es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py new file mode 100644 index 000000000..9331e001b --- /dev/null +++ b/rdagent/components/coder/data_science/model/eval.py @@ -0,0 +1,87 @@ +""" +Beyond previous tests +- +""" + +import json +import re +from pathlib import Path + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERSingleFeedback, +) +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.exception import CoderError +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf + +DIRNAME = Path(__file__).absolute().resolve().parent +ModelSingleFeedback = CoSTEERSingleFeedback + + +# Below are unit tests for testing the specification of the implemented model ------------------ +class ModelGeneralCaseSpecEvaluator(CoSTEEREvaluator): + """ + Motivation case: + - Simplest case, we already split the data into train_data, valid_data, and test_data. We require the model to learn (optionally validate on valid data), and infer on test data. + + Test workflow: + - Build train, valid, and test data to run it, and test the output (e.g., shape, etc.) 
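+    - The injected model test script calls ``model_workflow`` twice: first with the
+      train/valid split and empty hyperparameters, then with the returned
+      hyperparameters and the test split (no validation data), so both paths are
+      exercised.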
+ """ + + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> ModelSingleFeedback: + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return ModelSingleFeedback( + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", + final_decision=False, + ) + + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } + de = DockerEnv(conf=ds_docker_conf) + + fname = "model_test.py" + test_code = ( + (DIRNAME / "eval_tests" / "model_test.txt").read_text().replace("model01", target_task.name) + ) # only check the model changed this time + implementation.inject_files(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") + + if stdout is None: + raise CoderError( + "The execution output contains too many progress bars and results in the LLM's token size exceeding the limit." + ) + + system_prompt = T(".prompts:model_eval.system").r( + task_desc=target_task.get_task_information(), + test_code=test_code, + scenario=self.scen.get_scenario_all_desc(), + spec=implementation.file_dict["spec/model.md"], + ) + user_prompt = T(".prompts:model_eval.user").r( + stdout=stdout, + code=implementation.file_dict[f"{target_task.name}.py"], + ) + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return ModelSingleFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.txt b/rdagent/components/coder/data_science/model/eval_tests/model_test.txt new file mode 100644 index 000000000..9ddcd1834 --- /dev/null +++ b/rdagent/components/coder/data_science/model/eval_tests/model_test.txt @@ -0,0 +1,53 @@ +import time + +from feature import feat_eng +from load_data import load_data +from model01 import model_workflow +from sklearn.model_selection import train_test_split + + +def log_execution_results(start_time, val_pred, test_pred, hypers, execution_label): + """Log the results of a single model execution.""" + feedback_str = f"{execution_label} successful.\n" + feedback_str += f"Validation predictions shape: {val_pred.shape if val_pred is not None else 'None'}\n" + feedback_str += f"Test predictions shape: {test_pred.shape if test_pred is not None else 'None'}\n" + feedback_str += f"Hyperparameters: {hypers if hypers is not None else 'None'}\n" + feedback_str += f"Execution time: {time.time() - start_time:.2f} seconds.\n" + print(feedback_str) + + +# Load and preprocess data +X, y, test_X, test_ids = load_data() +X, y, test_X = feat_eng(X, y, test_X) +train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.8, random_state=42) +print(f"train_X.shape: {train_X.shape}") +print(f"train_y.shape: {train_y.shape}" if not isinstance(train_y, list) else f"train_y(list)'s length: {len(train_y)}") +print(f"val_X.shape: {val_X.shape}") +print(f"val_y.shape: {val_y.shape}" if not 
isinstance(val_y, list) else f"val_y(list)'s length: {len(val_y)}") + +# First execution +print("The first execution begins.\n") +start_time = time.time() +val_pred, test_pred, hypers = model_workflow( + X=train_X, + y=train_y, + val_X=val_X, + val_y=val_y, + test_X=None, +) +log_execution_results(start_time, val_pred, test_pred, hypers, "The first execution") + +# Second execution +print("The second execution begins.\n") +start_time = time.time() +val_pred, test_pred, final_hypers = model_workflow( + X=train_X, + y=train_y, + val_X=None, + val_y=None, + test_X=test_X, + hyper_params=hypers, +) +log_execution_results(start_time, val_pred, test_pred, final_hypers, "The second execution") + +print("Model code test passed successfully.") diff --git a/rdagent/components/coder/data_science/model/exp.py b/rdagent/components/coder/data_science/model/exp.py new file mode 100644 index 000000000..7ebd277bf --- /dev/null +++ b/rdagent/components/coder/data_science/model/exp.py @@ -0,0 +1,43 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.experiment import Experiment, FBWorkspace +from rdagent.core.utils import cache_with_pickle +from rdagent.oai.llm_utils import md5_hash +from rdagent.utils.env import DockerEnv, DSDockerConf + + +class ModelTask(CoSTEERTask): + def __init__( + self, + name: str, + description: str, + architecture: str = "", + *args, + hyperparameters: Dict[str, str] = {}, + model_type: Optional[str] = None, + **kwargs, + ) -> None: + self.architecture: str = architecture + self.hyperparameters: str = hyperparameters + self.model_type: str | None = ( + model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model + # TODO: More Models Supported + ) + super().__init__(name=name, description=description, *args, **kwargs) + + def get_task_information(self): + task_desc = f"""name: {self.name} +description: {self.description} +""" + if self.architecture: + task_desc += f"architecture: {self.architecture}\n" + if self.hyperparameters: + task_desc += f"hyperparameters: {self.hyperparameters}\n" + if self.model_type: + task_desc += f"model_type: {self.model_type}\n" + return task_desc diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml new file mode 100644 index 000000000..ab51831aa --- /dev/null +++ b/rdagent/components/coder/data_science/model/prompts.yaml @@ -0,0 +1,136 @@ +model_coder: + system: |- + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + + Your task is as follows: + {{task_desc}} + + The user's ultimate goal is to obtain accurate predictions from the model on input data. Follow the instructions below to ensure your response is correct and aligned with the user's expectations. + + Your function's input is from the output of a feature engineering function whose input is the output of a data loading function. 
The raw data loader function and feature engineer function code is as follows: + --------- Raw Data Loader Code: --------- + {{data_loader_code}} + --------- Feature Engineering Code: --------- + {{feature_code}} + + Instructions for Code Generation: + Leveraging User Inputs: + The user may provide various forms of additional information to guide you: + + Successful Examples: Correct implementations of similar models. + Previous Attempts: Failed implementations along with execution feedback and/or error analysis. + Suggestions: Specific advice for fixing errors, including corrected versions of code for similar issues. + Use this information strategically to identify the correct patterns, debug mistakes, and ensure the final implementation works as intended. + + Preserving Correct Code: + If the user has shared their latest code, carefully analyze it and only modify parts that require changes. Do not alter correct sections of the code. + + Error Learning: + If previous failed attempts and their feedback are available, learn from them. Understand what went wrong and avoid repeating similar mistakes in your new implementation. + The failure knowledge may include the code unrelated to the model, such as data loading, preprocessing, or feature engineering. Focus only on the model implementation part. + + {% if out_spec %} + {{out_spec}} + The file name should be the model name described in the model task in the format "{task_name}.py". You should always follow this name format. + {% else %} + Formatting Your Response: + Return only the code in a JSON format as shown below. Do not include any explanations or extra text. Example: + { + "code": "Your corrected or newly implemented Python code as a single string" + } + {% endif %} + + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} + -----------Here is the relevant information for this task----------- + {% endif %} + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.file_dict[similar_successful_knowledge.target_task.name ~ '.py'] }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.file_dict[former_failed_knowledge.target_task.name ~ '.py'] }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + + user: |- + ---------Model Specification--------- + {{ model_spec }} + + {% if latest_code %} + ---------Former Code--------- + Former Code: {{ latest_code }} + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. + {% endif %} + + user_general: |- + --------- Workspace code--------- + {% if workspace_code|length == 0 %} + So far the workspace is empty. No model code has been implemented yet. 
+ {% else %} + {{ workspace_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} + {% endif %} + ---------Model Specification--------- + When you are implementing the code, you should follow the spec + {{ model_spec }} + + +model_eval: + system: |- + You are a data scientist. + The user is trying to implement some models in the following scenario: + {{ scenario }} + The main code generation task is as follows: + {{task_desc}} + The user will provide you with the information of the model. + The information about how to implement the model is given in spec.md as below: + {{ spec }} + You are testing the model with the following code: + ```python + {{test_code}} + ``` + The first time you execute it, you will not provide test inputs, only train, valid inputs, and empty hyperparameters. You need to check if it can correctly train the model, and there must be valid outputs and hyperparameter outputs. + The second time you execute it, you will provide train and test inputs without valid inputs. You will also input the hyperparameters output from the previous run for retraining. + Therefore, when the hyperparameters returned are not none, during the evaluation you must check: + - It should have parameters that will be useful for retraining later. It must include the early stop round. + - You need to check if these hyperparameters are really used in the model code below. The early stop round must be used if given. + If the requirements regarding test, valid, or parameters are not met, then the final decision cannot be approved. + + You should evaluate the code given by the user. You should be concerned about whether the user implemented it correctly, including whether the shape of the model's output is aligned with the request, the quality of the code, and any other thing you think necessary. + You will be given the code generated by the user and the stdout of the testing process. + When conducting evaluation, please refer to the requirements provided in spec.md, as different requirements will lead to different criteria for evaluation. + + Only if there is "Model code test passed successfully." in the stdout, then the model is considered successful, or else there must be some issues with the model. + If no stdout is provided, the model is considered to have failed due to a timeout. Please check if there are any ways to improve the model's execution speed. + + Please respond with your feedback in the following JSON format and order: + ```json + { + "execution": "Describe whether the model executed successfully, including any errors or issues encountered. Please keep the error message and tracking information", + "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in spec.md. You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process. Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code. 
It is acceptable when hyperparameters is None.", + "final_decision": + } + ``` + + user: |- + --------------Code generated by user:--------------- + {{ code }} + --------------stdoutput:--------------- + ''' + {{ stdout }} + ''' diff --git a/rdagent/components/coder/data_science/model/test.py b/rdagent/components/coder/data_science/model/test.py new file mode 100644 index 000000000..268bdda1b --- /dev/null +++ b/rdagent/components/coder/data_science/model/test.py @@ -0,0 +1,67 @@ +""" +Generate dataset to test the model workflow output +""" + +from pathlib import Path + +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.data_science.model import ModelCoSTEER +from rdagent.components.coder.data_science.model.eval import ( + ModelGeneralCaseSpecEvaluator, +) +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.core.experiment import FBWorkspace +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.scen import KaggleScen + + +# Take tasks, spec.md and feat as input, generate a feedback as output +def develop_one_competition(competition: str): + scen = KaggleScen(competition=competition) + model_coder = ModelCoSTEER(scen) + + # Create the task + mt = ModelTask( + name="ModelTask", + description="A CNN Model", + model_type="CNN", + architecture="\hat{y}_u = CNN(X_u)", + # variables="variables: {'\\hat{y}_u': 'The predicted output for node u', 'X_u': 'The input features for node u'}", + hyperparameters="...", + base_code="", + ) + + tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition + injected_file_names = ["spec/model.md", "load_data.py", "feature.py", "model01.py"] + + modelexp = FBWorkspace() + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + modelexp.inject_files(**{file_name: file_path.read_text()}) + + mt.base_code += modelexp.file_dict["model01.py"] + exp = DSExperiment( + sub_tasks=[mt], + ) + + # Test the evaluator: + """eva = ModelGeneralCaseSpecEvaluator(scen=scen) + exp.feedback = eva.evaluate(target_task=mt, queried_knowledge=None, implementation=modelexp, gt_implementation=None) + print(exp.feedback)""" + + # Test the evolving strategy: + """es = ModelMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + new_code = es.implement_one_task(target_task=mt, queried_knowledge=None, workspace=modelexp) + print(new_code)""" + + # Run the experiment + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + exp.experiment_workspace.inject_files(**{file_name: file_path.read_text()}) + + exp = model_coder.develop(exp) + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") + # dotenv run -- python rdagent/components/coder/data_science/model/test.py diff --git a/rdagent/components/coder/data_science/raw_data_loader/README.md b/rdagent/components/coder/data_science/raw_data_loader/README.md new file mode 100644 index 000000000..368a08980 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/README.md @@ -0,0 +1,15 @@ +# CoSTEER + +- subworkspace使用主experiment_workspace `RD-Agent/rdagent/scenarios/data_science/experiment/experiment.py` + +## evolving_strategy ( implement_one_task() ) + +1. xxxTask (in exp.py) + - spec + - description +2. + +## evaluator + +1. queried_knowledge部分 共用 +2. 
eval_test脚本 \ No newline at end of file diff --git a/rdagent/components/coder/data_science/raw_data_loader/__init__.py b/rdagent/components/coder/data_science/raw_data_loader/__init__.py index 22ed405df..73a7a349d 100644 --- a/rdagent/components/coder/data_science/raw_data_loader/__init__.py +++ b/rdagent/components/coder/data_science/raw_data_loader/__init__.py @@ -1,19 +1,196 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class DataLoaderCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# DataLoaderCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = DataLoaderMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) +""" + +Loop should not large change exclude +- Action Choice[current data loader & spec] +- other should share + - Propose[choice] => Task[Choice] => CoSTEER => + - + +Extra feature: +- cache + + +File structure +- ___init__.py: the entrance/agent of coder +- evaluator.py +- conf.py +- exp.py: everything under the experiment, e.g. + - Task + - Experiment + - Workspace +- test.py + - Each coder could be tested. +""" + +import json + +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, + CoSTEERQueriedKnowledgeV2, +) +from rdagent.components.coder.data_science.raw_data_loader.eval import ( + DataLoaderCoSTEEREvaluator, +) +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.core.exception import CoderError +from rdagent.core.experiment import FBWorkspace +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + + +class DataLoaderMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: DataLoaderTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + # return a workspace with "load_data.py", "spec/load_data.md" inside + # assign the implemented code to the new workspace. 
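+        # Overall flow: gather scenario info and the queried CoSTEER knowledge, generate
+        # the five spec files once per workspace (step 1 below), then generate
+        # load_data.py with up to five retries if the LLM repeats the previous
+        # implementation (step 2 below).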
+ competition_info = self.scen.get_scenario_all_desc() + data_folder_info = self.scen.processed_data_folder_description + data_loader_task_info = target_task.get_task_information() + + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[data_loader_task_info] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[data_loader_task_info] + if queried_knowledge is not None + else [] + ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("load_data.py") is not None + and knowledge.implementation.file_dict.get("load_data.py") == workspace.file_dict.get("load_data.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("load_data.py") != workspace.file_dict.get("load_data.py") + ], + queried_former_failed_knowledge[1], + ) + + # 1. specifications + # TODO: We may move spec into a separated COSTEER task + if "spec/data_loader.md" not in workspace.file_dict: # Only generate the spec once + system_prompt = T(".prompts:spec.system").r( + task_desc=data_loader_task_info, + competition_info=competition_info, + folder_spec=data_folder_info, + ) + data_loader_prompt = T(".prompts:spec.user.data_loader").r( + latest_spec=workspace.file_dict.get("spec/data_loader.md") + ) + feature_prompt = T(".prompts:spec.user.feature").r(latest_spec=workspace.file_dict.get("spec/feature.md")) + model_prompt = T(".prompts:spec.user.model").r(latest_spec=workspace.file_dict.get("spec/model.md")) + ensemble_prompt = T(".prompts:spec.user.ensemble").r( + latest_spec=workspace.file_dict.get("spec/ensemble.md") + ) + workflow_prompt = T(".prompts:spec.user.workflow").r( + latest_spec=workspace.file_dict.get("spec/workflow.md") + ) + + spec_session = APIBackend().build_chat_session(session_system_prompt=system_prompt) + + data_loader_spec = json.loads( + spec_session.build_chat_completion(user_prompt=data_loader_prompt, json_mode=True) + )["spec"] + feature_spec = json.loads(spec_session.build_chat_completion(user_prompt=feature_prompt, json_mode=True))[ + "spec" + ] + model_spec = json.loads(spec_session.build_chat_completion(user_prompt=model_prompt, json_mode=True))[ + "spec" + ] + ensemble_spec = json.loads(spec_session.build_chat_completion(user_prompt=ensemble_prompt, json_mode=True))[ + "spec" + ] + workflow_spec = json.loads(spec_session.build_chat_completion(user_prompt=workflow_prompt, json_mode=True))[ + "spec" + ] + else: + data_loader_spec = workspace.file_dict["spec/data_loader.md"] + feature_spec = workspace.file_dict["spec/feature.md"] + model_spec = workspace.file_dict["spec/model.md"] + ensemble_spec = workspace.file_dict["spec/ensemble.md"] + workflow_spec = workspace.file_dict["spec/workflow.md"] + + # 2. 
code + system_prompt = T(".prompts:data_loader_coder.system").r( + task_desc=data_loader_task_info, + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) + user_prompt = T(".prompts:data_loader_coder.user").r( + competition_info=competition_info, + data_loader_spec=data_loader_spec, + folder_spec=data_folder_info, + latest_code=workspace.file_dict.get("load_data.py"), + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, + ) + + for _ in range(5): + data_loader_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if data_loader_code != workspace.file_dict.get("load_data.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new data loader code.") + + return { + "spec/data_loader.md": data_loader_spec, + "spec/feature.md": feature_spec, + "spec/model.md": model_spec, + "spec/ensemble.md": ensemble_spec, + "spec/workflow.md": workflow_spec, + "load_data.py": data_loader_code, + } + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. + """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_files(**code_list[index]) + return evo + + +class DataLoaderCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + DataLoaderCoSTEEREvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + es = DataLoaderMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/raw_data_loader/conf.py b/rdagent/components/coder/data_science/raw_data_loader/conf.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py new file mode 100644 index 000000000..ffbe8b39e --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py @@ -0,0 +1,69 @@ +# tess successfully running. +# (GPT) if it aligns with the spec & rationality of the spec. 
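+# The evaluator below runs the generated load_data.py via the injected
+# eval_tests/data_loader_test.txt script inside the competition Docker image and
+# asks an LLM to turn the captured stdout into structured feedback.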
+import json +from pathlib import Path + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERSingleFeedback, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledgeV2, +) +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf + +DIRNAME = Path(__file__).absolute().resolve().parent + +DataLoaderEvalFeedback = CoSTEERSingleFeedback + + +class DataLoaderCoSTEEREvaluator(CoSTEEREvaluator): + + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: CoSTEERQueriedKnowledgeV2 = None, + **kwargs, + ) -> DataLoaderEvalFeedback: + + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return DataLoaderEvalFeedback( + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", + final_decision=False, + ) + + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } + de = DockerEnv(conf=ds_docker_conf) + + # TODO: do we need to clean the generated temporary content? 
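+        # The injected test script asserts len(X) == len(y) and
+        # len(X_test) == len(test_ids) before the stdout is reviewed by the LLM.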
+ fname = "data_loader_test.py" + test_code = (DIRNAME / "eval_tests" / "data_loader_test.txt").read_text() + implementation.inject_files(**{fname: test_code}) + stdout = implementation.execute(env=de, entry=f"python {fname}") + + system_prompt = T(".prompts:data_loader_eval.system").r( + task_desc=target_task.get_task_information(), + test_code=test_code, + code=implementation.file_dict["load_data.py"], + ) + user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout) + + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return DataLoaderEvalFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt new file mode 100644 index 000000000..0da4e61fe --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt @@ -0,0 +1,25 @@ +""" +A qualified data loader should support following features +- successfully run +- len(test) == len(test_ids) == submission length +- len(train) == len(y) + +Please make sure the stdout is rich enough to support informative feedback +""" + +import pickle +from load_data import load_data + +X, y, X_test, test_ids = load_data() + +def get_length(data): + return len(data) if isinstance(data, list) else data.shape[0] + +assert get_length(X_test) == get_length(test_ids), ( + f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})" +) +assert get_length(X) == get_length(y), ( + f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})" +) + +print("Data loader test passed successfully. Length of test images matches length of test IDs.") diff --git a/rdagent/components/coder/data_science/raw_data_loader/exp.py b/rdagent/components/coder/data_science/raw_data_loader/exp.py new file mode 100644 index 000000000..54d280719 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/exp.py @@ -0,0 +1,14 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.experiment import Experiment, FBWorkspace +from rdagent.core.utils import cache_with_pickle +from rdagent.oai.llm_utils import md5_hash +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf + +DataLoaderTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml new file mode 100644 index 000000000..b77e64028 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml @@ -0,0 +1,386 @@ +spec: + system: |- + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + + Currently, you are working on a Kaggle competition project. + This project involves analyzing data and building models to beat other competitors, with the code being generated by large language models. 
+ + Your overall task is provided below: + {{ task_desc }} + + Your task is to write five specification texts (in markdown format) for the following tasks, based on the competition information provided + - Data loading (and preprocessing) + - Feature Engineering + - Model Building + - Ensemble + - The overall workflow + + The specifications for each step should be tailored to the competition information provided. + + Your specification should consists two parts: + 1. The function definition in code format with detailed annotation to each parameter and return value. + 2. A detailed docstring to the function that explains the purpose of the function, the input parameters, and the output. + 3. Additional information or notes that the coder should consider while implementing the function. + Your specifications should not include any code implementation, only the function definition and docstring. + + -----------Competition Information----------- + {{ competition_info }} + + -----------Folder Description---------(All path are relative to the data folder) + - Ensure that all columns in sample_submission can be generated. + {{ folder_spec }} + + user: + data_loader: |- + Data loader specification text should follow these detailed requirements: + 1. Function Interface: + - Function Name: `load_data` + - Input: No input arguments. + - Output: + - `X` (DT, define based on competition information): Feature matrix for training data. + - `y` (DT): Target vector for training data. + - `X_test` (DT): Feature matrix for test data. + - `test_ids` (DT): Identifiers for the test data. + - Docstring Requirements: + - Describe the purpose of the function. + - Specify the data source location (`/kaggle/input/`). + - Clearly define the structure and type of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. + + 2. Precautions for Data Loading and Preprocessing: + - File Handling: + - Ensure proper file encoding (e.g., UTF-8) and delimiters (e.g., CSV comma-separated). + - Combine or process multiple files if necessary. + - Data Preprocessing: + - Convert data types correctly (e.g., numeric, categorical, date parsing). + - Handle missing values appropriately (e.g., impute, drop rows/columns). + - Optimize memory usage for large datasets using techniques like downcasting or reading data in chunks if necessary. + - Domain-Specific Handling: + - Apply competition-specific preprocessing steps as needed (e.g., text tokenization, image resizing). + + 3. Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 4. Notes: + - Update `DT` (data type) based on the specific competition dataset. This can include `pd.DataFrame`, `np.array`, `torch.Tensor`, etc. + - Extend domain-specific handling steps based on the competition information. + + {% if latest_spec %} + 5. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + + + + Please respond with a JSON structure as follows: + { + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." + } + + feature: |- + Feature engineering specification text should adhere to the following requirements: + 1. Function Interface: + - Function Name: `feat_eng` + - Parameters: + - `X` (DT): Train data to be transformed. + - `y` (DT): Train label data. + - `X_test` (DT): Test data. + - Output: + - `X_transformed` (DT): Transformed train data. 
+ - `y_transformed` (DT): Transformed train label data. + - `X_test_transformed` (DT): Transformed test data. + - Docstring Requirements: + - Describe the purpose of the function. + - Clarify the input parameters and their data types. + - Define the structure and format of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. + + 2. Precautions for Feature Engineering: + - Well handle the shape of the data + - The sample size of the train data and the test data should be the same in all scenarios. + - To most of the scenario, the input shape and the output shape should be exactly the same. + - To some tabular data, you may add or remove some columns so your inferred column number may be unsure. + - Integration with Model Pipeline + - If feature engineering is strictly part of the model pipeline, state explicitly that it will be handled at the model stage. + - If integrated here, ensure this function applies all required transformations while avoiding data leakage. + - General Considerations: + - Ensure scalability for large datasets. + - Handle missing values and outliers appropriately (e.g., impute, remove, or replace). + - Ensure consistency between feature data types and transformations. + - Avoid data leakage: Only use features derived from training data, excluding information from test or validation sets. + - Domain-Specific Features: + - Apply logic for competition-specific features (e.g., text vectorization, image augmentations, categorical encoding). + + 3. Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 4. Notes: + - Align `DT` (data type) definitions with those in the Data Loader specification. + - Extend or adjust domain-specific transformations based on competition requirements. + - The device has GPU support, so you can use it for feature engineering if necessary to accelerate the process. + - Multi processing or parallel processing can be used to speed up the feature engineering process. + + {% if latest_spec %} + 5. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + + Please respond with a JSON structure as follows: + { + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." + } + + model: |- + Model building specification text should adhere to the following requirements: + + 1. Function Interface: + - Function Name: `model_workflow` + - Parameters: + - `X` (DT): Training feature data. + - `y` (DT): Training label data. + - `val_X` (Optional[DT]): Validation feature data. + - `val_y` (Optional[DT]): Validation label data. + - `test_X` (Optional[DT]): Test feature data. + - `hyper_params` (dict): Dictionary of hyperparameters for model configuration. + - Output: + - `pred_val` (Optional[DT]): Predictions on validation data. + - `pred_test` (Optional[DT]): Predictions on test data. + - `hyper_params` (dict): Updated dictionary of hyperparameters after training. + - Docstring Requirements: + - Describe the purpose of the function. + - Clarify the input parameters and their data types. + - Define the structure and format of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. + + 2. Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 3. 
Precautions: + - Ensure input arrays (`X`, `y`, `val_X`, `val_y`, `test_X`) have consistent dimensions and shapes. + - Use default values for hyperparameters if `hyper_params` is not provided. + - Train the model on `X` and `y`. + - Evaluate the model using `val_X` and `val_y` if validation data is available. + - If `test_X` is provided, generate predictions for it. + - Do not use progress bars (e.g., `tqdm`) in the implementation. + + 4. Notes: + - Align `DT` (data type) with the definitions used in Feature Engineering specifications. + - The device has GPU support, so you can use it for training if necessary to accelerate the process. + + {% if latest_spec %} + 5. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + + Please respond in the following JSON format: + { + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." + } + + ensemble: |- + Ensemble specification text adhere to the following requirements: + 1. Function Interface: + - Function Name: `ens_and_decision` + - Parameters: + - `test_preds_dict` (Dict[str, DT]): A dictionary of test predictions from different models. The key is the model file name. + - `val_preds_dict` (Dict[str, DT]): A dictionary of validation predictions from different models. The key is the model file name. + - `val_label` (DT): Validation label. + - Output: + - `final_pred` (DT): Ensemble prediction for the test data. + - Docstring Requirements: + - Describe the purpose of the function. + - Clarify the input parameters and their data types. + - Define the structure and format of the output. + - Inferred data shape to each input and output data variables. To uncertain dimension, use -1. + + 2. Precautions: + - Validation of Inputs: + - Ensure all predictions in `test_preds_dict` and `val_preds_dict` have consistent shapes and dimensions. + - Verify that `val_label` is provided and matches the length of `val_preds_dict` predictions. + - Handle empty or invalid inputs gracefully with appropriate error messages. + - You should calculate the metric for each model and ensemble strategy, and save the results in `scores.csv`, e.g.: + ```python + scores = {} + for model_name, val_pred in val_preds_dict.items(): + scores[model_name] = calculate_metric(val_label, val_pred) + + ... some code about ensemble strategy ... + ensemble_score = calculate_metric(val_label, ensemble_pred) + scores[] = ensemble_score + + scores_df = pd.DataFrame(scores.items(), columns=['Model', ]) + scores_df.to_csv("scores.csv", index=False) + ``` + - Consensus Strategy: + - Clearly define how the ensemble predictions are aggregated (e.g., majority voting, weighted average). + - Avoid introducing biases or overfitting during decision-making. + + 3. Code Standards: + - Avoid using progress bars (e.g., `tqdm`) in the implementation. + + 4. Notes: + - Align `DT` (data type) definitions with those used in model specifications. + - Ensure flexibility to handle multiple ensemble strategies based on competition requirements. + + {% if latest_spec %} + 5. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + + Please respond in the following JSON format: + { + "spec": "The function definition in code format, tailored to the Competition Information, with detailed explanations provided in the docstring." 
+ } + + workflow: |- + Your task is to implement the main workflow script (`main.py`) for a Kaggle-style machine learning competition project. + Follow the provided project structure and specifications to ensure consistency and maintainability: + 1. Workflow Integration: + - Integrate the following components into the workflow: + - Data loading (`load_data.py`). + - Feature engineering (`feature.py`). + - Model workflow for training and testing (`model_*.py`). + - Ensemble and decision-making (`ensemble.py`). + - Treat each component as a modular and callable Python function. + 2. Feature Engineering + - The feature engineering should be called only once. For example: + `X_transformed, y_transformed, X_test_transformed = feat_eng(X, y, X_test)` + - It should be called before dataset splitting. + + 3. Dataset Splitting + - The dataset returned by `load_data` is not split into training and testing sets, so the dataset splitting should happen after calling `feat_eng`. + - By default, split the dataset into 80% for training and 20% for testing. + - You can also use cross-validation or other splitting methods as you deem more useful and appropriate based on the Competition Information. + + 4. Submission File: + - Save the final predictions as `submission.csv` in the format required by the competition. + - Present the required submission format explicitly and ensure the output adheres to it. + + 5. Code Standards: + - Use consistent naming conventions and type annotations. + - Document the workflow with clear comments and docstring. + - Do not use progress bars (e.g., tqdm) in the code. + + 6. Ensemble Strategy: + Put all the model's return into a dict, using the model file name as key, and the return as value. + Sample code: + {% raw %} + {% for mn in model_names %} + from {{mn}} import model_workflow as {{mn}}_workflow + val_preds_dict["{{mn}}"], test_preds_dict["{{mn}}"], _ = {{mn}}_workflow( + X=train_X, + y=train_y, + val_X=val_X, + val_y=val_y, + test_X=test_X + ) + {% endfor %} + final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y) + {% endraw %} + + {% if latest_spec %} + 5. Former Specification: + {{ latest_spec }} + You should follow the provided specifications to improve this task. + {% endif %} + + Please response the specification in the following json format. Here is an example structure for the JSON output: + { + "spec": "The corresponding specification string as described above. You should create the rules based on the competition information instead of copying the requirements." + } + +data_loader_coder: + system: |- + You are a Python data scientist working on a new project. This project will be used to analyze data and build models to predict future outcomes, and this project codes will be written by GPT. + Your task is described below: + {{ task_desc }} + You should follow the provided specifications to complete this task. + You need to write the corresponding data loading code based on the information provided in the user's Data Folder Description, rather than relying on any suggestions that might exist in the spec. + + Notice, the data files are stored in the data folder located at `/kaggle/input/`, and the data folder is structured as described in the Data Folder Description. Please don't load the data from the current directory. + + Please response the code in the following json format. Here is an example structure for the JSON output: + { + "code": "The Python code as a string." 
+ } + + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} + -----------Here is the relevant information for this task----------- + {% endif %} + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.all_codes }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.all_codes }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + + user: |- + ---------Competition Information--------- + {{ competition_info }} + + ---------Data Loader Specification--------- + {{ data_loader_spec }} + + ---------Data Folder Description---------(All path are relative to the data folder) + {{ folder_spec }} + + {% if latest_code %} + ---------Former code--------- + {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. + {% endif %} + + You should strictly follow the function interface specifications provided by the specification to implement the function. + + +data_loader_eval: + system: |- + You are data scientist writing some data loader code for a Kaggle-style machine learning competition project. + The main code generation task is as follows: + {{task_desc}} + + The data loader code is in a file named "load_data.py": + ```python + {{code}} + ``` + + You are testing the data_loader with the following code + ```python + {{test_code}} + ``` + + You'll be given the stdout of your testing scripts. + Please respond with your feedback in the following JSON format and order + ```json + { + "execution": "Describe how well the data loader executed, including any errors or issues encountered. Please keep the error message and tracking information", + "return_checking": "Detail the checks performed on the data loaded, including data integrity and correctness.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. Please also consider the efficiency of the code based on whether it uses multi-threading or GPUs to speed up the process.", + "final_decision": + } + ``` + user: |- + ``` + {{stdout}} + ``` diff --git a/rdagent/components/coder/data_science/raw_data_loader/test.py b/rdagent/components/coder/data_science/raw_data_loader/test.py new file mode 100644 index 000000000..2cd68a790 --- /dev/null +++ b/rdagent/components/coder/data_science/raw_data_loader/test.py @@ -0,0 +1,30 @@ +""" +Helper functions for testing the raw_data_loader coder(CoSTEER-based) component. +- Does the developer loop work correctly + +It is NOT: +- it is not interface unittest(i.e. 
workspace evaluator in the CoSTEER Loop) +""" + +from rdagent.components.coder.data_science.raw_data_loader import DataLoaderCoSTEER +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.scen import KaggleScen + + +def develop_one_competition(competition: str): # -> experiment + scen = KaggleScen(competition=competition) + data_loader_coder = DataLoaderCoSTEER(scen) + + # Create the experiment + dlt = DataLoaderTask(name="DataLoaderTask", description="") + exp = DSExperiment( + sub_tasks=[dlt], + ) + + # Develop the experiment + exp = data_loader_coder.develop(exp) + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") diff --git a/rdagent/components/coder/data_science/workflow/__init__.py b/rdagent/components/coder/data_science/workflow/__init__.py index 90961567e..09f4ba6d2 100644 --- a/rdagent/components/coder/data_science/workflow/__init__.py +++ b/rdagent/components/coder/data_science/workflow/__init__.py @@ -1,19 +1,120 @@ -# from rdagent.components.coder.CoSTEER import CoSTEER -# from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS -# from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator -# from rdagent.core.scenario import Scenario - - -# class WorkflowCoSTEER(CoSTEER): -# def __init__( -# self, -# scen: Scenario, -# *args, -# **kwargs, -# ) -> None: -# eva = CoSTEERMultiEvaluator( -# WorkflowCoSTEEREvaluator(scen=scen), scen=scen -# ) # Please specify whether you agree running your eva in parallel or not -# es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) - -# super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=1, scen=scen, **kwargs) +import json + +from rdagent.components.coder.CoSTEER import CoSTEER +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator +from rdagent.components.coder.CoSTEER.evolving_strategy import ( + MultiProcessEvolvingStrategy, +) +from rdagent.components.coder.CoSTEER.knowledge_management import ( + CoSTEERQueriedKnowledge, +) +from rdagent.components.coder.data_science.workflow.eval import ( + WorkflowGeneralCaseSpecEvaluator, +) +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask +from rdagent.core.exception import CoderError +from rdagent.core.experiment import FBWorkspace +from rdagent.core.scenario import Scenario +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T + + +class WorkflowMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy): + def implement_one_task( + self, + target_task: WorkflowTask, + queried_knowledge: CoSTEERQueriedKnowledge | None = None, + workspace: FBWorkspace | None = None, + ) -> dict[str, str]: + # competition_info = self.scen.competition_descriptions + workflow_information_str = target_task.get_task_information() + + # 1. 
query + queried_similar_successful_knowledge = ( + queried_knowledge.task_to_similar_task_successful_knowledge[workflow_information_str] + if queried_knowledge is not None + else [] + ) + queried_former_failed_knowledge = ( + queried_knowledge.task_to_former_failed_traces[workflow_information_str] + if queried_knowledge is not None + else [] + ) + latest_code_feedback = [ + knowledge.feedback + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("main.py") is not None + and knowledge.implementation.file_dict.get("main.py") == workspace.file_dict.get("main.py") + ] + if len(latest_code_feedback) > 0: + queried_former_failed_knowledge = ( + [ + knowledge + for knowledge in queried_former_failed_knowledge[0] + if knowledge.implementation.file_dict.get("main.py") != workspace.file_dict.get("main.py") + ], + queried_former_failed_knowledge[1], + ) + + # 2. code + system_prompt = T(".prompts:workflow_coder.system").r( + task_desc=workflow_information_str, + competition_info=self.scen.get_competition_full_desc(), + queried_similar_successful_knowledge=queried_similar_successful_knowledge, + queried_former_failed_knowledge=queried_former_failed_knowledge[0], + ) + user_prompt = T(".prompts:workflow_coder.user").r( + load_data_code=workspace.file_dict["load_data.py"], + feature_code=workspace.file_dict["feature.py"], + model_codes=workspace.get_codes(r"^model_(?!test)\w+\.py$"), + ensemble_code=workspace.file_dict["ensemble.py"], + latest_code=workspace.file_dict.get("main.py"), + workflow_spec=workspace.file_dict["spec/workflow.md"], + latest_code_feedback=latest_code_feedback[0] if len(latest_code_feedback) > 0 else None, + ) + + for _ in range(5): + workflow_code = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + )["code"] + if workflow_code != workspace.file_dict.get("main.py"): + break + else: + user_prompt = user_prompt + "\nPlease avoid generating same code to former code!" + else: + raise CoderError("Failed to generate a new workflow code.") + + return {"main.py": workflow_code} + + def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo): + """ + Assign the code list to the evolving item. + + The code list is aligned with the evolving item's sub-tasks. + If a task is not implemented, put a None in the list. 
+ """ + for index in range(len(evo.sub_tasks)): + if code_list[index] is None: + continue + if evo.sub_workspace_list[index] is None: + # evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index]) + evo.sub_workspace_list[index] = evo.experiment_workspace + evo.sub_workspace_list[index].inject_files(**code_list[index]) + return evo + + +class WorkflowCoSTEER(CoSTEER): + def __init__( + self, + scen: Scenario, + *args, + **kwargs, + ) -> None: + eva = CoSTEERMultiEvaluator( + WorkflowGeneralCaseSpecEvaluator(scen=scen), scen=scen + ) # Please specify whether you agree running your eva in parallel or not + es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + super().__init__(*args, settings=CoSTEER_SETTINGS, eva=eva, es=es, evolving_version=2, scen=scen, **kwargs) diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py new file mode 100644 index 000000000..584075ede --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/eval.py @@ -0,0 +1,99 @@ +import json +import re +from pathlib import Path + +import pandas as pd + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.components.coder.CoSTEER.evaluators import ( + CoSTEEREvaluator, + CoSTEERMultiFeedback, + CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, +) +from rdagent.core.evolving_framework import QueriedKnowledge +from rdagent.core.experiment import FBWorkspace, Task +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T +from rdagent.utils.env import DockerEnv, DSDockerConf + +DIRNAME = Path(__file__).absolute().resolve().parent + +WorkflowSingleFeedback = CoSTEERSingleFeedback +WorkflowMultiFeedback = CoSTEERMultiFeedback + + +class WorkflowGeneralCaseSpecEvaluator(CoSTEEREvaluator): + """ + Motivation case: + - Simplest case, we already split the data into train_data, valid_data, and test_data. We require the model to learn (optionally validate on valid data), and infer on test data. + + Test workflow: + - Build train, valid, and test data to run it, and test the output (e.g., shape, etc.) + """ + + def evaluate( + self, + target_task: Task, + implementation: FBWorkspace, + gt_implementation: FBWorkspace, + queried_knowledge: QueriedKnowledge = None, + **kwargs, + ) -> CoSTEERSingleFeedbackDeprecated: + target_task_information = target_task.get_task_information() + if ( + queried_knowledge is not None + and target_task_information in queried_knowledge.success_task_to_knowledge_dict + ): + return queried_knowledge.success_task_to_knowledge_dict[target_task_information].feedback + elif queried_knowledge is not None and target_task_information in queried_knowledge.failed_task_info_set: + return WorkflowSingleFeedback( + execution="This task has failed too many times, skip implementation.", + return_checking="This task has failed too many times, skip implementation.", + code="This task has failed too many times, skip implementation.", + final_decision=False, + ) + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = { + f"{DS_RD_SETTING.local_data_path}/sample/{self.scen.competition}": "/kaggle/input" + } + de = DockerEnv(conf=ds_docker_conf) + fname = "main.py" + stdout = implementation.execute(env=de, entry=f"python {fname}") + + # Check score file + score_fp = implementation.workspace_path / "scores.csv" + if not score_fp.exists(): + stdout += "\nMetrics file (scores.csv) is not generated." 
+ else: + score_df = pd.read_csv(score_fp, index_col=0) + model_set_in_scores = set(score_df.index) + model_set_in_folder = set( + f[:-3] for f in implementation.file_dict.keys() if re.match(r"^model_(?!test)\w+\.py$", f) + ) + for model in model_set_in_folder: + if model not in model_set_in_scores: + stdout += ( + f"\nModel {model} is not evaluated in the scores.csv. The scores.csv has {model_set_in_scores}." + ) + + # Check submission file + submission_fp = implementation.workspace_path / "submission.csv" + if not submission_fp.exists(): + stdout += "\nSubmission file (submission.csv) is not generated." + else: + check_code = (DIRNAME / "eval_tests" / "submission_check.txt").read_text() + implementation.inject_files(**{"submission_check.py": check_code}) + stdout += implementation.execute(env=de, entry="python submission_check.py") + + system_prompt = T(".prompts:workflow_eval.system").r( + scenario=self.scen.get_scenario_all_desc(), + task_desc=target_task.get_task_information(), + spec=implementation.file_dict["spec/workflow.md"], + ) + user_prompt = T(".prompts:workflow_eval.user").r( + stdout=stdout.strip(), + code=implementation.file_dict["main.py"], + ) + resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True) + return WorkflowSingleFeedback(**json.loads(resp)) diff --git a/rdagent/components/coder/data_science/workflow/eval_tests/submission_check.txt b/rdagent/components/coder/data_science/workflow/eval_tests/submission_check.txt new file mode 100644 index 000000000..abca1553f --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/eval_tests/submission_check.txt @@ -0,0 +1,18 @@ +import pandas as pd +from pathlib import Path + +# Check if the sample submission file exists +if not Path("/kaggle/input/sample_submission.csv").exists(): + exit(0) + +sample_submission = pd.read_csv('/kaggle/input/sample_submission.csv') +our_submission = pd.read_csv('submission.csv') + +success = True +for col in sample_submission.columns: + if col not in our_submission.columns: + success = False + print(f'Column {col} not found in submission.csv') + +if success: + print('submission.csv is valid.') \ No newline at end of file diff --git a/rdagent/components/coder/data_science/workflow/exp.py b/rdagent/components/coder/data_science/workflow/exp.py new file mode 100644 index 000000000..e49af8339 --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/exp.py @@ -0,0 +1,10 @@ +import pickle +import site +import traceback +from pathlib import Path +from typing import Dict, Optional + +from rdagent.components.coder.CoSTEER.task import CoSTEERTask +from rdagent.core.utils import cache_with_pickle + +WorkflowTask = CoSTEERTask diff --git a/rdagent/components/coder/data_science/workflow/prompts.yaml b/rdagent/components/coder/data_science/workflow/prompts.yaml new file mode 100644 index 000000000..fd41e8bd2 --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/prompts.yaml @@ -0,0 +1,115 @@ +workflow_coder: + system: |- + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. 
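To make the `scores.csv` convention that the evaluator above checks concrete, here is a hedged sketch of an `ens_and_decision` implementation. The mean ensemble and the `mean_absolute_error` stand-in metric are illustrative assumptions, not part of this change; only the model-name index of `scores.csv` is fixed by the evaluator.

```python
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error as calculate_metric  # stand-in for the competition metric


def ens_and_decision(test_preds_dict: dict, val_preds_dict: dict, val_label) -> np.ndarray:
    """Average model predictions and record per-model and ensemble scores in scores.csv."""
    scores = {}
    for model_name, val_pred in val_preds_dict.items():
        scores[model_name] = calculate_metric(val_label, val_pred)

    # Simple mean ensemble; a weighted average or voting scheme may suit other competitions better.
    val_ensemble = np.mean(list(val_preds_dict.values()), axis=0)
    scores["ensemble"] = calculate_metric(val_label, val_ensemble)

    # The workflow evaluator reads scores.csv with index_col=0 and compares the index
    # against the model file names, so the model names (plus an ensemble row) form the index.
    pd.Series(scores, name="score").rename_axis("Model").to_frame().to_csv("scores.csv")

    return np.mean(list(test_preds_dict.values()), axis=0)
```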
+ + Your specific task is as follows: + {{task_desc}} + + Your current competition information is as follows: + {{ competition_info }} + + The user has written different Python functions that can load and preprocess data, execute feature engineering, train models, and ensemble them. + + These Python codes with different functionalities are written separately in different Python files. + You don't need to edit the existing code. Your task is to integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow. + This workflow code is also a Python file, and it functions similarly to a main process that calls the sub-files for each step and ultimately outputs a prediction file. + + The user will also provide specifications on how to organize the code and give instructions. + + The code you implement should align with the framework given in the specifications. + After predicting the output, print the shape and other information of the output to stdout to help the evaluator assess the code. + + Please respond with the code in the following JSON format. Here is an example structure for the JSON output: + { + "code": "The Python code as a string." + } + + {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %} + -----------Here is the relevant information for this task----------- + {% endif %} + {% if queried_similar_successful_knowledge|length != 0 %} + --------------Successful Implementations for Similar Models:-------------- + ====={% for similar_successful_knowledge in queried_similar_successful_knowledge %} Model {{loop.index}}:===== + {{ similar_successful_knowledge.target_task.get_task_information() }} + =====Code:===== + {{ similar_successful_knowledge.implementation.file_dict["main.py"] }} + {% endfor %} + {% endif %} + + {% if queried_former_failed_knowledge|length != 0 %} + --------------Previous Failed Attempts:-------------- + {% for former_failed_knowledge in queried_former_failed_knowledge %} Attempt {{ loop.index }}: + =====Code:===== + {{ former_failed_knowledge.implementation.file_dict["main.py"] }} + =====Feedback:===== + {{ former_failed_knowledge.feedback }} + {% endfor %} + {% endif %} + + user: |- + ---------Workflow Specification--------- + {{ workflow_spec }} + + ---------load data code--------- + file: load_data.py + {{ load_data_code }} + + ---------feature engineering code--------- + file: feature.py + {{ feature_code }} + + ---------model training code--------- + Attention: The input and output of the model function is flexible. Training dataset is necessary, but validation and test dateset might be optional. The hyperparameters can either be passed as arguments or be set as default values in the function. You need to use the function correctly. + All model files share the same function name. Please import the model files with their name like: from {file_name} import {function_name} + {{ model_codes }} + + ---------ensemble code--------- + Note, we will check the index of the score.csv, so please use the model name as the index to feed into ensemble function. + file: ensemble.py + {{ ensemble_code }} + + {% if latest_code %} + ---------Former code--------- + {{ latest_code }} + {% if latest_code_feedback is not none %} + ---------Feedback to former code--------- + {{ latest_code_feedback }} + {% endif %} + The former code has some errors, you should write the correct code based on the former code. Avoid writing the same code to former code. 
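For orientation, here is a minimal sketch of the kind of `main.py` these workflow prompts ask for. The module names (`load_data`, `feature`, `model_nn`, `ensemble`), the 80/20 split, and the submission column names are assumptions for illustration, not fixed by the spec.

```python
# main.py -- integrates the generated components end to end.
import pandas as pd
from sklearn.model_selection import train_test_split

from load_data import load_data
from feature import feat_eng
from model_nn import model_workflow as model_nn_workflow
from ensemble import ens_and_decision

# Load the raw data (from /kaggle/input/) and run feature engineering exactly once,
# before any train/validation split.
X, y, X_test, test_ids = load_data()
X, y, X_test = feat_eng(X, y, X_test)

# Default 80/20 split; cross-validation may be more appropriate for some competitions.
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Collect predictions keyed by the model file name, as the ensemble step and the
# scores.csv check expect.
val_preds_dict, test_preds_dict = {}, {}
val_preds_dict["model_nn"], test_preds_dict["model_nn"], _ = model_nn_workflow(
    X=train_X, y=train_y, val_X=val_X, val_y=val_y, test_X=X_test
)

# Ensemble (which writes scores.csv) and produce the submission file.
final_pred = ens_and_decision(test_preds_dict, val_preds_dict, val_y)
pd.DataFrame({"id": test_ids, "prediction": final_pred}).to_csv("submission.csv", index=False)
print("submission rows:", len(final_pred))
```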
+ {% endif %} + +workflow_eval: + system: |- + You are a data scientist. + The user is trying to build a workflow in the following scenario: + {{ scenario }} + The main code generation task is as follows: + {{task_desc}} + The user will provide you with the information of the workflow and its components. + The information about how to build the workflow is given in the specification file as below: + {{ spec }} + This workflow will import all the codes including data loading, feature engineering, model tuning, and ensembling. + You are testing it by running the workflow code. The results will be collected as the stdout and it will help you evaluate the code. + + Your job is to evaluate the workflow code given by the user. You should be concerned about whether the code executes successfully, generates predictions correctly, and satisfies other requirements in the specification. + The components have already been evaluated by the user, so you only need to evaluate and improve the workflow code unless there are very serious issues with the components. + + Your evaluation should only consider whether the code executes successfully, generates well formatted predictions, and aligns with the target task. The performance of the model is not a concern in this task. + + Please respond with your feedback in the following JSON format and order: + ```json + { + "execution": "Describe whether the model executed successfully, including any errors or issues encountered. Please keep the error message and tracking information", + "return_checking": "Check the generated value, including whether the value is generated and comparing the shape of the model output with the requirement in the specification. You also need to check whether the hyperparameters used for retraining are correctly returned during the test execution of the model.", + "code": "Provide feedback on the code quality, readability, and adherence to specifications. 
Check whether the hyperparameters from the previous run are used in the model code, compare the parameter names in stdout and if they are used in the retraining part of the code.", + "final_decision": + } + ``` + user: |- + --------------Code generated by user:--------------- + {{ code }} + --------------stdoutput:--------------- + ''' + {{ stdout }} + ''' diff --git a/rdagent/components/coder/data_science/workflow/test.py b/rdagent/components/coder/data_science/workflow/test.py new file mode 100644 index 000000000..99b6cb3d4 --- /dev/null +++ b/rdagent/components/coder/data_science/workflow/test.py @@ -0,0 +1,59 @@ +""" +Generate dataset to test the workflow output +""" + +from pathlib import Path + +from rdagent.components.coder.CoSTEER.config import CoSTEER_SETTINGS +from rdagent.components.coder.data_science.workflow import WorkflowCoSTEER +from rdagent.components.coder.data_science.workflow.eval import ( + WorkflowGeneralCaseSpecEvaluator, +) +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask +from rdagent.core.experiment import FBWorkspace +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.scen import KaggleScen + + +def develop_one_competition(competition: str): + scen = KaggleScen(competition=competition) + workflow_coder = WorkflowCoSTEER(scen) + + wt = WorkflowTask( + name="WorkflowTask", + description="Integrate the existing processes of load_data, feature, model, and ensemble into a complete workflow.", + base_code="", + ) + + tpl_ex_path = Path(__file__).resolve() / Path("rdagent/scenarios/kaggle/tpl_ex").resolve() / competition + injected_file_names = ["spec/workflow.md", "load_data.py", "feature.py", "model01.py", "ensemble.py", "main.py"] + + workflowexp = FBWorkspace() + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + workflowexp.inject_files(**{file_name: file_path.read_text()}) + + wt.base_code += workflowexp.file_dict["main.py"] + exp = DSExperiment( + sub_tasks=[wt], + ) + + """es = WorkflowMultiProcessEvolvingStrategy(scen=scen, settings=CoSTEER_SETTINGS) + new_code = es.implement_one_task(target_task=wt, queried_knowledge=None, workspace = workflowexp) + print(new_code)""" + + """eva = WorkflowGeneralCaseSpecEvaluator(scen=scen) + exp.feedback = eva.evaluate(target_task=wt, queried_knowledge=None, implementation=workflowexp, gt_implementation=None) + print(exp.feedback)""" + + # Run the experiment + for file_name in injected_file_names: + file_path = tpl_ex_path / file_name + exp.experiment_workspace.inject_files(**{file_name: file_path.read_text()}) + + exp = workflow_coder.develop(exp) + + +if __name__ == "__main__": + develop_one_competition("aerial-cactus-identification") + # dotenv run -- python rdagent/components/coder/data_science/workflow/test.py diff --git a/rdagent/components/coder/factor_coder/eva_utils.py b/rdagent/components/coder/factor_coder/eva_utils.py index 48d7fe4f7..40f85bc7a 100644 --- a/rdagent/components/coder/factor_coder/eva_utils.py +++ b/rdagent/components/coder/factor_coder/eva_utils.py @@ -79,7 +79,7 @@ def evaluate( **kwargs, ): factor_information = target_task.get_task_information() - code = implementation.code + code = implementation.all_codes system_prompt = ( Environment(undefined=StrictUndefined) diff --git a/rdagent/components/coder/factor_coder/evaluators.py b/rdagent/components/coder/factor_coder/evaluators.py index c45a6c733..6b5b402d5 100644 --- a/rdagent/components/coder/factor_coder/evaluators.py +++ 
b/rdagent/components/coder/factor_coder/evaluators.py @@ -3,7 +3,7 @@ from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, - CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, ) from rdagent.components.coder.factor_coder.eva_utils import ( FactorCodeEvaluator, @@ -14,7 +14,7 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import Workspace -FactorSingleFeedback = CoSTEERSingleFeedback +FactorSingleFeedback = CoSTEERSingleFeedbackDeprecated FactorMultiFeedback = CoSTEERMultiFeedback diff --git a/rdagent/components/coder/factor_coder/evolving_strategy.py b/rdagent/components/coder/factor_coder/evolving_strategy.py index 5fbc81450..5ddc938d2 100644 --- a/rdagent/components/coder/factor_coder/evolving_strategy.py +++ b/rdagent/components/coder/factor_coder/evolving_strategy.py @@ -14,6 +14,7 @@ ) from rdagent.components.coder.factor_coder.config import FACTOR_COSTEER_SETTINGS from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace, FactorTask +from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend @@ -72,6 +73,7 @@ def implement_one_task( self, target_task: FactorTask, queried_knowledge: CoSTEERQueriedKnowledge, + workspace: FBWorkspace | None = None, ) -> str: target_factor_task_information = target_task.get_task_information() @@ -179,5 +181,5 @@ def assign_code_list_to_evo(self, code_list, evo): continue if evo.sub_workspace_list[index] is None: evo.sub_workspace_list[index] = FactorFBWorkspace(target_task=evo.sub_tasks[index]) - evo.sub_workspace_list[index].inject_code(**{"factor.py": code_list[index]}) + evo.sub_workspace_list[index].inject_files(**{"factor.py": code_list[index]}) return evo diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index edb7a9ce8..502901c90 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -34,12 +34,16 @@ def __init__( self.factor_name = ( factor_name # TODO: remove it in the later version. 
Keep it only for pickle version compatibility ) - self.factor_description = factor_description self.factor_formulation = factor_formulation self.variables = variables self.factor_resources = resource self.factor_implementation = factor_implementation - super().__init__(name=factor_name, *args, **kwargs) + super().__init__(name=factor_name, description=factor_description, *args, **kwargs) + + @property + def factor_description(self): + """for compatibility""" + return self.description def get_task_information(self): return f"""factor_name: {self.factor_name} @@ -88,8 +92,8 @@ def __init__( def hash_func(self, data_type: str = "Debug") -> str: return ( - md5_hash(data_type + self.code_dict["factor.py"]) - if ("factor.py" in self.code_dict and not self.raise_exception) + md5_hash(data_type + self.file_dict["factor.py"]) + if ("factor.py" in self.file_dict and not self.raise_exception) else None ) @@ -114,7 +118,7 @@ def execute(self, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]: """ super().execute() - if self.code_dict is None or "factor.py" not in self.code_dict: + if self.file_dict is None or "factor.py" not in self.file_dict: if self.raise_exception: raise CodeFormatError(self.FB_CODE_NOT_SET) else: diff --git a/rdagent/components/coder/factor_coder/prompts.yaml b/rdagent/components/coder/factor_coder/prompts.yaml index 94a0c02b3..a9836cffb 100644 --- a/rdagent/components/coder/factor_coder/prompts.yaml +++ b/rdagent/components/coder/factor_coder/prompts.yaml @@ -52,7 +52,7 @@ evolving_strategy_factor_implementation_v1_system: |- {% if queried_former_failed_knowledge|length != 0 %} --------------Your former latest attempt:--------------- =====Code to the former implementation===== - {{ queried_former_failed_knowledge[-1].implementation.code }} + {{ queried_former_failed_knowledge[-1].implementation.all_codes }} =====Feedback to the former implementation===== {{ queried_former_failed_knowledge[-1].feedback }} {% endif %} @@ -74,9 +74,9 @@ evolving_strategy_factor_implementation_v2_user: |- --------------Factor information to similar error ({{error_content}}):--------------- {{ similar_error_knowledge[0].target_task.get_task_information() }} =====Code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[0].implementation.code }} + {{ similar_error_knowledge[0].implementation.all_codes }} =====Success code to former code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[1].implementation.code }} + {{ similar_error_knowledge[1].implementation.all_codes }} {% endfor %} {% else %} Recall your last failure, your implementation met some errors. @@ -91,13 +91,13 @@ evolving_strategy_factor_implementation_v2_user: |- =====Factor {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.all_codes }} {% endfor %} {% endif %} {% if latest_attempt_to_latest_successful_execution is not none %} You have tried to correct your former failed code but still met some errors. 
Here is the latest attempt to the latest successful execution, try not to get the same error to your new code: =====Your latest attempt===== - {{ latest_attempt_to_latest_successful_execution.implementation.code }} + {{ latest_attempt_to_latest_successful_execution.implementation.all_codes }} =====Feedback to your latest attempt===== {{ latest_attempt_to_latest_successful_execution.feedback }} {% endif %} @@ -126,9 +126,9 @@ evolving_strategy_error_summary_v2_user: |- --------------Factor information to similar error ({{error_content}}):--------------- {{ similar_error_knowledge[0].target_task.get_task_information() }} =====Code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[0].implementation.code }} + {{ similar_error_knowledge[0].implementation.all_codes }} =====Success code to former code with similar error ({{error_content}}):===== - {{ similar_error_knowledge[1].implementation.code }} + {{ similar_error_knowledge[1].implementation.all_codes }} {% endfor %} {% endif %} @@ -158,7 +158,7 @@ select_implementable_factor_user: |- --------------Your former attempt:--------------- {% for former_attempt in factor_info[2] %} =====Code to attempt {{ loop.index }}===== - {{ former_attempt.implementation.code }} + {{ former_attempt.implementation.all_codes }} =====Feedback to attempt {{ loop.index }}===== {{ former_attempt.feedback }} {% endfor %} diff --git a/rdagent/components/coder/model_coder/eva_utils.py b/rdagent/components/coder/model_coder/eva_utils.py index 0a78d8a25..1d2bb88f7 100644 --- a/rdagent/components/coder/model_coder/eva_utils.py +++ b/rdagent/components/coder/model_coder/eva_utils.py @@ -15,6 +15,7 @@ evaluate_prompts = Prompts(file_path=Path(__file__).parent / "prompts.yaml") +# This shape evaluator is also used in data_science def shape_evaluator(prediction: np.ndarray, target_shape: Tuple = None) -> Tuple[str, bool]: if target_shape is None or prediction is None: return ( @@ -67,7 +68,7 @@ def evaluate( assert isinstance(gt_implementation, ModelFBWorkspace) model_task_information = target_task.get_task_information() - code = implementation.code + code = implementation.all_codes system_prompt = ( Environment(undefined=StrictUndefined) @@ -93,7 +94,7 @@ def evaluate( code=code, model_execution_feedback=execution_feedback_to_render, model_value_feedback=model_value_feedback, - gt_code=gt_implementation.code if gt_implementation else None, + gt_code=gt_implementation.all_codes if gt_implementation else None, ) ) if ( diff --git a/rdagent/components/coder/model_coder/evaluators.py b/rdagent/components/coder/model_coder/evaluators.py index a311ded81..170039ab4 100644 --- a/rdagent/components/coder/model_coder/evaluators.py +++ b/rdagent/components/coder/model_coder/evaluators.py @@ -1,7 +1,7 @@ from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, CoSTEERMultiFeedback, - CoSTEERSingleFeedback, + CoSTEERSingleFeedbackDeprecated, ) from rdagent.components.coder.model_coder.eva_utils import ( ModelCodeEvaluator, @@ -13,7 +13,7 @@ from rdagent.core.evolving_framework import QueriedKnowledge from rdagent.core.experiment import Task, Workspace -ModelSingleFeedback = CoSTEERSingleFeedback +ModelSingleFeedback = CoSTEERSingleFeedbackDeprecated ModelMultiFeedback = CoSTEERMultiFeedback diff --git a/rdagent/components/coder/model_coder/evolving_strategy.py b/rdagent/components/coder/model_coder/evolving_strategy.py index b980508f0..83b7afa3e 100644 --- a/rdagent/components/coder/model_coder/evolving_strategy.py +++ 
b/rdagent/components/coder/model_coder/evolving_strategy.py @@ -16,6 +16,7 @@ ModelFBWorkspace, ModelTask, ) +from rdagent.core.experiment import FBWorkspace from rdagent.core.prompts import Prompts from rdagent.oai.llm_conf import LLM_SETTINGS from rdagent.oai.llm_utils import APIBackend @@ -28,6 +29,7 @@ def implement_one_task( self, target_task: ModelTask, queried_knowledge: CoSTEERQueriedKnowledge = None, + workspace: FBWorkspace | None = None, ) -> str: model_information_str = target_task.get_task_information() @@ -102,5 +104,5 @@ def assign_code_list_to_evo(self, code_list, evo): continue if evo.sub_workspace_list[index] is None: evo.sub_workspace_list[index] = ModelFBWorkspace(target_task=evo.sub_tasks[index]) - evo.sub_workspace_list[index].inject_code(**{"model.py": code_list[index]}) + evo.sub_workspace_list[index].inject_files(**{"model.py": code_list[index]}) return evo diff --git a/rdagent/components/coder/model_coder/model.py b/rdagent/components/coder/model_coder/model.py index b5a3e3ac5..4db6a8a3f 100644 --- a/rdagent/components/coder/model_coder/model.py +++ b/rdagent/components/coder/model_coder/model.py @@ -24,7 +24,6 @@ def __init__( model_type: Optional[str] = None, **kwargs, ) -> None: - self.description: str = description self.formulation: str = formulation self.architecture: str = architecture self.variables: str = variables @@ -32,7 +31,7 @@ def __init__( self.model_type: str = ( model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model ) - super().__init__(name=name, *args, **kwargs) + super().__init__(name=name, description=description, *args, **kwargs) def get_task_information(self): task_desc = f"""name: {self.name} @@ -84,8 +83,8 @@ def hash_func( param_init_value: float = 1.0, ) -> str: target_file_name = f"{batch_size}_{num_features}_{num_timesteps}_{input_value}_{param_init_value}" - for code_file_name in sorted(list(self.code_dict.keys())): - target_file_name = f"{target_file_name}_{self.code_dict[code_file_name]}" + for code_file_name in sorted(list(self.file_dict.keys())): + target_file_name = f"{target_file_name}_{self.file_dict[code_file_name]}" return md5_hash(target_file_name) @cache_with_pickle(hash_func) @@ -124,7 +123,7 @@ def execute( env={}, code_dump_file_py_name="model_test", ) - if results is None: + if len(results) == 0: raise RuntimeError(f"Error in running the model code: {log}") [execution_feedback_str, execution_model_output] = results diff --git a/rdagent/components/coder/model_coder/one_shot/__init__.py b/rdagent/components/coder/model_coder/one_shot/__init__.py index 7f7fa83e6..29c9ff921 100644 --- a/rdagent/components/coder/model_coder/one_shot/__init__.py +++ b/rdagent/components/coder/model_coder/one_shot/__init__.py @@ -35,7 +35,7 @@ def develop(self, exp: ModelExperiment) -> ModelExperiment: # Extract the code part from the response match = re.search(r".*```[Pp]ython\n(.*)\n```.*", resp, re.DOTALL) code = match.group(1) - mti.inject_code(**{"model.py": code}) + mti.inject_files(**{"model.py": code}) mti_l.append(mti) exp.sub_workspace_list = mti_l return exp diff --git a/rdagent/components/coder/model_coder/prompts.yaml b/rdagent/components/coder/model_coder/prompts.yaml index 8742bd26b..126c986af 100644 --- a/rdagent/components/coder/model_coder/prompts.yaml +++ b/rdagent/components/coder/model_coder/prompts.yaml @@ -65,7 +65,7 @@ evolving_strategy_model_coder: {% if queried_former_failed_knowledge|length != 0 %} --------------Your former latest 
attempt:--------------- =====Code to the former implementation===== - {{ queried_former_failed_knowledge[-1].implementation.code }} + {{ queried_former_failed_knowledge[-1].implementation.all_codes }} =====Feedback to the former implementation===== {{ queried_former_failed_knowledge[-1].feedback }} {% endif %} @@ -85,7 +85,7 @@ evolving_strategy_model_coder: =====Model {{loop.index}}:===== {{ similar_successful_knowledge.target_task.get_task_information() }} =====Code:===== - {{ similar_successful_knowledge.implementation.code }} + {{ similar_successful_knowledge.implementation.all_codes }} {% endfor %} {% endif %} @@ -93,7 +93,7 @@ evolving_strategy_model_coder: --------------Former failed code:--------------- {% for former_failed_knowledge in queried_former_failed_knowledge %} =====Code to implementation {{ loop.index }}===== - {{ former_failed_knowledge.implementation.code }} + {{ former_failed_knowledge.implementation.all_codes }} =====Feedback to implementation {{ loop.index }}===== {{ former_failed_knowledge.feedback }} {% endfor %} diff --git a/rdagent/components/loader/task_loader.py b/rdagent/components/loader/task_loader.py index a3b344504..3ef807854 100644 --- a/rdagent/components/loader/task_loader.py +++ b/rdagent/components/loader/task_loader.py @@ -90,5 +90,5 @@ def load(self, task: ModelTask) -> ModelFBWorkspace: mti.prepare() with open(self.path / f"{task.name}.py", "r") as f: code = f.read() - mti.inject_code(**{"model.py": code}) + mti.inject_files(**{"model.py": code}) return mti diff --git a/rdagent/components/proposal/__init__.py b/rdagent/components/proposal/__init__.py index 305df2196..51980766f 100644 --- a/rdagent/components/proposal/__init__.py +++ b/rdagent/components/proposal/__init__.py @@ -82,7 +82,7 @@ class LLMHypothesis2Experiment(Hypothesis2Experiment[Experiment]): def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]: ... @abstractmethod - def convert_response(self, response: str, trace: Trace) -> Experiment: ... + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> Experiment: ... 
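To illustrate the widened `convert_response` contract (the hypothesis is now threaded through to the conversion step), here is a minimal hypothetical subclass; the class name, the prompt context, and the choice of `DSExperiment` are illustrative only.

```python
from typing import Tuple

from rdagent.components.proposal import LLMHypothesis2Experiment
from rdagent.core.proposal import Hypothesis, Trace
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment


class DummyHypothesis2Experiment(LLMHypothesis2Experiment):
    """Illustrative subclass showing the new three-argument convert_response."""

    def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]:
        # The second element toggles json_mode for the LLM call inside convert().
        return {"hypothesis": str(hypothesis)}, True

    def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> DSExperiment:
        # With the hypothesis passed through, the generated experiment can keep a
        # back-reference to it (Experiment now optionally carries its hypothesis).
        exp = DSExperiment(sub_tasks=[])
        exp.hypothesis = hypothesis
        return exp
```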
def convert(self, hypothesis: Hypothesis, trace: Trace) -> Experiment: context, json_flag = self.prepare_context(hypothesis, trace) @@ -109,7 +109,7 @@ def convert(self, hypothesis: Hypothesis, trace: Trace) -> Experiment: resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=json_flag) - return self.convert_response(resp, trace) + return self.convert_response(resp, hypothesis, trace) class FactorHypothesis2Experiment(LLMHypothesis2Experiment): diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py index a331afd7d..11ef2ba4b 100644 --- a/rdagent/components/workflow/rd_loop.py +++ b/rdagent/components/workflow/rd_loop.py @@ -3,26 +3,26 @@ It is from `rdagent/app/qlib_rd_loop/model.py` and try to replace `rdagent/app/qlib_rd_loop/RDAgent.py` """ -import time from typing import Any from rdagent.components.workflow.conf import BasePropSetting from rdagent.core.developer import Developer from rdagent.core.proposal import ( + Experiment2Feedback, + Hypothesis, Hypothesis2Experiment, - HypothesisExperiment2Feedback, + HypothesisFeedback, HypothesisGen, Trace, ) from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger -from rdagent.log.time import measure_time from rdagent.utils.workflow import LoopBase, LoopMeta class RDLoop(LoopBase, metaclass=LoopMeta): - @measure_time + def __init__(self, PROP_SETTING: BasePropSetting): with logger.tag("init"): scen: Scenario = import_class(PROP_SETTING.scen)() @@ -39,42 +39,54 @@ def __init__(self, PROP_SETTING: BasePropSetting): self.runner: Developer = import_class(PROP_SETTING.runner)(scen) logger.log_object(self.runner, tag="runner") - self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) + self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) logger.log_object(self.summarizer, tag="summarizer") self.trace = Trace(scen=scen) super().__init__() - @measure_time - def propose(self, prev_out: dict[str, Any]): - with logger.tag("r"): # research - hypothesis = self.hypothesis_gen.gen(self.trace) - logger.log_object(hypothesis, tag="hypothesis generation") + # excluded steps + def _propose(self): + hypothesis = self.hypothesis_gen.gen(self.trace) + logger.log_object(hypothesis, tag="hypothesis generation") return hypothesis - @measure_time - def exp_gen(self, prev_out: dict[str, Any]): - with logger.tag("r"): # research - exp = self.hypothesis2experiment.convert(prev_out["propose"], self.trace) - logger.log_object(exp.sub_tasks, tag="experiment generation") + def _exp_gen(self, hypothesis: Hypothesis): + exp = self.hypothesis2experiment.convert(hypothesis, self.trace) + logger.log_object(exp.sub_tasks, tag="experiment generation") return exp - @measure_time + # included steps + def direct_exp_gen(self, prev_out: dict[str, Any]): + with logger.tag("r"): # research + hypo = self._propose() + exp = self._exp_gen(hypo) + return {"propose": hypo, "exp_gen": exp} + def coding(self, prev_out: dict[str, Any]): with logger.tag("d"): # develop - exp = self.coder.develop(prev_out["exp_gen"]) + exp = self.coder.develop(prev_out["direct_exp_gen"]["exp_gen"]) logger.log_object(exp.sub_workspace_list, tag="coder result") return exp - @measure_time def running(self, prev_out: dict[str, Any]): with logger.tag("ef"): # evaluate and feedback exp = self.runner.develop(prev_out["coding"]) logger.log_object(exp, tag="runner result") return exp - 
@measure_time def feedback(self, prev_out: dict[str, Any]): - feedback = self.summarizer.generate_feedback(prev_out["running"], prev_out["propose"], self.trace) - with logger.tag("ef"): # evaluate and feedback - logger.log_object(feedback, tag="feedback") - self.trace.hist.append((prev_out["propose"], prev_out["running"], feedback)) + e = prev_out.get(self.EXCEPTION_KEY, None) + if e is not None: + feedback = HypothesisFeedback( + observations="Error occurred in loop, skip this loop", + hypothesis_evaluation="", + new_hypothesis="", + reason="", + decision=False, + ) + self.trace.hist.append((prev_out["direct_exp_gen"]["exp_gen"], feedback)) + else: + feedback = self.summarizer.generate_feedback(prev_out["running"], self.trace) + with logger.tag("ef"): # evaluate and feedback + logger.log_object(feedback, tag="feedback") + self.trace.hist.append((prev_out["running"], feedback)) diff --git a/rdagent/core/evaluation.py b/rdagent/core/evaluation.py index fae07a96a..e8720a8d4 100644 --- a/rdagent/core/evaluation.py +++ b/rdagent/core/evaluation.py @@ -1,14 +1,33 @@ +import typing from abc import ABC, abstractmethod -from rdagent.core.experiment import Task, Workspace from rdagent.core.scenario import Scenario +if typing.TYPE_CHECKING: + from rdagent.core.experiment import Task, Workspace + class Feedback: - pass + """ + Design Principle: + It will be more like a **dataclass**. + The building process of feedback will should be in evaluator + """ + + def __bool__(self) -> bool: + return True class Evaluator(ABC): + """ + Design Principle: + + It should cover the building process of feedback from raw information. + Typically the buiilding of feedback will be two phases. + 1. raw information including stdout & workspace (feeedback itself will handle this) + 2. advanced/summaried feedback information. (evaluate will handle this) + """ + def __init__( self, scen: Scenario, @@ -18,9 +37,9 @@ def __init__( @abstractmethod def evaluate( self, - target_task: Task, - implementation: Workspace, - gt_implementation: Workspace, + target_task: "Task", + implementation: "Workspace", + gt_implementation: "Workspace", **kwargs: object, - ) -> None: + ) -> Feedback: raise NotImplementedError diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py index e196d0c0a..eded60b36 100644 --- a/rdagent/core/evolving_agent.py +++ b/rdagent/core/evolving_agent.py @@ -31,7 +31,7 @@ def multistep_evolve( def filter_evolvable_subjects_by_feedback( self, evo: EvolvableSubjects, - feedback: Feedback | None, + feedback: Feedback | list[Feedback] | None, ) -> EvolvableSubjects: ... @@ -59,6 +59,7 @@ def multistep_evolve( filter_final_evo: bool = False, ) -> EvolvableSubjects: for _ in tqdm(range(self.max_loop), "Implementing"): + # with logger.tag(f"evo_loop_{evo_loop_id}"): # 1. knowledge self-evolving if self.knowledge_self_gen and self.rag is not None: self.rag.generate_knowledge(self.evolving_trace) @@ -95,6 +96,14 @@ def multistep_evolve( # 6. update trace self.evolving_trace.append(es) + + # 7. 
check if all tasks are completed + if self.with_feedback: + all_completed = all(es.feedback) if isinstance(es.feedback, list) else es.feedback + if all_completed: + logger.info("All tasks in evolving subject have been completed.") + break + if self.with_feedback and filter_final_evo: evo = self.filter_evolvable_subjects_by_feedback(evo, self.evolving_trace[-1].feedback) return evo diff --git a/rdagent/core/evolving_framework.py b/rdagent/core/evolving_framework.py index 24c7c6ae7..9ff874482 100644 --- a/rdagent/core/evolving_framework.py +++ b/rdagent/core/evolving_framework.py @@ -52,7 +52,7 @@ class EvoStep: evolvable_subjects: EvolvableSubjects queried_knowledge: QueriedKnowledge | None = None - feedback: Feedback | None = None + feedback: Feedback | list[Feedback] | None = None class EvolvingStrategy(ABC): diff --git a/rdagent/core/exception.py b/rdagent/core/exception.py index 2167ab9dc..f24156339 100644 --- a/rdagent/core/exception.py +++ b/rdagent/core/exception.py @@ -7,6 +7,8 @@ class CoderError(Exception): The more detailed evaluation in dataframe values are managed by the evaluator. """ + # NOTE: it corresponds to the error of **component** + class CodeFormatError(CoderError): """ @@ -26,11 +28,13 @@ class NoOutputError(CoderError): """ -class CustomRunnerError(Exception): +class RunnerError(Exception): """ Exceptions raised when running the code output. """ + # NOTE: it corresponds to the error of whole **project** + class FactorEmptyError(Exception): """ diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index 2fda6af97..231ac6740 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -2,7 +2,9 @@ import os import platform +import re import shutil +import typing import uuid from abc import ABC, abstractmethod from collections.abc import Sequence @@ -11,13 +13,18 @@ from typing import Any, Generic, TypeVar from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.utils import filter_progress_bar + +if typing.TYPE_CHECKING: + from rdagent.core.proposal import Hypothesis + from rdagent.utils.env import Env """ This file contains the all the class about organizing the task in RD-Agent. """ -class Task(ABC): +class AbsTask(ABC): def __init__(self, name: str, version: int = 1) -> None: """ The version of the task, default is 1 @@ -34,6 +41,18 @@ def get_task_information(self) -> str: """ +class Task(AbsTask): + def __init__(self, name: str, version: int = 1, description: str = "") -> None: + super().__init__(name, version) + self.description = description + + def get_task_information(self) -> str: + return f"Task Name: {self.name}\nDescription: {self.description}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} {self.name}>" + + ASpecificTask = TypeVar("ASpecificTask", bound=Task) @@ -56,6 +75,13 @@ def copy(self) -> Workspace: error_message = "copy method is not implemented." raise NotImplementedError(error_message) + @property + @abstractmethod + def all_codes(self) -> str: + """ + Get all the code files in the workspace as a single string. 
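Since `code_dict`/`inject_code` are renamed to `file_dict`/`inject_files` throughout this change, a small usage sketch of the new workspace surface may help; the file names and contents below are illustrative, while `all_codes`, `get_codes`, and the `__DEL__` sentinel follow the `FBWorkspace` code introduced here.

```python
from rdagent.core.experiment import FBWorkspace

ws = FBWorkspace()

# Write (or overwrite) files in the workspace and mirror them in ws.file_dict.
ws.inject_files(**{
    "model_nn.py": "def model_workflow(X, y, **kwargs):\n    return None, None, {}\n",
    "model_test.py": "print('test file')\n",
})

# all_codes concatenates the .py files while skipping anything with 'test' in the name.
print(ws.all_codes)

# get_codes narrows the view with a regex, e.g. only the non-test model files.
print(ws.get_codes(r"^model_(?!test)\w+\.py$"))

# The special "__DEL__" value removes a file from disk and from file_dict.
ws.inject_files(**{"model_test.py": FBWorkspace.DEL_KEY})
```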
+ """ + ASpecificWS = TypeVar("ASpecificWS", bound=Workspace) @@ -85,26 +111,45 @@ class FBWorkspace(Workspace): def run_pipeline(self, **files: str): self.prepare() - self.inject_code(**files) + self.inject_files(**files) self.execute() """ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - self.code_dict: dict[str, Any] = {} - self.code_dict = ( + self.file_dict: dict[str, Any] = ( {} ) # The code injected into the folder, store them in the variable to reproduce the former result self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex - @property - def code(self) -> str: + @staticmethod + def _format_code_dict(code_dict: dict[str, str]) -> str: + """ + Helper function to format the code dictionary into a string. + """ code_string = "" - for file_name, code in self.code_dict.items(): - code_string += f"File: {file_name}\n{code}\n" + for file_name, code in code_dict.items(): + code_string += f"\nFile Path: {file_name}\n```\n{code}\n```" return code_string + @property + def all_codes(self) -> str: + """ + Get all the code files in the workspace as a single string, excluding test files. + """ + filtered_dict = {k: v for k, v in self.file_dict.items() if k.endswith(".py") and "test" not in k} + return self._format_code_dict(filtered_dict) + + def get_codes(self, pattern: str) -> str: + """ + Get code files matching a specific pattern as a single string, excluding test files. + """ + filtered_dict = { + k: v for k, v in self.file_dict.items() if re.search(pattern, k) and k.endswith(".py") and "test" not in k + } + return self._format_code_dict(filtered_dict) + def prepare(self) -> None: """ Prepare the workspace except the injected code @@ -128,21 +173,29 @@ def link_all_files_in_folder_to_workspace(data_path: Path, workspace_path: Path) if platform.system() == "Windows": os.link(data_file_path, workspace_data_file_path) - def inject_code(self, **files: str) -> None: + DEL_KEY = "__DEL__" + + def inject_files(self, **files: str) -> None: """ Inject the code into the folder. { - : + : , // indicate writing into + (create new file or replace existing file) + : "__DEL__" // indicate removing file name2. 
When we want to replace a file to a new one, + we usually use this } """ self.prepare() for k, v in files.items(): - self.code_dict[k] = v - target_file_path = self.workspace_path / k - if not target_file_path.parent.exists(): + target_file_path = self.workspace_path / k # Define target_file_path before using it + if v == self.DEL_KEY: # Use self.DEL_KEY to access the class variable + if target_file_path.exists(): + target_file_path.unlink() # Unlink the file if it exists + self.file_dict.pop(k, None) # Safely remove the key from file_dict + else: + self.file_dict[k] = v target_file_path.parent.mkdir(parents=True, exist_ok=True) - with Path.open(self.workspace_path / k, "w") as f: - f.write(v) + target_file_path.write_text(v) def get_files(self) -> list[Path]: """ @@ -160,7 +213,7 @@ def inject_code_from_folder(self, folder_path: Path) -> None: for file_path in folder_path.rglob("*"): if file_path.suffix in (".py", ".yaml", ".md"): relative_path = file_path.relative_to(folder_path) - self.inject_code(**{str(relative_path): file_path.read_text()}) + self.inject_files(**{str(relative_path): file_path.read_text()}) def copy(self) -> FBWorkspace: """ @@ -173,14 +226,17 @@ def clear(self) -> None: Clear the workspace """ shutil.rmtree(self.workspace_path, ignore_errors=True) - self.code_dict = {} + self.file_dict = {} - def execute(self) -> object | None: + def execute(self, env: Env | None = None, entry: str | None = None) -> object | None: """ Before each execution, make sure to prepare and inject code """ self.prepare() - self.inject_code(**self.code_dict) + self.inject_files(**self.file_dict) + # TODO: env should be not None in new design (no code can run without environment) + if env is not None and entry is not None: + return filter_progress_bar(env.run(entry, str(self.workspace_path))) return None def __str__(self) -> str: @@ -205,12 +261,20 @@ def __init__( self, sub_tasks: Sequence[ASpecificTask], based_experiments: Sequence[ASpecificWSForExperiment] = [], + hypothesis: Hypothesis | None = None, ) -> None: + self.hypothesis: Hypothesis | None = hypothesis # Experiment is optionally generated by hypothesis self.sub_tasks: Sequence[ASpecificTask] = sub_tasks self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks) + # TODO: + # It will be used in runner in history + # If we implement the whole workflow, we don't have to use it, then we remove it. self.based_experiments: Sequence[ASpecificWSForExperiment] = based_experiments + self.result: object = None # The result of the experiment, can be different types in different scenarios. - self.sub_results: dict[str, float] = {} + self.sub_results: dict[str, float] = ( + {} + ) # TODO: in Kaggle, now sub results are all saved in self.result, remove this in the future. self.experiment_workspace: ASpecificWSForExperiment | None = None diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index a420bb311..0739dc82c 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -57,7 +57,34 @@ def __str__(self) -> str: # Origin(path of repo/data/feedback) => view/summarization => generated Hypothesis -class HypothesisFeedback(Feedback): +class ExperimentFeedback(Feedback): + def __init__( + self, + decision: bool, + reason: str, + exception: Exception | None = None, + ) -> None: + self.decision = decision + self.reason = reason + self.exception: Exception | None = ( + exception # if the experiment raises exception, it will be integrated into part of the feedback. 
+ ) + + def __bool__(self) -> bool: + return self.decision + + def __str__(self) -> str: + return f"Decision: {self.decision}\nReason: {self.reason}" + + @classmethod + def from_exception(cls, e: Exception) -> ExperimentFeedback: + """ + A convenient method to create Feedback from an exception. + """ + return cls(decision=False, reason=f"The experiment fails due to {e!s}", exception=e) + + +class HypothesisFeedback(ExperimentFeedback): def __init__( self, observations: str, @@ -66,21 +93,16 @@ def __init__( reason: str, decision: bool, ) -> None: + super().__init__(decision, reason) self.observations = observations self.hypothesis_evaluation = hypothesis_evaluation self.new_hypothesis = new_hypothesis - self.reason = reason - self.decision = decision - - def __bool__(self) -> bool: - return self.decision def __str__(self) -> str: - return f"""Observations: {self.observations} + return f"""{super().__str__()} +Observations: {self.observations} Hypothesis Evaluation: {self.hypothesis_evaluation} -New Hypothesis: {self.new_hypothesis} -Decision: {self.decision} -Reason: {self.reason}""" +New Hypothesis: {self.new_hypothesis}""" ASpecificScen = TypeVar("ASpecificScen", bound=Scenario) @@ -90,19 +112,41 @@ def __str__(self) -> str: class Trace(Generic[ASpecificScen, ASpecificKB]): def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None: self.scen: ASpecificScen = scen - self.hist: list[tuple[Hypothesis, Experiment, HypothesisFeedback]] = [] + self.hist: list[tuple[Experiment, ExperimentFeedback]] = [] + # TODO: self.hist is 2-tuple now, remove hypothesis from it, change old code for this later. self.knowledge_base: ASpecificKB | None = knowledge_base def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]: """Access the last experiment result, sub-task, and the corresponding hypothesis.""" # TODO: The return value does not align with the signature. - for hypothesis, experiment, feedback in self.hist[::-1]: + for experiment, feedback in self.hist[::-1]: if feedback.decision: - return hypothesis, experiment + return experiment.hypothesis, experiment return None, None +class ExpGen(ABC): + + def __init__(self, scen: Scenario) -> None: + self.scen = scen + + @abstractmethod + def gen(self, trace: Trace) -> Experiment: + """ + Generate the experiment based on the trace. + + `ExpGen().gen()` play a role like + + .. code-block:: python + + # ExpGen().gen() == + Hypothesis2Experiment().convert( + HypothesisGen().gen(trace) + ) + """ + + class HypothesisGen(ABC): # NOTE: the design is a little wierd # - Sometimes we want accurate access the prompts in a specific level @@ -141,7 +185,7 @@ def convert(self, hypothesis: Hypothesis, trace: Trace) -> ASpecificExp: # Boolean, Reason, Confidence, etc. -class HypothesisExperiment2Feedback(ABC): +class Experiment2Feedback(ABC): """ "Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances""" @@ -149,7 +193,7 @@ def __init__(self, scen: Scenario) -> None: self.scen = scen @abstractmethod - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> ExperimentFeedback: """ The `exp` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). 
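The refactored feedback API above (ExperimentFeedback with from_exception, HypothesisFeedback subclassing it, and Trace.hist holding 2-tuples) can be pictured with a small sketch. This is an illustrative example rather than code from the patch; the record_result helper and the exp object are hypothetical stand-ins.

from rdagent.core.proposal import ExperimentFeedback, HypothesisFeedback, Trace

def record_result(trace: Trace, exp, error: Exception | None) -> None:
    # Hypothetical helper: store an experiment and its feedback on the trace.
    if error is not None:
        # from_exception() builds a failing feedback and keeps the exception for later inspection.
        fb = ExperimentFeedback.from_exception(error)
    else:
        fb = HypothesisFeedback(
            observations="validation metric improved",
            hypothesis_evaluation="supported by the result",
            new_hypothesis="try a deeper model next",
            reason="score increased over the previous round",
            decision=True,
        )
    # Trace.hist is now a list of (Experiment, ExperimentFeedback) 2-tuples.
    trace.hist.append((exp, fb))
    if fb:  # __bool__ returns the decision flag
        print("experiment accepted")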
diff --git a/rdagent/core/scenario.py b/rdagent/core/scenario.py index 3a384a85d..a9ff6b83f 100644 --- a/rdagent/core/scenario.py +++ b/rdagent/core/scenario.py @@ -4,6 +4,11 @@ class Scenario(ABC): + """ + We should include scenario information here. The following information should not be included: + - method-related information (e.g. rag... config for a concrete module) + """ + @property @abstractmethod def background(self) -> str: @@ -25,20 +30,9 @@ def source_data(self) -> str: """ return self.get_source_data_desc() - @property - @abstractmethod - def interface(self) -> str: - """Interface description about how to run the code""" - - @property - @abstractmethod - def output_format(self) -> str: - """Output format description""" - - @property - @abstractmethod - def simulator(self) -> str: - """Simulator description""" + # NOTE: we should keep the interface simpler, so some previous interfaces are deleted. + # If we need some specific function that is only used in a subclass (no external usage), + # we should not put it in the base class. @property @abstractmethod diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py index be0a15cf8..45fff015d 100644 --- a/rdagent/core/utils.py +++ b/rdagent/core/utils.py @@ -49,7 +49,7 @@ def __reduce__(self) -> NoReturn: NOTE: When loading an object from a pickle, the __new__ method does not receive the `kwargs` it was initialized with. This makes it difficult to retrieve the correct singleton object. - Therefore, we have made it unpickable. + Therefore, we have made it unpicklable. """ msg = f"Instances of {self.__class__.__name__} cannot be pickled" raise pickle.PicklingError(msg) @@ -69,7 +69,7 @@ def similarity(text1: str, text2: str) -> int: text2 = text2 if isinstance(text2, str) else "" # Maybe we can use other similarity algorithm such as tfidf - return cast(int, fuzz.ratio(text1, text2)) # mypy does not reguard it as int + return cast(int, fuzz.ratio(text1, text2)) # mypy does not regard it as int def import_class(class_path: str) -> Any: @@ -127,7 +127,7 @@ def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) -> It will not call multiprocessing if `n=1` NOTE: - We coooperate with chat_cache_seed feature + We cooperate with chat_cache_seed feature We ensure get the same seed trace even we have multiple number of seed Parameters diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py index 87caf32c6..11905eb40 100644 --- a/rdagent/log/logger.py +++ b/rdagent/log/logger.py @@ -1,4 +1,6 @@ +import json import os +import pickle import sys from contextlib import contextmanager from datetime import datetime, timezone @@ -113,6 +115,21 @@ def log_object(self, obj: object, *, tag: str = "") -> None: caller_info = get_caller_info() tag = f"{self._tag}.{tag}.{self.get_pids()}".strip(".") + # FIXME: it looks like a hack... We should redesign it...
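# Objects logged with a "debug_" tag are accumulated into a single debug_llm.pkl under the trace
# directory (load the existing list, append, rewrite) and then skip the normal per-tag storage below.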
+ if "debug_" in tag: + debug_log_path = self.log_trace_path / "debug_llm.pkl" + debug_data = {"tag": tag, "obj": obj} + if debug_log_path.exists(): + with debug_log_path.open("rb") as f: + existing_data = pickle.load(f) + existing_data.append(debug_data) + with debug_log_path.open("wb") as f: + pickle.dump(existing_data, f) + else: + with debug_log_path.open("wb") as f: + pickle.dump([debug_data], f) + return + logp = self.storage.log(obj, name=tag, save_type="pkl") file_handler_id = logger.add( diff --git a/rdagent/log/storage.py b/rdagent/log/storage.py index 0411a5bad..acf87606b 100644 --- a/rdagent/log/storage.py +++ b/rdagent/log/storage.py @@ -100,6 +100,8 @@ def iter_msg(self, watch: bool = False) -> Generator[Message, None, None]: msg_l.append(m) for file in self.path.glob("**/*.pkl"): + if file.name == "debug_llm.pkl": + continue tag = ".".join(file.relative_to(self.path).as_posix().replace("/", ".").split(".")[:-3]) pid = file.parent.name diff --git a/rdagent/log/time.py b/rdagent/log/time.py deleted file mode 100644 index 27b1b0db1..000000000 --- a/rdagent/log/time.py +++ /dev/null @@ -1,19 +0,0 @@ -import time -from functools import wraps - -from rdagent.log import rdagent_logger as logger - - -def measure_time(method): - @wraps(method) - def timed(*args, **kwargs): - start_time = time.time() - result = method(*args, **kwargs) - end_time = time.time() - duration = end_time - start_time - method_name = method.__name__ - # logger.log_object(f"{method_name} took {duration:.2f} sec") - logger.info(f"{method_name} took {duration:.2f} sec") - return result - - return timed diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py index 77faae4e9..04f49ed24 100644 --- a/rdagent/log/ui/app.py +++ b/rdagent/log/ui/app.py @@ -357,7 +357,7 @@ def hypothesis_hover_text(h: Hypothesis, d: bool = False): hover_texts = [ hypothesis_hover_text(state.hypotheses[int(i[6:])], state.h_decisions[int(i[6:])]) for i in df.index - if i != "alpha158" + if i != "alpha158" and i != "Baseline" ] if state.alpha158_metrics is not None: hover_texts = ["Baseline: alpha158"] + hover_texts @@ -457,7 +457,7 @@ def summary_window(): for j, w in enumerate(ws): with wtabs[j]: # Evolving Code - for k, v in w.code_dict.items(): + for k, v in w.file_dict.items(): with st.expander(f":green[`{k}`]", expanded=False): st.code(v, language="python") @@ -652,7 +652,7 @@ def evolving_window(): with wtabs[j]: # Evolving Code st.markdown(f"**Workspace Path**: {w.workspace_path}") - for k, v in w.code_dict.items(): + for k, v in w.file_dict.items(): with st.expander(f":green[`{k}`]", expanded=True): st.code(v, language="python") diff --git a/rdagent/log/ui/llm_st.py b/rdagent/log/ui/llm_st.py new file mode 100644 index 000000000..9c05441c0 --- /dev/null +++ b/rdagent/log/ui/llm_st.py @@ -0,0 +1,253 @@ +import argparse +import json +import pickle +import re +import time +from pathlib import Path + +import streamlit as st +from streamlit import session_state + +st.set_page_config(layout="wide", page_title="debug_llm", page_icon="🎓", initial_sidebar_state="expanded") + +# 获取 log_path 参数 +parser = argparse.ArgumentParser(description="RD-Agent Streamlit App") +parser.add_argument("--log_dir", type=str, help="Path to the log directory") +args = parser.parse_args() + + +@st.cache_data +def get_folders_sorted(log_path): + """缓存并返回排序后的文件夹列表,并加入进度打印""" + with st.spinner("正在加载文件夹列表..."): + folders = sorted( + (folder for folder in log_path.iterdir() if folder.is_dir() and list(folder.iterdir())), + key=lambda folder: 
folder.stat().st_mtime, + reverse=True, + ) + st.write(f"找到 {len(folders)} 个文件夹") + return [folder.name for folder in folders] + + +# 设置主日志路径 +main_log_path = Path(args.log_dir) if args.log_dir else Path("./log") +if not main_log_path.exists(): + st.error(f"Log dir {main_log_path} does not exist!") + st.stop() + +if "data" not in session_state: + session_state.data = [] +if "log_path" not in session_state: + session_state.log_path = None + +tlist = [] + + +def load_data(): + """加载数据到 session_state 并显示进度""" + log_file = main_log_path / session_state.log_path / "debug_llm.pkl" + try: + with st.spinner(f"正在加载数据文件 {log_file}..."): + start_time = time.time() + with open(log_file, "rb") as f: + session_state.data = pickle.load(f) + st.success(f"数据加载完成!耗时 {time.time() - start_time:.2f} 秒") + st.session_state["current_loop"] = 1 + except Exception as e: + session_state.data = [{"error": str(e)}] + st.error(f"加载数据失败: {e}") + + +# UI - Sidebar +with st.sidebar: + st.markdown(":blue[**Log Path**]") + manually = st.toggle("Manual Input") + if manually: + st.text_input("log path", key="log_path", label_visibility="collapsed") + else: + folders = get_folders_sorted(main_log_path) + st.selectbox(f"**Select from {main_log_path.absolute()}**", folders, key="log_path") + + if st.button("Refresh Data"): + load_data() + st.rerun() + + expand_all = st.toggle("Expand All", key="expand_all") + + +# Helper functions +def show_text(text, lang=None): + """显示文本代码块""" + if lang: + st.code(text, language=lang, wrap_lines=True) + elif "\n" in text: + st.code(text, language="python", wrap_lines=True) + else: + st.code(text, language="html", wrap_lines=True) + + +def highlight_prompts_uri(uri): + """高亮 URI 的格式""" + parts = uri.split(":") + return f"**{parts[0]}:**:green[**{parts[1]}**]" + + +def extract_loopid_func_name(tag): + """提取 Loop ID 和函数名称""" + match = re.search(r"Loop_(\d+)\.(\w+)\.", tag) + return match.groups() if match else (None, None) + + +def extract_evoid(tag): + """提取 EVO ID""" + match = re.search(r"\.evo_loop_(\d+)\.", tag) + return match.group(1) if match else None + + +# Display Data +progress_text = st.empty() +progress_bar = st.progress(0) + +# 每页展示一个 Loop +LOOPS_PER_PAGE = 1 + +# 获取所有的 Loop ID +loop_groups = {} +for i, d in enumerate(session_state.data): + tag = d["tag"] + loop_id, _ = extract_loopid_func_name(tag) + if loop_id: + if loop_id not in loop_groups: + loop_groups[loop_id] = [] + loop_groups[loop_id].append(d) + +# 按 Loop ID 排序 +sorted_loop_ids = sorted(loop_groups.keys(), key=int) # 假设 Loop ID 是数字 +total_loops = len(sorted_loop_ids) +total_pages = total_loops # 每页展示一个 Loop + +if total_pages: + # 初始化 current_loop + if "current_loop" not in st.session_state: + st.session_state["current_loop"] = 1 + + # Loop 导航按钮 + col1, col2, col3, col4, col5 = st.sidebar.columns([1.2, 1, 2, 1, 1.2]) + + with col1: + if st.button("|<"): # 首页 + st.session_state["current_loop"] = 1 + with col2: + if st.button("<") and st.session_state["current_loop"] > 1: # 上一页 + st.session_state["current_loop"] -= 1 + with col3: + # 下拉列表显示所有 Loop + st.session_state["current_loop"] = st.selectbox( + "选择 Loop", + options=list(range(1, total_loops + 1)), + index=st.session_state["current_loop"] - 1, # 默认选中当前 Loop + label_visibility="collapsed", # 隐藏标签 + ) + with col4: + if st.button("\>") and st.session_state["current_loop"] < total_loops: # 下一页 + st.session_state["current_loop"] += 1 + with col5: + if st.button("\>|"): # 最后一页 + st.session_state["current_loop"] = total_loops + + # 获取当前 Loop + current_loop = 
st.session_state["current_loop"] + + # 渲染当前 Loop 数据 + loop_id = sorted_loop_ids[current_loop - 1] + progress_text = st.empty() + progress_text.text(f"正在处理 Loop {loop_id}...") + progress_bar.progress(current_loop / total_loops, text=f"Loop :green[**{current_loop}**] / {total_loops}") + + # 渲染 Loop Header + loop_anchor = f"Loop_{loop_id}" + if loop_anchor not in tlist: + tlist.append(loop_anchor) + st.header(loop_anchor, anchor=loop_anchor, divider="blue") + + # 渲染当前 Loop 的所有数据 + loop_data = loop_groups[loop_id] + for d in loop_data: + tag = d["tag"] + obj = d["obj"] + _, func_name = extract_loopid_func_name(tag) + evo_id = extract_evoid(tag) + + func_anchor = f"loop_{loop_id}.{func_name}" + if func_anchor not in tlist: + tlist.append(func_anchor) + st.header(f"in *{func_name}*", anchor=func_anchor, divider="green") + + evo_anchor = f"loop_{loop_id}.evo_step_{evo_id}" + if evo_id and evo_anchor not in tlist: + tlist.append(evo_anchor) + st.subheader(f"evo_step_{evo_id}", anchor=evo_anchor, divider="orange") + + # 根据 tag 渲染内容 + if "debug_exp_gen" in tag: + with st.expander( + f"Exp in :violet[**{obj.experiment_workspace.workspace_path}**]", expanded=False, icon="🧩" + ): + st.write(obj) + elif "debug_tpl" in tag: + uri = obj["uri"] + tpl = obj["template"] + cxt = obj["context"] + rd = obj["rendered"] + with st.expander(highlight_prompts_uri(uri), expanded=False, icon="⚙️"): + t1, t2, t3 = st.tabs([":green[**Rendered**]", ":blue[**Template**]", ":orange[**Context**]"]) + with t1: + show_text(rd) + with t2: + show_text(tpl, lang="django") + with t3: + st.json(cxt) + elif "debug_llm" in tag: + system = obj.get("system", None) + user = obj["user"] + resp = obj["resp"] + with st.expander(f"**LLM**", expanded=False, icon="🤖"): + t1, t2, t3 = st.tabs([":green[**Response**]", ":blue[**User**]", ":orange[**System**]"]) + with t1: + try: + rdict = json.loads(resp) + if "code" in rdict: + code = rdict["code"] + st.markdown(":red[**Code in response dict:**]") + st.code(code, language="python", wrap_lines=True, line_numbers=True) + rdict.pop("code") + elif "spec" in rdict: + spec = rdict["spec"] + st.markdown(":red[**Spec in response dict:**]") + st.markdown(spec) + rdict.pop("spec") + else: + # show model codes + showed_keys = [] + for k, v in rdict.items(): + if k.startswith("model_") and k.endswith(".py"): + st.markdown(f":red[**{k}**]") + st.code(v, language="python", wrap_lines=True, line_numbers=True) + showed_keys.append(k) + for k in showed_keys: + rdict.pop(k) + st.write(":red[**Other parts (except for the code or spec) in response dict:**]") + st.json(rdict) + except: + st.json(resp) + with t2: + show_text(user) + with t3: + show_text(system or "No system prompt available") + + progress_text.text("当前 Loop 数据处理完成!") + + # Sidebar TOC + with st.sidebar: + toc = "\n".join([f"- [{t}](#{t})" if t.startswith("L") else f" - [{t.split('.')[1]}](#{t})" for t in tlist]) + st.markdown(toc, unsafe_allow_html=True) diff --git a/rdagent/log/ui/web.py b/rdagent/log/ui/web.py index eb4862b44..452bfab0e 100644 --- a/rdagent/log/ui/web.py +++ b/rdagent/log/ui/web.py @@ -285,7 +285,7 @@ def consume_msg(self, msg: Message | FactorFBWorkspace | ModelFBWorkspace): ModelTaskWindow(self.container.container()).consume_msg(task_msg) # task codes - for k, v in ws.code_dict.items(): + for k, v in ws.file_dict.items(): self.container.markdown(f"`{k}`") self.container.code(v, language="python") diff --git a/rdagent/oai/backend/__init__.py b/rdagent/oai/backend/__init__.py new file mode 100644 index 000000000..e69de29bb diff 
--git a/rdagent/oai/backend/base.py b/rdagent/oai/backend/base.py new file mode 100644 index 000000000..d9b57c312 --- /dev/null +++ b/rdagent/oai/backend/base.py @@ -0,0 +1,2 @@ +class APIBackend: + """abstract""" diff --git a/rdagent/oai/backend/deprec/__init__.py b/rdagent/oai/backend/deprec/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/backend/deprec/conf.py b/rdagent/oai/backend/deprec/conf.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/backend/deprec/deprecated.py b/rdagent/oai/backend/deprec/deprecated.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/backend/litellm.py b/rdagent/oai/backend/litellm.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/oai/llm_conf.py b/rdagent/oai/llm_conf.py index 15bce4ed8..f61d0ca8f 100644 --- a/rdagent/oai/llm_conf.py +++ b/rdagent/oai/llm_conf.py @@ -2,13 +2,21 @@ from pathlib import Path +from pydantic import Field + from rdagent.core.conf import ExtendedBaseSettings class LLMSettings(ExtendedBaseSettings): + # backend + backend: str = "rdagent.oai.backend.DeprecBackend" + log_llm_chat_content: bool = True - use_azure: bool = False + use_azure: bool = Field(default=False, deprecated=True) + chat_use_azure: bool = False + embedding_use_azure: bool = False + chat_use_azure_token_provider: bool = False embedding_use_azure_token_provider: bool = False managed_identity_client_id: str | None = None @@ -24,7 +32,7 @@ class LLMSettings(ExtendedBaseSettings): # Behavior of returning answers to the same question when caching is enabled use_auto_chat_cache_seed_gen: bool = False """ - `_create_chat_completion_inner_function` provdies a feature to pass in a seed to affect the cache hash key + `_create_chat_completion_inner_function` provides a feature to pass in a seed to affect the cache hash key We want to enable a auto seed generator to get different default seed for `_create_chat_completion_inner_function` if seed is not given. So the cache will only not miss you ask the same question on same round. @@ -33,7 +41,8 @@ class LLMSettings(ExtendedBaseSettings): # Chat configs openai_api_key: str = "" # TODO: simplify the key design. 
- chat_openai_api_key: str = "" + chat_openai_api_key: str | None = None + chat_openai_base_url: str | None = None # chat_azure_api_base: str = "" chat_azure_api_version: str = "" chat_model: str = "gpt-4-turbo" @@ -50,6 +59,7 @@ class LLMSettings(ExtendedBaseSettings): # Embedding configs embedding_openai_api_key: str = "" + embedding_openai_base_url: str = "" embedding_azure_api_base: str = "" embedding_azure_api_version: str = "" embedding_model: str = "" diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index e8c914b18..16b8a4553 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -124,17 +124,13 @@ def chat_get(self, key: str) -> str | None: md5_key = md5_hash(key) self.c.execute("SELECT chat FROM chat_cache WHERE md5_key=?", (md5_key,)) result = self.c.fetchone() - if result is None: - return None - return result[0] + return None if result is None else result[0] def embedding_get(self, key: str) -> list | dict | str | None: md5_key = md5_hash(key) self.c.execute("SELECT embedding FROM embedding_cache WHERE md5_key=?", (md5_key,)) result = self.c.fetchone() - if result is None: - return None - return json.loads(result[0]) + return None if result is None else json.loads(result[0]) def chat_set(self, key: str, value: str) -> None: md5_key = md5_hash(key) @@ -143,6 +139,7 @@ def chat_set(self, key: str, value: str) -> None: (md5_key, value), ) self.conn.commit() + return None def embedding_set(self, content_to_embedding_dict: dict) -> None: for key, value in content_to_embedding_dict.items(): @@ -153,19 +150,18 @@ def embedding_set(self, content_to_embedding_dict: dict) -> None: ) self.conn.commit() - def message_get(self, conversation_id: str) -> list[str]: + def message_get(self, conversation_id: str) -> list[dict[str, Any]]: self.c.execute("SELECT message FROM message_cache WHERE conversation_id=?", (conversation_id,)) result = self.c.fetchone() - if result is None: - return [] - return json.loads(result[0]) + return [] if result is None else json.loads(result[0]) - def message_set(self, conversation_id: str, message_value: list[str]) -> None: + def message_set(self, conversation_id: str, message_value: list[dict[str, Any]]) -> None: self.c.execute( "INSERT OR REPLACE INTO message_cache (conversation_id, message) VALUES (?, ?)", (conversation_id, json.dumps(message_value)), ) self.conn.commit() + return None class SessionChatHistoryCache(SingletonBaseClass): @@ -173,10 +169,10 @@ def __init__(self) -> None: """load all history conversation json file from self.session_cache_location""" self.cache = SQliteLazyCache(cache_location=LLM_SETTINGS.prompt_cache_path) - def message_get(self, conversation_id: str) -> list[str]: + def message_get(self, conversation_id: str) -> list[dict[str, Any]]: return self.cache.message_get(conversation_id) - def message_set(self, conversation_id: str, message_value: list[str]) -> None: + def message_set(self, conversation_id: str, message_value: list[dict[str, Any]]) -> None: self.cache.message_set(conversation_id, message_value) @@ -203,7 +199,7 @@ def build_chat_completion_message_and_calculate_token(self, user_prompt: str) -> messages = self.build_chat_completion_message(user_prompt) return self.api_backend.calculate_token_from_messages(messages) - def build_chat_completion(self, user_prompt: str, **kwargs: Any) -> str: + def build_chat_completion(self, user_prompt: str, *args, **kwargs) -> str: # type: ignore[no-untyped-def] """ this function is to build the session messages user prompt should always be provided @@ 
-211,11 +207,13 @@ def build_chat_completion(self, user_prompt: str, **kwargs: Any) -> str: messages = self.build_chat_completion_message(user_prompt) with logger.tag(f"session_{self.conversation_id}"): - response = self.api_backend._try_create_chat_completion_or_embedding( # noqa: SLF001 + response: str = self.api_backend._try_create_chat_completion_or_embedding( # noqa: SLF001 + *args, messages=messages, chat_completion=True, **kwargs, ) + logger.log_object({"user": user_prompt, "resp": response}, tag="debug_llm") messages.append( { @@ -264,7 +262,7 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.generator = Llama.build( ckpt_dir=LLM_SETTINGS.llama2_ckpt_dir, tokenizer_path=LLM_SETTINGS.llama2_tokenizer_path, - max_seq_len=LLM_SETTINGS.max_tokens, + max_seq_len=LLM_SETTINGS.chat_max_tokens, max_batch_size=LLM_SETTINGS.llams2_max_batch_size, ) self.encoder = None @@ -307,7 +305,8 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.chat_model = LLM_SETTINGS.chat_model if chat_model is None else chat_model self.encoder = None else: - self.use_azure = LLM_SETTINGS.use_azure + self.chat_use_azure = LLM_SETTINGS.chat_use_azure or LLM_SETTINGS.use_azure + self.embedding_use_azure = LLM_SETTINGS.embedding_use_azure or LLM_SETTINGS.use_azure self.chat_use_azure_token_provider = LLM_SETTINGS.chat_use_azure_token_provider self.embedding_use_azure_token_provider = LLM_SETTINGS.embedding_use_azure_token_provider self.managed_identity_client_id = LLM_SETTINGS.managed_identity_client_id @@ -330,6 +329,8 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.chat_model = LLM_SETTINGS.chat_model if chat_model is None else chat_model self.chat_model_map = json.loads(LLM_SETTINGS.chat_model_map) self.encoder = self._get_encoder() + self.chat_openai_base_url = LLM_SETTINGS.chat_openai_base_url + self.embedding_openai_base_url = LLM_SETTINGS.embedding_openai_base_url self.chat_api_base = LLM_SETTINGS.chat_azure_api_base if chat_api_base is None else chat_api_base self.chat_api_version = ( LLM_SETTINGS.chat_azure_api_version if chat_api_version is None else chat_api_version @@ -345,44 +346,38 @@ def __init__( # noqa: C901, PLR0912, PLR0915 LLM_SETTINGS.embedding_azure_api_version if embedding_api_version is None else embedding_api_version ) - if self.use_azure: - if self.chat_use_azure_token_provider or self.embedding_use_azure_token_provider: - dac_kwargs = {} - if self.managed_identity_client_id is not None: - dac_kwargs["managed_identity_client_id"] = self.managed_identity_client_id - credential = DefaultAzureCredential(**dac_kwargs) - token_provider = get_bearer_token_provider( - credential, - "https://cognitiveservices.azure.com/.default", - ) - if self.chat_use_azure_token_provider: - self.chat_client = openai.AzureOpenAI( - azure_ad_token_provider=token_provider, - api_version=self.chat_api_version, - azure_endpoint=self.chat_api_base, - ) - else: - self.chat_client = openai.AzureOpenAI( - api_key=self.chat_api_key, - api_version=self.chat_api_version, - azure_endpoint=self.chat_api_base, - ) + if (self.chat_use_azure or self.embedding_use_azure) and ( + self.chat_use_azure_token_provider or self.embedding_use_azure_token_provider + ): + dac_kwargs = {} + if self.managed_identity_client_id is not None: + dac_kwargs["managed_identity_client_id"] = self.managed_identity_client_id + credential = DefaultAzureCredential(**dac_kwargs) + token_provider = get_bearer_token_provider( + credential, + "https://cognitiveservices.azure.com/.default", + ) + self.chat_client: openai.OpenAI = ( + 
openai.AzureOpenAI( + azure_ad_token_provider=token_provider if self.chat_use_azure_token_provider else None, + api_key=self.chat_api_key if not self.chat_use_azure_token_provider else None, + api_version=self.chat_api_version, + azure_endpoint=self.chat_api_base, + ) + if self.chat_use_azure + else openai.OpenAI(api_key=self.chat_api_key, base_url=self.chat_openai_base_url) + ) - if self.embedding_use_azure_token_provider: - self.embedding_client = openai.AzureOpenAI( - azure_ad_token_provider=token_provider, - api_version=self.embedding_api_version, - azure_endpoint=self.embedding_api_base, - ) - else: - self.embedding_client = openai.AzureOpenAI( - api_key=self.embedding_api_key, - api_version=self.embedding_api_version, - azure_endpoint=self.embedding_api_base, - ) - else: - self.chat_client = openai.OpenAI(api_key=self.chat_api_key) - self.embedding_client = openai.OpenAI(api_key=self.embedding_api_key) + self.embedding_client: openai.OpenAI = ( + openai.AzureOpenAI( + azure_ad_token_provider=token_provider if self.embedding_use_azure_token_provider else None, + api_key=self.embedding_api_key if not self.embedding_use_azure_token_provider else None, + api_version=self.embedding_api_version, + azure_endpoint=self.embedding_api_base, + ) + if self.embedding_use_azure + else openai.OpenAI(api_key=self.embedding_api_key, base_url=self.embedding_openai_base_url) + ) self.dump_chat_cache = LLM_SETTINGS.dump_chat_cache if dump_chat_cache is None else dump_chat_cache self.use_chat_cache = LLM_SETTINGS.use_chat_cache if use_chat_cache is None else use_chat_cache @@ -401,7 +396,7 @@ def __init__( # noqa: C901, PLR0912, PLR0915 self.use_gcr_endpoint = LLM_SETTINGS.use_gcr_endpoint self.retry_wait_seconds = LLM_SETTINGS.retry_wait_seconds - def _get_encoder(self): + def _get_encoder(self) -> tiktoken.Encoding: """ tiktoken.encoding_for_model(self.chat_model) does not cover all cases it should consider. @@ -418,15 +413,16 @@ def _azure_patch(model: str) -> str: model = self.chat_model try: - return tiktoken.encoding_for_model(model) + encoding = tiktoken.encoding_for_model(model) except KeyError: logger.warning(f"Failed to get encoder. 
Trying to patch the model name") for patch_func in [_azure_patch]: try: - return tiktoken.encoding_for_model(patch_func(model)) + encoding = tiktoken.encoding_for_model(patch_func(model)) except KeyError: logger.error(f"Failed to get encoder even after patching with {patch_func.__name__}") raise + return encoding def build_chat_session( self, @@ -443,10 +439,10 @@ def build_messages( self, user_prompt: str, system_prompt: str | None = None, - former_messages: list[dict] | None = None, + former_messages: list[dict[str, Any]] | None = None, *, shrink_multiple_break: bool = False, - ) -> list[dict]: + ) -> list[dict[str, Any]]: """ build the messages to avoid implementing several redundant lines of code @@ -476,15 +472,15 @@ def build_messages( ) return messages - def build_messages_and_create_chat_completion( + def build_messages_and_create_chat_completion( # type: ignore[no-untyped-def] self, user_prompt: str, system_prompt: str | None = None, former_messages: list | None = None, chat_cache_prefix: str = "", - *, shrink_multiple_break: bool = False, - **kwargs: Any, + *args, + **kwargs, ) -> str: if former_messages is None: former_messages = [] @@ -494,30 +490,37 @@ def build_messages_and_create_chat_completion( former_messages, shrink_multiple_break=shrink_multiple_break, ) - return self._try_create_chat_completion_or_embedding( + + resp = self._try_create_chat_completion_or_embedding( # type: ignore[misc] + *args, messages=messages, chat_completion=True, chat_cache_prefix=chat_cache_prefix, **kwargs, ) + if isinstance(resp, list): + raise ValueError("The response of _try_create_chat_completion_or_embedding should be a string.") + logger.log_object({"system": system_prompt, "user": user_prompt, "resp": resp}, tag="debug_llm") + return resp - def create_embedding(self, input_content: str | list[str], **kwargs: Any) -> list[Any] | Any: + def create_embedding(self, input_content: str | list[str], *args, **kwargs) -> list[Any] | Any: # type: ignore[no-untyped-def] input_content_list = [input_content] if isinstance(input_content, str) else input_content - resp = self._try_create_chat_completion_or_embedding( + resp = self._try_create_chat_completion_or_embedding( # type: ignore[misc] input_content_list=input_content_list, embedding=True, + *args, **kwargs, ) if isinstance(input_content, str): return resp[0] return resp - def _create_chat_completion_auto_continue(self, messages: list, **kwargs: dict) -> str: + def _create_chat_completion_auto_continue(self, messages: list[dict[str, Any]], *args, **kwargs) -> str: # type: ignore[no-untyped-def] """ Call the chat completion function and automatically continue the conversation if the finish_reason is length. TODO: This function only continues once, maybe need to continue more than once in the future. 
""" - response, finish_reason = self._create_chat_completion_inner_function(messages=messages, **kwargs) + response, finish_reason = self._create_chat_completion_inner_function(messages, *args, **kwargs) if finish_reason == "length": new_message = deepcopy(messages) @@ -528,44 +531,47 @@ def _create_chat_completion_auto_continue(self, messages: list, **kwargs: dict) "content": "continue the former output with no overlap", }, ) - new_response, finish_reason = self._create_chat_completion_inner_function(messages=new_message, **kwargs) + new_response, finish_reason = self._create_chat_completion_inner_function(new_message, *args, **kwargs) return response + new_response return response - def _try_create_chat_completion_or_embedding( + def _try_create_chat_completion_or_embedding( # type: ignore[no-untyped-def] self, max_retry: int = 10, - *, chat_completion: bool = False, embedding: bool = False, - **kwargs: Any, - ) -> Any: + *args, + **kwargs, + ) -> str | list[float]: assert not (chat_completion and embedding), "chat_completion and embedding cannot be True at the same time" max_retry = LLM_SETTINGS.max_retry if LLM_SETTINGS.max_retry is not None else max_retry for i in range(max_retry): try: if embedding: - return self._create_embedding_inner_function(**kwargs) + return self._create_embedding_inner_function(*args, **kwargs) if chat_completion: - return self._create_chat_completion_auto_continue(**kwargs) + return self._create_chat_completion_auto_continue(*args, **kwargs) except openai.BadRequestError as e: # noqa: PERF203 - logger.warning(e) + logger.warning(str(e)) logger.warning(f"Retrying {i+1}th time...") - if "'messages' must contain the word 'json' in some form" in e.message: + if ( + "'messages' must contain the word 'json' in some form" in e.message + or "\\'messages\\' must contain the word \\'json\\' in some form" in e.message + ): kwargs["add_json_in_prompt"] = True elif embedding and "maximum context length" in e.message: kwargs["input_content_list"] = [ content[: len(content) // 2] for content in kwargs.get("input_content_list", []) ] except Exception as e: # noqa: BLE001 - logger.warning(e) + logger.warning(str(e)) logger.warning(f"Retrying {i+1}th time...") time.sleep(self.retry_wait_seconds) error_message = f"Failed to create chat completion after {max_retry} retries." 
raise RuntimeError(error_message) - def _create_embedding_inner_function( - self, input_content_list: list[str], **kwargs: Any + def _create_embedding_inner_function( # type: ignore[no-untyped-def] + self, input_content_list: list[str], *args, **kwargs ) -> list[Any]: # noqa: ARG002 content_to_embedding_dict = {} filtered_input_content_list = [] @@ -584,7 +590,7 @@ def _create_embedding_inner_function( filtered_input_content_list[i : i + LLM_SETTINGS.embedding_max_str_num] for i in range(0, len(filtered_input_content_list), LLM_SETTINGS.embedding_max_str_num) ]: - if self.use_azure: + if self.embedding_use_azure: response = self.embedding_client.embeddings.create( model=self.embedding_model, input=sliced_filtered_input_content_list, @@ -601,7 +607,7 @@ def _create_embedding_inner_function( self.cache.embedding_set(content_to_embedding_dict) return [content_to_embedding_dict[content] for content in input_content_list] - def _build_log_messages(self, messages: list[dict]) -> str: + def _build_log_messages(self, messages: list[dict[str, Any]]) -> str: log_messages = "" for m in messages: log_messages += ( @@ -612,19 +618,20 @@ def _build_log_messages(self, messages: list[dict]) -> str: ) return log_messages - def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 + def _create_chat_completion_inner_function( # type: ignore[no-untyped-def] # noqa: C901, PLR0912, PLR0915 self, - messages: list[dict], + messages: list[dict[str, Any]], temperature: float | None = None, max_tokens: int | None = None, chat_cache_prefix: str = "", frequency_penalty: float | None = None, presence_penalty: float | None = None, - *, json_mode: bool = False, add_json_in_prompt: bool = False, seed: Optional[int] = None, - ) -> str: + *args, + **kwargs, + ) -> tuple[str, str | None]: """ seed : Optional[int] When retrying with cache enabled, it will keep returning the same results. @@ -670,7 +677,7 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 finish_reason = None if self.use_llama2: response = self.generator.chat_completion( - messages, # type: ignore + messages, max_gen_len=max_tokens, temperature=temperature, ) @@ -699,7 +706,7 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 if LLM_SETTINGS.log_llm_chat_content: logger.info(f"{LogColors.CYAN}Response:{resp}{LogColors.END}", tag="llm_messages") else: - kwargs = dict( + call_kwargs = dict( model=model, messages=messages, max_tokens=max_tokens, @@ -715,8 +722,8 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 message["content"] = message["content"] + "\nPlease respond in json format." 
if message["role"] == "system": break - kwargs["response_format"] = {"type": "json_object"} - response = self.chat_client.chat.completions.create(**kwargs) + call_kwargs["response_format"] = {"type": "json_object"} + response = self.chat_client.chat.completions.create(**call_kwargs) if self.chat_stream: resp = "" @@ -762,7 +769,9 @@ def _create_chat_completion_inner_function( # noqa: C901, PLR0912, PLR0915 self.cache.chat_set(input_content_json, resp) return resp, finish_reason - def calculate_token_from_messages(self, messages: list[dict]) -> int: + def calculate_token_from_messages(self, messages: list[dict[str, Any]]) -> int: + if self.encoder is None: + raise ValueError("Encoder is not initialized.") if self.use_llama2 or self.use_gcr_endpoint: logger.warning("num_tokens_from_messages() is not implemented for model llama2.") return 0 # TODO implement this function for llama2 @@ -787,7 +796,7 @@ def build_messages_and_calculate_token( self, user_prompt: str, system_prompt: str | None, - former_messages: list[dict] | None = None, + former_messages: list[dict[str, Any]] | None = None, *, shrink_multiple_break: bool = False, ) -> int: @@ -818,4 +827,4 @@ def calculate_embedding_distance_between_str_list( target_embeddings_np = target_embeddings_np / np.linalg.norm(target_embeddings_np, axis=1, keepdims=True) similarity_matrix = np.dot(source_embeddings_np, target_embeddings_np.T) - return similarity_matrix.tolist() + return similarity_matrix.tolist() # type: ignore[no-any-return] diff --git a/rdagent/scenarios/data_mining/developer/feedback.py b/rdagent/scenarios/data_mining/developer/feedback.py index 5a96609be..271efd16c 100644 --- a/rdagent/scenarios/data_mining/developer/feedback.py +++ b/rdagent/scenarios/data_mining/developer/feedback.py @@ -9,8 +9,8 @@ from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( + Experiment2Feedback, Hypothesis, - HypothesisExperiment2Feedback, HypothesisFeedback, Trace, ) @@ -22,14 +22,15 @@ DIRNAME = Path(__file__).absolute().resolve().parent -class DMModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): +class DMModelExperiment2Feedback(Experiment2Feedback): """Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances""" - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. 
""" + hypothesis = exp.hypothesis logger.info("Generating feedback...") # Define the system prompt for hypothesis feedback @@ -46,7 +47,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac context=context, last_hypothesis=SOTA_hypothesis, last_task=SOTA_experiment.sub_tasks[0].get_task_information() if SOTA_hypothesis else None, - last_code=SOTA_experiment.sub_workspace_list[0].code_dict.get("model.py") if SOTA_hypothesis else None, + last_code=SOTA_experiment.sub_workspace_list[0].file_dict.get("model.py") if SOTA_hypothesis else None, last_result=SOTA_experiment.result if SOTA_hypothesis else None, hypothesis=hypothesis, exp=exp, diff --git a/rdagent/scenarios/data_mining/developer/model_runner.py b/rdagent/scenarios/data_mining/developer/model_runner.py index 2d04149a6..5beedcef8 100644 --- a/rdagent/scenarios/data_mining/developer/model_runner.py +++ b/rdagent/scenarios/data_mining/developer/model_runner.py @@ -7,10 +7,10 @@ class DMModelRunner(CachedRunner[DMModelExperiment]): @cache_with_pickle(CachedRunner.get_cache_key, CachedRunner.assign_cached_result) def develop(self, exp: DMModelExperiment) -> DMModelExperiment: - if exp.sub_workspace_list[0].code_dict.get("model.py") is None: + if exp.sub_workspace_list[0].file_dict.get("model.py") is None: raise ModelEmptyError("model.py is empty") # to replace & inject code - exp.experiment_workspace.inject_code(**{"model.py": exp.sub_workspace_list[0].code_dict["model.py"]}) + exp.experiment_workspace.inject_files(**{"model.py": exp.sub_workspace_list[0].file_dict["model.py"]}) env_to_use = {"PYTHONPATH": "./"} diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py index 37f36e2f4..a2a6354d2 100644 --- a/rdagent/scenarios/data_mining/proposal/model_proposal.py +++ b/rdagent/scenarios/data_mining/proposal/model_proposal.py @@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b else "No previous hypothesis and feedback available since it's the first round." 
) - experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] + experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist] model_list = [] for experiment in experiment_list: @@ -95,7 +95,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b "RAG": None, }, True - def convert_response(self, response: str, trace: Trace) -> ModelExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: response_dict = json.loads(response) tasks = [] for model_name in response_dict: @@ -116,6 +116,6 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment: model_type=model_type, ) ) - exp = DMModelExperiment(tasks) - exp.based_experiments = [t[1] for t in trace.hist if t[2]] + exp = DMModelExperiment(tasks, hypothesis=hypothesis) + exp.based_experiments = [t[0] for t in trace.hist if t[1]] return exp diff --git a/rdagent/scenarios/data_science/__init__.py b/rdagent/scenarios/data_science/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/debug/data.py b/rdagent/scenarios/data_science/debug/data.py new file mode 100644 index 000000000..3161046d2 --- /dev/null +++ b/rdagent/scenarios/data_science/debug/data.py @@ -0,0 +1,282 @@ +import os +import platform +import shutil +from collections import Counter +from pathlib import Path + +import pandas as pd +from tqdm import tqdm + +try: + import bson # pip install pymongo +except: + pass + +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING + + +class DataHandler: + """Base DataHandler interface.""" + + def load(self, path) -> pd.DataFrame: + raise NotImplementedError + + def dump(self, df: pd.DataFrame, path): + raise NotImplementedError + + +class GenericDataHandler(DataHandler): + """ + A generic data handler that automatically detects file type based on suffix + and uses the correct pandas method for load/dump. + """ + + def load(self, path) -> pd.DataFrame: + path = Path(path) + suffix = path.suffix.lower() + + if suffix == ".csv": + return pd.read_csv(path) + elif suffix == ".pkl": + return pd.read_pickle(path) + elif suffix == ".parquet": + return pd.read_parquet(path) + elif suffix in [".h5", ".hdf", ".hdf5"]: + # Note: for HDF, you need a 'key' in read_hdf. If you expect a single key, + # you might do: pd.read_hdf(path, key='df') or something similar. + # Adjust as needed based on your HDF structure. + return pd.read_hdf(path, key="data") + elif suffix == ".jsonl": + # Read JSON Lines file + return pd.read_json(path, lines=True) + elif suffix == ".bson": + data = bson.decode_file_iter(open(path, "rb")) + df = pd.DataFrame(data) + return df + else: + raise ValueError(f"Unsupported file type: {suffix}") + + def dump(self, df: pd.DataFrame, path): + path = Path(path) + suffix = path.suffix.lower() + + if suffix == ".csv": + df.to_csv(path, index=False) + elif suffix == ".pkl": + df.to_pickle(path) + elif suffix == ".parquet": + df.to_parquet(path, index=True) + elif suffix in [".h5", ".hdf", ".hdf5"]: + # Similarly, you need a key for HDF. 
+ df.to_hdf(path, key="data", mode="w") + elif suffix == ".jsonl": + # Save DataFrame to JSON Lines file + df.to_json(path, orient="records", lines=True) + elif suffix == ".bson": + data = df.to_dict(orient="records") + with open(path, "wb") as file: + # Write each record in the list to the BSON file + for record in data: + file.write(bson.BSON.encode(record)) + else: + raise ValueError(f"Unsupported file type: {suffix}") + + +class DataReducer: + """Base DataReducer interface.""" + + def reduce(self, df: pd.DataFrame) -> pd.DataFrame: + raise NotImplementedError + + +class RandDataReducer(DataReducer): + """ + Example random sampler: ensures at least `min_num` rows + or at least `min_frac` fraction of the data (whichever is larger). + """ + + def __init__(self, min_frac=0.02, min_num=5): + self.min_frac = min_frac + self.min_num = min_num + + def reduce(self, df: pd.DataFrame, frac: float = None) -> pd.DataFrame: + frac = max(self.min_frac, self.min_num / len(df)) if frac is None else frac + # print(f"Sampling {frac * 100:.2f}% of the data ({len(df)} rows)") + if frac >= 1: + return df + return df.sample(frac=frac, random_state=1) + + +class UniqueIDDataReducer(DataReducer): + def __init__(self, min_frac=0.02, min_num=5): + self.min_frac = min_frac + self.min_num = min_num + self.random_reducer = RandDataReducer(min_frac, min_num) + + def reduce(self, df: pd.DataFrame) -> pd.DataFrame: + if ( + not isinstance(df, pd.DataFrame) + or not isinstance(df.iloc[0, -1], (int, float, str, tuple, frozenset, bytes, complex, type(None))) + or df.iloc[:, -1].unique().shape[0] == 0 + or df.iloc[:, -1].unique().shape[0] >= df.shape[0] * 0.5 + ): + return self.random_reducer.reduce(df) + unique_labels = df.iloc[:, -1].unique() + unique_labels = unique_labels[~pd.isna(unique_labels)] + unique_count = unique_labels.shape[0] + print("Unique labels:", unique_count / df.shape[0]) + + labels = df.iloc[:, -1] + unique_labels = labels.dropna().unique() + unique_count = len(unique_labels) + + sampled_rows = df.groupby(labels, group_keys=False).apply(lambda x: x.sample(n=1, random_state=1)) + + frac = max(self.min_frac, self.min_num / len(df)) + + if int(len(df) * frac) < unique_count: + return sampled_rows.reset_index(drop=True) + + remain_df = df.drop(index=sampled_rows.index) + remaining_frac = frac - unique_count / len(df) + + remaining_sampled = self.random_reducer.reduce(remain_df, remaining_frac) + result_df = pd.concat([sampled_rows, remaining_sampled]).sort_index() + return result_df + + +def count_files_in_folder(folder: Path) -> int: + """ + Count the total number of files in a folder, including files in subfolders. + """ + return sum(1 for _ in folder.rglob("*") if _.is_file()) + + +def create_debug_data( + competition: str, + dr_cls: type[DataReducer] = UniqueIDDataReducer, + min_frac=0.01, + min_num=5, + dataset_path=None, + sample_path=None, +): + """ + Reads the original data file, creates a reduced sample, + and renames/moves files for easier debugging. + Automatically detects file type (csv, pkl, parquet, hdf, etc.). 
+ """ + if dataset_path is None: + dataset_path = KAGGLE_IMPLEMENT_SETTING.local_data_path # FIXME: don't hardcode this KAGGLE_IMPLEMENT_SETTING + + if sample_path is None: + sample_path = Path(dataset_path) / "sample" + + data_folder = Path(dataset_path) / competition + sample_folder = Path(sample_path) / competition + + # Traverse the folder and exclude specific file types + included_extensions = {".csv", ".pkl", ".parquet", ".h5", ".hdf", ".hdf5", ".jsonl", ".bson"} + files_to_process = [file for file in data_folder.rglob("*") if file.is_file()] + total_files_count = len(files_to_process) + print( + f"[INFO] Original dataset folder `{data_folder}` has {total_files_count} files in total (including subfolders)." + ) + file_types_count = Counter(file.suffix.lower() for file in files_to_process) + print("File type counts:") + for file_type, count in file_types_count.items(): + print(f"{file_type}: {count}") + + # This set will store filenames or paths that appear in the sampled data + sample_used_file_names = set() + + # Prepare data handler and reducer + data_handler = GenericDataHandler() + data_reducer = dr_cls(min_frac=min_frac, min_num=min_num) + + skip_subfolder_data = any( + f.is_file() and f.suffix in included_extensions + for f in data_folder.iterdir() + if f.name.startswith(("train", "test")) + ) + processed_files = [] + + for file_path in tqdm(files_to_process, desc="Processing data", unit="file"): + sampled_file_path = sample_folder / file_path.relative_to(data_folder) + if sampled_file_path.exists(): + continue + + if file_path.suffix.lower() not in included_extensions: + continue + + if skip_subfolder_data and file_path.parent != data_folder: + continue # bypass files in subfolders + + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Load the original data + df = data_handler.load(file_path) + + # Create a sampled subset + df_sampled = data_reducer.reduce(df) + processed_files.append(file_path) + # Dump the sampled data + try: + data_handler.dump(df_sampled, sampled_file_path) + # Extract possible file references from the sampled data + if "submission" in file_path.stem: + continue # Skip submission files + for col in df_sampled.columns: + unique_vals = df_sampled[col].astype(str).unique() + for val in unique_vals: + # Add the entire string to the set; + # in real usage, might want to parse or extract basename, etc. 
+ sample_used_file_names.add(val) + except Exception as e: + print(f"Error processing {file_path}: {e}") + continue + + # Process non-data files + subfolder_dict = {} + for file_path in files_to_process: + if file_path in processed_files: + continue # Already handled above + rel_dir = file_path.relative_to(data_folder).parts[0] + subfolder_dict.setdefault(rel_dir, []).append(file_path) + + # For each subfolder, decide which files to copy + for rel_dir, file_list in tqdm(subfolder_dict.items(), desc="Processing files", unit="file"): + used_files = [] + not_used_files = [] + + # Check if each file is in the "used" list + for fp in file_list: + if str(fp.name) in sample_used_file_names or str(fp.stem) in sample_used_file_names: + used_files.append(fp) + else: + not_used_files.append(fp) + + # Directly copy used files + for uf in used_files: + sampled_file_path = sample_folder / uf.relative_to(data_folder) + if sampled_file_path.exists(): + continue + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(uf, sampled_file_path) + + # If no files are used, randomly sample files to keep the folder from being empty + if len(used_files) == 0: + if len(file_list) <= min_num: + num_to_keep = len(file_list) + else: + num_to_keep = max(int(len(file_list) * min_frac), min_num) + print(f"Sampling {num_to_keep} files without label from {len(file_list)} files in {rel_dir}") + sampled_not_used = pd.Series(not_used_files).sample(n=num_to_keep, random_state=1) + for nf in sampled_not_used: + sampled_file_path = sample_folder / nf.relative_to(data_folder) + if sampled_file_path.exists(): + continue + sampled_file_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(nf, sampled_file_path) + + final_files_count = count_files_in_folder(sample_folder) + print(f"[INFO] After sampling, the sample folder `{sample_folder}` contains {final_files_count} files in total.") diff --git a/rdagent/scenarios/data_science/dev/coder.py b/rdagent/scenarios/data_science/dev/coder.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py new file mode 100644 index 000000000..7e794eb56 --- /dev/null +++ b/rdagent/scenarios/data_science/dev/feedback.py @@ -0,0 +1,85 @@ +import json + +from rdagent.components.knowledge_management.graph import UndirectedNode +from rdagent.core.experiment import Experiment +from rdagent.core.prompts import Prompts +from rdagent.core.proposal import ( + Experiment2Feedback, + ExperimentFeedback, + HypothesisFeedback, +) +from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace +from rdagent.utils import convert2bool, remove_path_info_from_str +from rdagent.utils.agent.tpl import T +from rdagent.utils.repo.diff import generate_diff + + +class DSExperiment2Feedback(Experiment2Feedback): + def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeedback: + # 用哪些信息来生成feedback + # 1. pending_tasks_list[0][0] 任务的描述 + # 2. hypothesis 任务的假设 + # 3. 相对sota_exp的改动 + # 4. result 任务的结果 + # 5. 
sota_exp.result 之前最好的结果 + sota_exp = trace.sota_experiment() + sota_desc = T("scenarios.data_science.share:describe.exp").r( + exp=sota_exp, heading="SOTA of previous exploration of the scenario" + ) + + # Get feedback description using shared template + feedback_desc = T("scenarios.data_science.share:describe.feedback").r( + exp_and_feedback=(trace.hist[-1] if trace.hist else None), heading="Previous Trial Feedback" + ) + + # TODO: + # - Should we choose between the diff from last experiment or last sota ? + + # Retrieve the last experiment from the history + last_exp = trace.hist[-1][0] if trace.hist else None + if last_exp: + last_workspace_path = last_exp.experiment_workspace.workspace_path + current_workspace_path = exp.experiment_workspace.workspace_path + # Generate a diff between the two workspaces + diff_edition = generate_diff(last_workspace_path, current_workspace_path) + else: + diff_edition = [] + + diff_edition = [ + remove_path_info_from_str( + exp.experiment_workspace.workspace_path, + remove_path_info_from_str(last_exp.experiment_workspace.workspace_path, line), + ) + for line in diff_edition + ] + + # assumption: + # The feedback should focus on experiment **improving**. + # Assume that all the the sota exp is based on the previous sota experiment + + system_prompt = T(".prompts:exp_feedback.system").r(scenario=self.scen.get_scenario_all_desc()) + user_prompt = T(".prompts:exp_feedback.user").r( + sota_desc=sota_desc, + cur_exp=exp, + diff_edition=diff_edition, + feedback_desc=feedback_desc, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True, + ) + ) + + return HypothesisFeedback( + observations=resp_dict.get("Observations", "No observations provided"), + hypothesis_evaluation=resp_dict.get("Feedback for Hypothesis", "No feedback provided"), + new_hypothesis=resp_dict.get("New Hypothesis", "No new hypothesis provided"), + reason=resp_dict.get("Reasoning", "No reasoning provided"), + decision=convert2bool(resp_dict.get("Replace Best Result", "no")), + ) diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml new file mode 100644 index 000000000..bbc9ac2f6 --- /dev/null +++ b/rdagent/scenarios/data_science/dev/prompts.yaml @@ -0,0 +1,57 @@ +exp_feedback: + system: |- + You are an advanced assistant for analyzing results in data-driven R&D. + The task is described in the following scenario: + {{ scenario }} + + You will analyze the current experiment's hypothesis, code, results, and compare them with previous experiments and the best past result. + Your feedback should: + 1. Confirm if the current result supports or refutes the hypothesis. + 2. Compare with previous best results. + 3. Suggest improvements or new directions. Stay innovative and adapative. + 4. SOTA results are the best outcomes we have achieved in this scenario. If we do not have complete experiment available (i.e., results that are runnable and can generate evaluation outcomes), **please replace it as the best result/SOTA**. + + Please provide detailed and constructive feedback. 
+ Example JSON Structure for Result Analysis: + { + "Observations": "Your overall observations here", + "Feedback for Hypothesis": "Observations related to the hypothesis", + "New Hypothesis": "Your new hypothesis here", + "Reasoning": "Reasoning for the new hypothesis", + "Replace Best Result": "yes or no" + } + + user: |- + We are in a process of finding and validating hypotheses to build powerful codes. Each round aims to confirm or reject hypotheses based on results. + + {{ sota_desc }} + + ## Current solution + Current solution to be evaluated: + + ### Task of Current solution + {{cur_exp.pending_tasks_list[0][0].get_task_information()}} + + {% if cur_exp.hypothesis %} + the experiment is designed based on hypothesis: {{ cur_exp.hypothesis }} + Modified code according to hypothesis: + {% else %} + Modified code: + {% endif %} + + {% for de in diff_edition %} + {{ de }} + {% endfor %} + + Final results of the current solution: + {{ cur_exp.result }} + ### Complete Code of current solution + {{cur_exp.experiment_workspace.all_codes}} + + {{feedback_desc}} + + Please refer to these hypotheses and feedback to help you recommend new experiment and hypothesis + + Consider Changing Direction for Significant Gaps with the Best Result and the last round: + - If the new results significantly differ from SOTA, consider a new direction. + - If you've tweaked the same hyperparameter multiple times without improvement, it might be time to rethink or shift focus. diff --git a/rdagent/scenarios/data_science/dev/runner.py b/rdagent/scenarios/data_science/dev/runner.py new file mode 100644 index 000000000..d403b212c --- /dev/null +++ b/rdagent/scenarios/data_science/dev/runner.py @@ -0,0 +1,33 @@ +import pandas as pd + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.core.developer import Developer +from rdagent.core.exception import RunnerError +from rdagent.log import rdagent_logger as logger +from rdagent.scenarios.data_science.experiment.experiment import DSExperiment +from rdagent.utils.env import DockerEnv, DSDockerConf + + +class DSRunner(Developer[DSExperiment]): + def develop(self, exp: DSExperiment) -> DSExperiment: + ds_docker_conf = DSDockerConf() + ds_docker_conf.extra_volumes = {f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": "/kaggle/input"} + ds_docker_conf.running_timeout_period = 60 * 60 # 1 hours + + de = DockerEnv(conf=ds_docker_conf) + + # execute workflow + stdout = exp.experiment_workspace.execute(env=de, entry="python main.py") + + score_fp = exp.experiment_workspace.workspace_path / "scores.csv" + if not score_fp.exists(): + logger.error("Metrics file (scores.csv) is not generated.") + raise RunnerError(f"Metrics file (scores.csv) is not generated, log is:\n{stdout}") + + submission_fp = exp.experiment_workspace.workspace_path / "submission.csv" + if not submission_fp.exists(): + logger.error("Submission file (submission.csv) is not generated.") + raise RunnerError(f"Submission file (submission.csv) is not generated, log is:\n{stdout}") + + exp.result = pd.read_csv(score_fp, index_col=0) + return exp diff --git a/rdagent/scenarios/data_science/experiment/__init__.py b/rdagent/scenarios/data_science/experiment/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py new file mode 100644 index 000000000..b468970e8 --- /dev/null +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -0,0 +1,29 @@ +import re 
+from typing import Literal + +import pandas as pd + +from rdagent.core.experiment import Experiment, FBWorkspace, Task + +COMPONENT = Literal["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"] + + +class DSExperiment(Experiment[Task, FBWorkspace, FBWorkspace]): + def __init__(self, pending_tasks_list: list, *args, **kwargs) -> None: + super().__init__(sub_tasks=[], *args, **kwargs) + self.experiment_workspace = FBWorkspace() + self.pending_tasks_list = pending_tasks_list + + def next_component_required(self) -> COMPONENT | None: + files = list(self.experiment_workspace.file_dict.keys()) + if "load_data.py" not in files: + return "DataLoadSpec" + if "feature.py" not in files: + return "FeatureEng" + if not any(re.match(r"model.*\.py", file) for file in files): + return "Model" + if "ensemble.py" not in files: + return "Ensemble" + if "main.py" not in files: + return "Workflow" + return None diff --git a/rdagent/scenarios/data_science/proposal/__init__.py b/rdagent/scenarios/data_science/proposal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rdagent/scenarios/data_science/proposal/exp_gen.py b/rdagent/scenarios/data_science/proposal/exp_gen.py new file mode 100644 index 000000000..59d9c4d61 --- /dev/null +++ b/rdagent/scenarios/data_science/proposal/exp_gen.py @@ -0,0 +1,383 @@ +import json +import re +from typing import Literal + +import pandas as pd + +from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask +from rdagent.components.coder.data_science.feature.exp import FeatureTask +from rdagent.components.coder.data_science.model.exp import ModelTask +from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask +from rdagent.components.coder.data_science.workflow.exp import WorkflowTask +from rdagent.core.experiment import Experiment, Workspace +from rdagent.core.knowledge_base import KnowledgeBase +from rdagent.core.proposal import ( + ExperimentFeedback, + ExpGen, + Hypothesis, + HypothesisFeedback, + Trace, +) +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.data_science.experiment.experiment import COMPONENT, DSExperiment +from rdagent.scenarios.data_science.scen import DataScienceScen +from rdagent.utils.agent.tpl import T +from rdagent.utils.repo.diff import generate_diff + + +class DSHypothesis(Hypothesis): + def __init__( + self, + component: COMPONENT, + hypothesis: str = "", + reason: str = "", + concise_reason: str = "", + concise_observation: str = "", + concise_justification: str = "", + concise_knowledge: str = "", + ) -> None: + super().__init__( + hypothesis, reason, concise_reason, concise_observation, concise_justification, concise_knowledge + ) + self.component = component + + def __str__(self) -> str: + if self.hypothesis == "": + return f"No hypothesis available. Trying to construct the first runnable {self.component} component." 
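+ # Otherwise, render the full hypothesis summary (component, hypothesis, reason, and the concise fields).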
+ return f"""Chosen Component: {self.component} +Hypothesis: {self.hypothesis} +Reason: {self.reason} +Concise Reason & Knowledge: {self.concise_reason} +Concise Observation: {self.concise_observation} +Concise Justification: {self.concise_justification} +Concise Knowledge: {self.concise_knowledge} +""" + + +COMPONENT_TASK_MAPPING = { + "DataLoadSpec": { + "target_name": "Data loader and specification generation", + "spec_file": "spec/data_loader.md", + "task_output_format": T(".prompts:output_format.data_loader").r(), + "task_class": DataLoaderTask, + }, + "FeatureEng": { + "target_name": "Feature engineering", + "spec_file": "spec/feature.md", + "task_output_format": T(".prompts:output_format.feature").r(), + "task_class": FeatureTask, + }, + "Model": { + "target_name": "Building model", + "spec_file": "spec/model.md", + "task_output_format": T(".prompts:output_format.model").r(), + "task_class": ModelTask, + "extra_params": { + "model_type": "Model type not provided", + "architecture": "Model architecture not provided", + "hyperparameters": "Model hyperparameters not provided", + }, + "extra_requirement": T(".prompts:extra_requirement.model").r(), + }, + "Ensemble": { + "target_name": "Ensemble", + "spec_file": "spec/ensemble.md", + "task_output_format": T(".prompts:output_format.ensemble").r(), + "task_class": EnsembleTask, + }, + "Workflow": { + "target_name": "Workflow", + "spec_file": "spec/workflow.md", + "task_output_format": T(".prompts:output_format.workflow").r(), + "task_class": WorkflowTask, + }, +} + + +class DSTrace(Trace[DataScienceScen, KnowledgeBase]): + def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = None) -> None: + self.scen: DataScienceScen = scen + self.hist: list[tuple[DSExperiment, ExperimentFeedback]] = [] + self.knowledge_base = knowledge_base + + def sota_experiment(self, last_n: int = -1) -> DSExperiment | None: + """ + Access the last experiment result. + + Parameters + ---------- + last_n : int + The index from the last experiment result to access. + Use -1 for the most recent experiment, -2 for the second most recent, and so on. + + Returns + ------- + Experiment or None + The experiment result if found, otherwise None. + """ + assert last_n < 0 + for exp, ef in self.hist[::-1]: + # the sota exp should be accepted decision and all required components are completed. + if ef.decision and exp.next_component_required() is None: + last_n += 1 + if last_n == 0: + return exp + return None + + def last_successful_exp(self) -> DSExperiment | None: + """ + Access the last successful experiment even part of the components are not completed. 
+ """ + for exp, ef in self.hist[::-1]: + if ef.decision: + return exp + return None + + +class DSExpGen(ExpGen): + """Data Science Task Generator.""" + + def __init__(self, scen: DataScienceScen, max_trace_hist: int = 3) -> None: + self.max_trace_hist = max_trace_hist # max number of historical trace to know when propose new experiment + super().__init__(scen) + + def _init_task_gen( + self, + targets: str, + scenario_desc: str, + task_output_format: str, + workspace_code: str | None = None, + spec: str = None, + hypothesis: Hypothesis | None = None, + exp_and_feedback_desc: str | None = None, + ) -> dict: + system_prompt = T(".prompts:task_gen.system").r( + targets=targets, + scenario=scenario_desc, + task_specification=spec, + hypothesis=hypothesis, + task_output_format=task_output_format, + ) + user_prompt = T(".prompts:task_gen.user").r( + targets=targets, + hypothesis=hypothesis, + workspace_code=workspace_code, + exp_and_feedback_desc=exp_and_feedback_desc, + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + + return resp_dict + + def _handle_missing_component( + self, + component: COMPONENT, + task_cls: type, + scenario_desc: str, + trace: Trace, + last_successful_exp: DSExperiment | None, + spec_file: str | None = None, + component_prompt_key: str | None = None, + ) -> DSExperiment: + """Handle any component using a unified approach. + + Args: + component: Name of the component (e.g. "DataLoadSpec") + task_cls: The task class to instantiate (e.g. DataLoaderTask) + scenario_desc: Description of the current scenario + last_successful_exp: Last successful experiment or None + spec_file: Path to specification file if needed + """ + resp_dict = self._init_task_gen( + targets=component, + scenario_desc=scenario_desc, + spec=last_successful_exp.experiment_workspace.file_dict[spec_file] if spec_file else None, + task_output_format=T(f".prompts:output_format.{component_prompt_key or component.lower()}").r(), + ) + + # Create task instance + exp_and_feedback = trace.hist[-1] if len(trace.hist) > 0 else None + if ( + exp_and_feedback + and exp_and_feedback[1].exception is not None + and ( + exp_and_feedback[0].pending_tasks_list[0][0].name == component + or exp_and_feedback[0].pending_tasks_list[0][0].name.startswith("model_") + and component == "Model" + ) + ): # Assumption: when completing missing component, using component name as task name + resp_dict[ + "description" + ] += f"\n\nYou have tried to implement the same component and got the following exception: \n{exp_and_feedback[1].exception}\n Please try different methods to avoid the same errors and results in an infinite loop" + + task = task_cls( + name=component if component != "Model" else resp_dict.pop("model_name"), + description=resp_dict.get("description", f"{component} description not provided"), + **resp_dict.get("extra_params", {}), + ) + + exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=DSHypothesis(component)) + if last_successful_exp: + exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path) + return exp + + def gen(self, trace: DSTrace) -> DSExperiment: + scenario_desc = trace.scen.get_scenario_all_desc() + last_successful_exp = trace.last_successful_exp() + + if len(trace.hist) == 0 or last_successful_exp is None: + next_missing_component = "DataLoadSpec" + else: + next_missing_component = last_successful_exp.next_component_required() + + 
init_component_config = { + "DataLoadSpec": {"task_cls": DataLoaderTask, "spec_file": None, "component_prompt_key": "data_loader"}, + "FeatureEng": {"task_cls": FeatureTask, "spec_file": "spec/feature.md", "component_prompt_key": "feature"}, + "Model": {"task_cls": ModelTask, "spec_file": "spec/model.md", "component_prompt_key": "model"}, + "Ensemble": {"task_cls": EnsembleTask, "spec_file": "spec/ensemble.md", "component_prompt_key": "ensemble"}, + "Workflow": {"task_cls": WorkflowTask, "spec_file": "spec/workflow.md", "component_prompt_key": "workflow"}, + } + + if next_missing_component in init_component_config: + # TODO: we may merge the if else logic in the future. + # the current + config = init_component_config[next_missing_component] + return self._handle_missing_component( + component=next_missing_component, + task_cls=config["task_cls"], + scenario_desc=scenario_desc, + last_successful_exp=last_successful_exp, + spec_file=config.get("spec_file"), + trace=trace, + component_prompt_key=config.get("component_prompt_key"), + ) + else: # propose new component by LLM + # Guidelines: + # System prompts: Shared condition you are facing + # - scenario description: `scenario_desc` + # - expected output format + # User prompts: Task Specific information + # - Previous Feedback + # - Current sota implementation (encourage change based on it) + # - Extra RAG + sota_exp = trace.sota_experiment() + assert sota_exp is not None, "SOTA experiment is not provided." + exp_and_feedback = trace.hist[-1] + last_exp = exp_and_feedback[0] + + # Step 1: Generate component + # Describe current best solution using shared template + sota_exp_desc = T("scenarios.data_science.share:describe.exp").r( + exp=sota_exp, heading="Best of previous exploration of the scenario" + ) + last_exp_diff = "\n".join( + generate_diff( + sota_exp.experiment_workspace.workspace_path, last_exp.experiment_workspace.workspace_path + ) + ) + exp_and_feedback_desc = T("scenarios.data_science.share:describe.feedback").r( + exp_and_feedback=exp_and_feedback + ) + + # Generate component using template with proper context + component_sys_prompt = T(".prompts:component_gen.system").r( + scenario=scenario_desc, + sota_exp_desc=sota_exp_desc, + last_exp_diff=last_exp_diff, + component_output_format=T(".prompts:output_format.component").r(), + ) + + component_user_prompt = T(".prompts:component_gen.user").r( + exp_and_feedback_desc=exp_and_feedback_desc, + ) + + resp_dict_component: dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + component_user_prompt, component_sys_prompt, json_mode=True + ) + ) + + component = resp_dict_component.get("component", "Component not provided") + + # Why we should split component selection and steps after? + # - after we know the selected component, we can use RAG. 
+ + # Step 2: Generate the rest of the hypothesis & task + component_info = COMPONENT_TASK_MAPPING.get(component) + + if component_info: + system_prompt = T(".prompts:direct_exp_gen.system").r( + targets=component_info["target_name"], + component=component, + scenario=scenario_desc, + hypothesis_output_format=T(".prompts:output_format.hypothesis").r(), + task_specification=sota_exp.experiment_workspace.file_dict[component_info["spec_file"]], + task_output_format=component_info["task_output_format"], + extra_requirement=component_info.get("extra_requirement"), + workflow_check=(not component == "Workflow"), + ) + + recent_trace_desc = [] + for i in range(self.max_trace_hist): + if i < len(trace.hist): + eaf = trace.hist[-i - 1] + if eaf[1].decision: + # we only add failed direction incase of trying same invalid direction + break + recent_trace_desc.insert( + 0, T("scenarios.data_science.share:describe.feedback").r(exp_and_feedback=eaf) + ) + user_prompt = T(".prompts:direct_exp_gen.user").r( + exp_and_feedback_desc=exp_and_feedback_desc, + sota_exp_desc=sota_exp_desc, + last_exp_diff=last_exp_diff, + recent_trace_desc="\n".join(recent_trace_desc), + ) + + resp_dict = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + assert "hypothesis_proposal" in resp_dict, "Hypothesis proposal not provided." + assert "task_design" in resp_dict, "Task design not provided." + task_class = component_info["task_class"] + hypothesis_proposal = resp_dict.get("hypothesis_proposal", {}) + hypothesis = DSHypothesis( + component=component, + hypothesis=hypothesis_proposal.get("hypothesis", ""), + reason=hypothesis_proposal.get("reason", ""), + concise_reason=hypothesis_proposal.get("concise_reason", ""), + concise_observation=hypothesis_proposal.get("concise_observation", ""), + concise_justification=hypothesis_proposal.get("concise_justification", ""), + concise_knowledge=hypothesis_proposal.get("concise_knowledge", ""), + ) + + task_design = resp_dict.get("task_design", {}) + task_name = task_design["model_name"] if component == "Model" else component + description = task_design.get( + "description", f"{component_info['target_name']} description not provided" + ) + task = task_class( + name=task_name, + description=description, + **{k: task_design.get(k, v) for k, v in component_info.get("extra_params", {}).items()}, + ) + + exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=hypothesis) + exp.experiment_workspace.inject_code_from_folder(sota_exp.experiment_workspace.workspace_path) + + new_workflow_desc = resp_dict.get("workflow_update", "No update needed") + if new_workflow_desc != "No update needed": + workflow_task = WorkflowTask( + name="Workflow", + description=new_workflow_desc, + ) + exp.pending_tasks_list.append([workflow_task]) + return exp + else: + raise ValueError(f"Unknown component: {component}") diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml new file mode 100644 index 000000000..03989e6d9 --- /dev/null +++ b/rdagent/scenarios/data_science/proposal/prompts.yaml @@ -0,0 +1,340 @@ +hypothesis_gen: # It is deprecated now, please refer to direct_exp_gen + system: |- + The user is working on generating new hypotheses for the {{ targets }} in a data-driven research and development process. 
+ The {{ targets }} are used in the following scenario: + {{ scenario }} + + The user has already proposed several hypotheses and conducted evaluations. This information will be provided to you. Your task is to: + 1. Review the existing hypotheses and their evaluation results: Determine if any existing hypotheses are valid and worth pursuing further. + 2. Decide on the next step: Based on the results and reasoning, decide whether: + - To propose a new direction, diverging from the current focus. + - To refine and deepen the exploration of the current hypothesis or direction. + 3. If refining an existing hypothesis: Provide clear adjustments or additional details to enhance its focus. + 4. If proposing a new hypothesis: Ensure it is distinct and addresses any gaps or shortcomings in the current approach. + + The current component to focus on is: {{ component }}. + {% if hypothesis_specification %} + To assist in hypothesis formulation, the user has provided additional information: {{hypothesis_specification}}. + Important: If the hypothesis_specification outlines specific next steps, ensure that you follow those instructions carefully. + {% endif %} + Please generate the output using the following format and specifications: + {{ hypothesis_output_format }} + + user: |- + {% if exp_and_feedback_desc|length == 0 %} + This is the first round of hypothesis generation. The user has not yet proposed any hypotheses for this scenario. + {% else %} + This is not the first round. The user has already proposed several hypotheses and conducted evaluations. + + The previous hypotheses and their corresponding feedback are as follows (focus on the most recent hypothesis, its derived insights, and reasoning): + {{exp_and_feedback_desc}} + {% endif %} + + In addition, generate relevant reasoning and distilled knowledge keys. + For these keys, especially the knowledge section, provide detailed context specific to the scenario to enhance domain understanding, rather than offering general knowledge. + +hypothesis_model: # It is deprecated now, please refer to direct_exp_gen + system: |- + The user is working on generating new hypotheses for the {{ targets }} in a data-driven research and development process. + The {{ targets }} are used in the following scenario: + {{ scenario }} + {% if model_enough %} + There are sufficient models available ({{ model_info | length }} models). Your task is to choose one of the existing models for further tuning or optimization. Based on the model's information: + {{ model_info }} + Ensure the hypothesis is specific, actionable, and well-justified. + {% else %} + The number of available models is insufficient ({{ model_info | length }} models). Your task is to first decide whether to: + - Tune an existing model: Select one of the current models for further tuning and improvement. + - Add a new model: Introduce a new model to expand the hypothesis space. + Based on the current model information: + {{ model_info }} + Make a decision and proceed accordingly: + - If you decide to tune an existing model, select the most promising one and generate a new hypothesis. + - If you decide to add a new model, specify the type of model you would add and generate a new hypothesis related to the new model. + {% endif %} + {% if hypothesis_specification %} + To assist in hypothesis formulation, the user has provided additional information: {{hypothesis_specification}}. 
+ Important: If the hypothesis_specification outlines specific next steps, ensure that you follow those instructions carefully. + {% endif %} + Please generate the output using the following format and specifications: + {{ hypothesis_output_format }} + +hypothesis_and_feedback: |- + {% for experiment, feedback in hist %} + Hypothesis {{ loop.index }} + The experiment is design driven by hypothesis : {{ experiment.hypothesis }} + Observation on the result with the hypothesis: {{ feedback.observations }} + Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} + Did changing to this hypothesis work? (focus on the change): {{ feedback.decision }} + {% endfor %} + +task_gen: # It is deprecated now, please refer to direct_exp_gen + system: |- + {% if hypothesis is not none %} + The user is trying to generate new {{ targets }} based on the hypothesis generated in the previous step. + {% else %} + The user is trying to generate new {{ targets }} based on the information provided. + {% endif %} + The {{ targets }} are used in certain scenario, the scenario is as follows: + {{ scenario }} + + {% if task_specification is not none %} + The user has wrote some specification for the {{ targets }}. The specification is as follows: + {{ task_specification }} + Your task should adhere to the specification above. + {% endif %} + + {% if hypothesis is not none %} + The user will use the {{ targets }} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{ targets }} for. + 2. The hypothesis generated in the previous steps and their corresponding feedbacks. + 3. Former proposed {{ targets }} on similar hypothesis. + 4. Some additional information to help you generate new {{ targets }}. + {% endif %} + + Please generate the output following the format below: + {{ task_output_format }} + + user: |- + {% if workspace_code %} + Here is a list of all the filenames and their corresponding content in the workspace: + {{workspace_code}} + {% endif %} + + {% if hypothesis is not none %} + The user has made several hypothesis on this scenario and did several evaluation on them. + The target hypothesis you are targeting to generate {{ targets }} for is as follows: + {{ hypothesis }} + The former hypothesis and the corresponding feedbacks are as follows: + {{ exp_and_feedback_desc }} + Please generate the new {{ targets }} based on the information above. + {% else %} + Please generate the new {{ targets }} task. + {% endif %} + +task_gen_model: # It is deprecated now, please refer to direct_exp_gen + system: |- + {% if hypothesis is not none %} + The user is trying to generate new {{ targets }} based on the hypothesis generated in the previous step. + {% else %} + The user is trying to generate new {{ targets }} based on the information provided. + {% endif %} + The {{ targets }} are used in certain scenario, the scenario is as follows: + {{ scenario }} + + {% if hypothesis is not none %} + The user will use the {{ targets }} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{ targets }} for. + 2. The hypothesis generated in the previous steps and their corresponding feedbacks. + 3. Former proposed {{ targets }} on similar hypothesis. + 4. Some additional information to help you generate new {{ targets }}. 
+ {% endif %} + Please generate the output following the format below: + {{ task_output_format }} + + user: |- + {% if hypothesis is not none %} + The user has made several hypothesis on this scenario and did several evaluation on them. + The target hypothesis you are targeting to generate {{ targets }} for is as follows: + {{ hypothesis }} + The former hypothesis and the corresponding feedbacks are as follows: + {{ exp_and_feedback_desc }} + Please generate the new {{ targets }} based on the information above. + {% else %} + Please generate the new {{ targets }} task. + {% endif %} + +direct_exp_gen: + system: |- + You are a data scientist and a top Kaggle competitor. The user is working on creating a solution for a Kaggle competition. Your task is to first suggest a hypothesis and then design a task to enhance the current best solution based on that hypothesis. + + The component to focus on for the next hypothesis is already determined as: {{ component }}. + It will be used in the following scenario: + {{ scenario }} + + # Hypothesis Proposal + + The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you later. Your task is to check if a similar hypothesis has already been generated. If one exists and you agree with it, you can use it. If you disagree, please create an improved version. + + To assist you in formulating new hypotheses, the user has provided some additional information: + Hypothesis should avoid being too general and vague, and should be specific and actionable. For example, hypothesis like 'tune a model' is too general, while hypothesis like 'increase the learning rate to 0.1 of the lightgbm model will improve the performance' is specific and actionable. + Your hypothesis should based on current SOTA solution. The user will conduct experiments based on the SOTA solution(current best experiments) to test whether your hypothesis is right on this specific competition. + Important: If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions. + + [Partial Response Format 1]Your generated output should contain key-value pairs adhering to the following format and specifications: + {{ hypothesis_output_format }} + Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge. + + # Task Design + + The user is trying to generate new {{ targets }} based on the hypothesis generated in the previous step. + + The scope of the {{ targets }} can be described by a interface specification as follows + ```Python + {{task_specification}} + ``` + + The user will use the {{ targets }} generated to do some experiments. The user will provide this information to you: + 1. The target hypothesis you are targeting to generate {{ targets }} for. + 2. The hypothesis generated in the previous steps and their corresponding feedbacks. + 3. Former proposed {{ targets }} on similar hypothesis. + 4. Some additional information to help you generate new {{ targets }}. 
+ + [Partial Response Format 2] Your generated output should contain key-value pairs adhering to the following format and specifications: + {{ task_output_format }} + + {% if workflow_check %} + # Workflow update + Since components have dependencies, the workflow should be updated to reflect the changes made to the target component. Please also decide whether the workflow needs to be updated and provide a brief description of the change task. + [Partial Response Format 3] Your generated workflow description should be simple text; the following agent will do the implementation. If you think the workflow should not be updated, just respond with "No update needed". + {% endif %} + + Your response should contain two parts: the hypothesis proposal and the task design. Please follow the format and specifications provided below: + { + "hypothesis_proposal": [Partial Response Format 1], + "task_design": [Partial Response Format 2], + {% if workflow_check %}"workflow_update": [Partial Response Format 3], {% endif %} + } + + {% if extra_requirement %} + {{extra_requirement}} + {% endif %} + + user: |- + # The detailed description of current best experiments + {{sota_exp_desc}} + + ## The corresponding feedback for the best experiments + {{ exp_and_feedback_desc }} + + {% if recent_trace_desc %} + # Several trials after the best experiments + The user has made several hypotheses on this scenario and evaluated them. + The former hypotheses and the corresponding feedback are as follows (focus on the last one and the new hypothesis and reasoning it provides, to see if you agree): + {{recent_trace_desc}} + + # The difference from the best experiments to the last one + {{last_exp_diff}} + {% endif %} + + +extra_requirement: + model: |- + If there are sufficient models available, your task is to choose one of the existing models for further tuning or optimization, based on the model's information. + + If the number of available models is insufficient, your task is to first decide whether to: + - Tune an existing model: Select one of the current models for further tuning and improvement. + - Add a new model: Introduce a new model to expand the hypothesis space. + + The information of the models is described by the code in the workspace. + + Make a decision and proceed accordingly: + - If you decide to tune an existing model, select the existing model file and generate a new hypothesis. + - If you decide to add a new model, specify the type of model you would add and generate a new hypothesis related to the new model. + + +component_gen: + system: |- + You are a Kaggle Grandmaster. You are going to provide a solution for a Kaggle competition. + + Here is the description of the competition scenario: + ``` + {{ scenario }} + ``` + + # Here is the current best version of the implementation. + {{sota_exp_desc}} + + {% if last_exp_diff %} + # Here is the latest version of the implementation, which differs from the SOTA implementation above + {{ last_exp_diff }} + {% endif %} + + You will be provided with the feedback for the latest implementation. + + Please select the component you are going to improve, in either the latest implementation or the SOTA implementation.
+ + Please generate the output following the format below: + {{ component_output_format }} + + user: |- + {{ exp_and_feedback_desc }} + + +exp_and_feedback: |- + {% for experiment, feedback in trace.hist[-10:] %} + ## Experiment {{ loop.index }} + Experiment are focusing on task: {{experiment.pending_tasks_list[0][0]}} + {% if experiment.hypothesis %} + The experiment is design driven by hypothesis : {{ experiment.hypothesis }} + Observation on the result with the hypothesis: {{ feedback.observations }} + {% endif %} + Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} + Did changing to this hypothesis work? (focus on the change): {{ feedback.decision }} + {% endfor %} + +hypothesis_specification: |- + Hypothesis should avoid being too general and vague, and should be specific and actionable. For example, hypothesis like 'tune a model' is too general, while hypothesis like 'increase the learning rate to 0.1 of the lightgbm model will improve the performance' is specific and actionable. + Your hypothesis should based on current SOTA solution. The user will conduct experiments based on the SOTA solution to test whether your hypothesis is right on this specific competition. + {{ sota_solution}} + +output_format: + component: |- + { + "component": "The component you suggest to focus on. It must be one of ['DataLoadSpec', 'FeatureEng', 'Model', 'Ensemble', 'Workflow']." + } + hypothesis: |- + The output should follow JSON format. The schema is as follows: + { + "component": "If "hypothesis_specification" provides the component you need to take, please follow "hypothesis_specification" to choose the component. Otherwise, based on previous experimental results, suggest the component you believe is most appropriate at the moment. It should be one of ["DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow"]", + "hypothesis": "The new hypothesis generated based on the information provided.", + "reason": "The reason why you generate this hypothesis. It should be comprehensive and logical. It should cover the other keys below and extend them.", + "concise_reason": "Two-line summary. First line focuses on a concise justification for the change. Second line generalizes a knowledge statement.", + "concise_observation": "One line summary. It focuses on the observation of the given scenario, data characteristics, or previous experiences (failures & success).", + "concise_justification": "One line summary. Justify the hypothesis based on theoretical principles or initial assumptions.", + "concise_knowledge": "One line summary. Transferable knowledge based on theoretical principles. Use conditional grammar. eg. "If...., ..; When..., .; and etc" Make sure that you state things clearly without ambiguity. Eg. avoid saying "previous hypothesis", because one wouldn't know what that is." + } + data_loader: |- + According to the hypothesis, please help user design one data loader task. + The output should follow JSON format. The schema is as follows: + { + "description": "description of the overall data loader for the data science workflow", + # Don't add ellipsis (...) or any filler text that might cause JSON parsing errors here! + } + feature: |- + According to the hypothesis, please help user design one or more feature engineering tasks. + The output should follow JSON format. The schema is as follows: + { + "description": "description of feature engineering task", + # Don't add ellipsis (...) or any filler text that might cause JSON parsing errors here! 
+ } + model: |- + According to the hypothesis, please help user design one model task. + The output should follow JSON format. The schema is as follows: + { + "model_name": "model name, must start with 'model_' and only contain letters, numbers, and underscores", + "description": "A detailed description of the model", + "extra_params": + { + "model_type": "The type of the model, e.g., neural network, tree-based model, etc.", + "architecture": "A detailed description of the model's architecture, e.g., neural network layers or tree structures", + "hyperparameters": { + "hyperparameter_name_1": "value of hyperparameter 1", + "hyperparameter_name_2": "value of hyperparameter 2", + "hyperparameter_name_3": "value of hyperparameter 3" + }, + }, + } + Usually, a larger model works better than a smaller one. Hence, the parameters should be larger. + ensemble: |- + According to the hypothesis, please help user design one ensemble task. + The output should follow JSON format. The schema is as follows: + { + "description": "A detailed description of the ensemble", + } + workflow: |- + According to the hypothesis, please help user design one workflow task. + The output should follow JSON format. The schema is as follows: + { + "description": "A detailed description of the workflow", + } diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py new file mode 100644 index 000000000..9c6d7cbf9 --- /dev/null +++ b/rdagent/scenarios/data_science/scen/__init__.py @@ -0,0 +1,338 @@ +import json +import os +from pathlib import Path + +import pandas as pd +from PIL import Image, TiffTags + +from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.core.scenario import Scenario +from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.kaggle.kaggle_crawler import ( + crawl_descriptions, + leaderboard_scores, +) +from rdagent.utils.agent.tpl import T + + +def read_csv_head(file_path, indent=0, lines=5, max_col_width=100): + """ + Reads the first few rows of a CSV file and formats them with indentation and optional truncation. + + Parameters: + file_path (str): Path to the CSV file. + indent (int): Number of spaces to prepend to each line for indentation. + lines (int): Number of rows to read from the CSV file. + max_col_width (int): Maximum width of each column's content. + + Returns: + str: A formatted string of the first few rows of the CSV file. + """ + try: + # Read the CSV file with specified rows + df = pd.read_csv(file_path, nrows=lines) + + if df.empty: + return " " * indent + "(No data in the file)" + + # Truncate column contents to a maximum width + truncated_df = df.copy() + for col in truncated_df.columns: + truncated_df[col] = ( + truncated_df[col] + .astype(str) + .apply(lambda x: (x[:max_col_width] + "...") if len(x) > max_col_width else x) + ) + + # Convert DataFrame to a string representation + df_string_lines = truncated_df.to_string(index=False).split("\n") + + # Add indentation to each line + indented_lines = [" " * indent + line for line in df_string_lines] + + return "\n".join(indented_lines) + except FileNotFoundError: + return f"Error: File not found at path '{file_path}'." + except pd.errors.EmptyDataError: + return f"Error: The file at '{file_path}' is empty." 
+ except Exception as e: + return f"Error reading CSV: {e}" + + +def get_dir_snapshot(folder_path): + """ + [note] + - Returns a set of file extensions within the subfolder (excluding subfolder names) + - Compares only the types of files contained, not specific file names or quantities + """ + exts = set() + try: + with os.scandir(folder_path) as it: + for entry in it: + if entry.is_file(): + file_ext = os.path.splitext(entry.name)[1] + exts.add(file_ext) + except Exception as e: + logger.error(f"Error scanning directory: {e}") + + return frozenset(exts) + + +def describe_data_folder(folder_path, indent=0, max_files=2, partial_expand_subfolders=2, is_top_level=True): + """ + folder_path : Current directory path + indent : Current indentation + max_files : Maximum number of files of the same type to display + partial_expand_subfolders: When all subfolders have the same internal file types, only expand this many subfolders, the rest are omitted + is_top_level : Indicates if the current folder is the top-level folder + """ + result = [] + files_count = {} + files_details = {} + + for root, dirs, files in os.walk(folder_path): + dirs.sort() + if not dirs: + for file in files: + print(file) + file_path = os.path.join(root, file) + file_type = os.path.splitext(file)[1][1:] + file_size = os.path.getsize(file_path) + + if file_type not in files_count: + files_count[file_type] = 0 + files_details[file_type] = [] + files_count[file_type] += 1 + + # At top level, collect all CSV and Markdown files without restrictions + # In deeper levels, follow the max_files restriction + if is_top_level and file_type in ["csv", "md"]: + files_details[file_type].append((file, file_size, file_path)) + elif len(files_details[file_type]) < max_files: + files_details[file_type].append((file, file_size, file_path)) + break + + # Collect "type snapshots" of subfolders + snapshots = [] + for d in dirs: + subfolder_path = os.path.join(root, d) + snapshot = get_dir_snapshot(subfolder_path) + snapshots.append(snapshot) + + # Determine if all subfolders have the same file type distribution + first_snapshot = snapshots[0] + all_same_structure = all(s == first_snapshot for s in snapshots) + + if all_same_structure: + for i, d in enumerate(dirs): + if i < partial_expand_subfolders: + result.append(" " * indent + f"- Folder: {d}") + subfolder_path = os.path.join(root, d) + result.append( + describe_data_folder( + folder_path=subfolder_path, + indent=indent + 2, + max_files=max_files, + partial_expand_subfolders=partial_expand_subfolders, + is_top_level=False, + ) + ) + else: + remaining = len(dirs) - i + result.append(" " * indent + f"... 
({remaining} more subfolders)") + break + else: + for d in dirs: + result.append(" " * indent + f"- Folder: {d}") + subfolder_path = os.path.join(root, d) + result.append( + describe_data_folder( + folder_path=subfolder_path, + indent=indent + 2, + max_files=max_files, + partial_expand_subfolders=partial_expand_subfolders, + is_top_level=False, + ) + ) + + for file in files: + file_path = os.path.join(root, file) + file_type = os.path.splitext(file)[1][1:] + file_size = os.path.getsize(file_path) + + if file_type not in files_count: + files_count[file_type] = 0 + files_details[file_type] = [] + files_count[file_type] += 1 + + # At top level, collect all CSV and Markdown files without restrictions + # In deeper levels, follow the max_files restriction + if is_top_level and file_type in ["csv", "md"]: + files_details[file_type].append((file, file_size, file_path)) + elif not is_top_level and len(files_details[file_type]) <= max_files: + files_details[file_type].append((file, file_size, file_path)) + + break + + # Print the folder and its contents + for file_type, count in files_count.items(): + if count > max_files and file_type not in ["csv", "md"]: + result.append(" " * indent + f"{count} {file_type}s:") + for file, size, path in files_details[file_type]: + result.append(" " * (indent + 2) + f"- {file} ({size} bytes)") + result.append(" " * (indent + 2) + "... (file limit reached)") + else: + for file, size, path in files_details[file_type]: + result.append(" " * indent + f"- {file} ({size} bytes)") + if file_type == "csv": + result.append(" " * (indent + 2) + f"- Head of {file}:") + csv_head = read_csv_head(path, indent + 4) + # if len(csv_head) > 300: + # csv_head = " ".join(csv_head.strip().split()) + # csv_head = csv_head[:300] + "\n" + " " * (indent + 4) + "... (truncated)" + result.append(csv_head) + if file_type == "md": + result.append(" " * (indent + 2) + f"- Content of {file}:") + if file == "description.md": + result.append(" " * (indent + 4) + f"Please refer to the background of the scenario context.") + continue + with open(path, "r", encoding="utf-8") as f: + result.append(" " * (indent + 4) + f.read()) + if file_type == "tif": + result.append(" " * (indent + 2) + f"- Metadata of {file}:") + with Image.open(path) as img: + for tag, value in img.tag_v2.items(): + tag_name = TiffTags.TAGS_V2.get(tag, f"Unknown Tag {tag}") + result.append(" " * (indent + 4) + f"{tag_name}: {value}") + if file_type == "json": + result.append(" " * (indent + 2) + f"- Content of {file}:") + with open(path, "r", encoding="utf-8") as f: + for i, line in enumerate(f): + if i < 2: + result.append( + " " * (indent + 4) + line.strip()[:100] + ("..." if len(line.strip()) > 100 else "") + ) + else: + break + + return "\n".join(result) + "\n" + + +class DataScienceScen(Scenario): + """Data Science Scenario""" + + def __init__(self, competition: str) -> None: + self.competition = competition + self.raw_description = self._get_description() + self.processed_data_folder_description = self._get_data_folder_description() + self._analysis_competition_description() + self.metric_direction = self._get_direction() + + def _get_description(self): + if (fp := Path(f"{DS_RD_SETTING.local_data_path}/{self.competition}.json")).exists(): + logger.info(f"Found {self.competition}.json, loading from local file.") + with fp.open("r") as f: + return json.load(f) + else: + logger.error( + f"Cannot find {self.competition}.json in {DS_RD_SETTING.local_data_path}, please check the file." 
+ ) + + def _get_direction(self): + return self.metric_direction_guess if hasattr(self, "metric_direction_guess") else True + + def _analysis_competition_description(self): + sys_prompt = T(".prompts:competition_description_template.system").r() + user_prompt = T(".prompts:competition_description_template.user").r( + competition_raw_description=self.raw_description, + competition_processed_data_folder_description=self.processed_data_folder_description, + ) + + response_analysis = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=sys_prompt, + json_mode=True, + ) + + response_json_analysis = json.loads(response_analysis) + self.task_type = response_json_analysis.get("Task Type", "No type provided") + self.data_type = response_json_analysis.get("Data Type", "No data type provided") + self.brief_description = response_json_analysis.get("Brief Description", "No brief description provided") + self.dataset_description = response_json_analysis.get("Dataset Description", "No dataset description provided") + self.target_description = response_json_analysis.get("Evaluation Description", "No target description provided") + self.submission_specifications = response_json_analysis.get( + "Submission Specifications", "No submission requirements provided" + ) + self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1) + self.metric_direction_guess = response_json_analysis.get("Metric Direction", True) + + def get_competition_full_desc(self) -> str: + return f"""Task Type: {self.task_type} + Data Type: {self.data_type} + Brief Description: {self.brief_description} + Dataset Description: {self.dataset_description} + Target Description: {self.target_description} + Submission Specifications: {self.submission_specifications} + Model Output Channel: {self.model_output_channel} + """ + + @property + def background(self) -> str: + background_template = T(".prompts:competition_background") + background_prompt = background_template.r( + task_type=self.task_type, + data_type=self.data_type, + brief_description=self.brief_description, + dataset_description=self.dataset_description, + target_description=self.target_description, + ) + return background_prompt + + @property + def rich_style_description(self) -> str: + return T(".prompts:rich_style_description").r( + name="Data Science", + competition=self.competition, + ) + + def get_scenario_all_desc(self) -> str: + return T(".prompts:scenario_description").r( + background=self.background, + submission_specifications=self.submission_specifications, + evaluation=self.target_description, + metric_direction=self.metric_direction, + ) + + def _get_data_folder_description(self) -> str: + return describe_data_folder(Path(DS_RD_SETTING.local_data_path) / self.competition) + + +class KaggleScen(DataScienceScen): + """Kaggle Scenario + It is based on kaggle now. + - But it is not use the same interface with previous kaggle version. + - Ideally, we should reuse previous kaggle scenario. + But we found that too much scenario unrelated code in kaggle scenario and hard to reuse. + So we start from a simple one.... 
+ """ + + def _get_description(self): + return crawl_descriptions(self.competition, DS_RD_SETTING.local_data_path) + + def _get_direction(self): + if DS_RD_SETTING.if_using_mle_data: + return super()._get_direction() + leaderboard = leaderboard_scores(self.competition) + return "maximize" if float(leaderboard[0]) > float(leaderboard[-1]) else "minimize" + + @property + def rich_style_description(self) -> str: + return T(".prompts:rich_style_description").r( + name="Kaggle", + competition=f"[{self.competition}](https://www.kaggle.com/competitions/{self.competition})", + ) + + +if __name__ == "__main__": + print(describe_data_folder(Path("/data/userdata/share/mle_kaggle") / "stanford-covid-vaccine")) diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml new file mode 100644 index 000000000..40fa4b0d0 --- /dev/null +++ b/rdagent/scenarios/data_science/scen/prompts.yaml @@ -0,0 +1,78 @@ +scenario_description: |- + ------Background of the scenario------ + {{background}} + + ------The expected output & submission format specifications------ + {{submission_specifications}} + + {% if evaluation is not none %} + ------Evaluation------ + {{evaluation}} + {% endif %} + + The evaluation metrics used is directed as: + {% if metric_direction %}The metric is better when it is bigger. + {% else %}The metric is better when it is smaller. + {% endif %} + +competition_description_template: + system: |- + You are a data science assistant that extracts structured information from unstructured text. + The user will provide you a Kaggle competition description, and you need to extract specific details from it. + For the dataset, the competition may not include detailed information about the dataset. The user has read the dataset and provide you the relevant information. Please include it in your response. + Please answer in Json format with the following schema: + { + "Task Type": "The type of competition task, e.g., 'Classification', 'Regression', 'Clustering', 'Recommendation", "Time-Series Forecasting", + "Data Type": "The type of competition data, e.g., 'Tabular', 'Time Series', 'Text (Natural Language Processing)', 'Image (Computer Vision)', 'Audio', 'Video'", + "Brief Description": "A brief description of the competition", + "Dataset Description": "The dataset utilized in the competition is described based on two sources: the Competition Description, which provides contextual details about the original files, and the Processed Data folder description, which outlines the structure of the dataset after processing. While there may be differences—for instance, original files mentioned in the Competition Description (e.g., .zip files) may have been extracted or restructured—your task is to interpret the new file structure accurately (do not contain any file or folder that is not in Processed Data folder description) and reconcile it with the contextual information from the Competition Description to provide a clear and updated explanation.", + "Evaluation Description": "A description of the evaluation used in the competition.", + "Submission Specifications": "The submission specification & sample submission file descriptions for the model to output." + "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1." 
+ "Metric direction": True or False as True means bigger metric number is better, False means smaller is better. + } + user: |- + Competition Description: + {{ competition_raw_description }} + + Processed Data folder description: + {{ competition_processed_data_folder_description }} + + [Note] There may be some discrepancies between the competition description and the processed data folder description. Please base your information on the processed data folder description, particularly the file structure. + + +competition_background: |- + You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. + Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems. + You are dedicated to producing accurate, efficient, and innovative solutions. + + The task type for this competition is {{ task_type }}. + The data type used in this competition is {{ data_type }}. + Briefly, the competition involves: {{ brief_description }}. + The dataset used in this competition is: {{ dataset_description }}. + Your goal in this competition is to: {{target_description }}. + +rich_style_description: |- + ### {{ name }} Agent: Automated Feature Engineering & Model Tuning Evolution + + #### [Overview](#_summary) + + In this scenario, our automated system proposes hypothesis, choose action, implements code, conducts validation, and utilizes feedback in a continuous, iterative process. + + #### {{ name }} Competition info + + Current Competition: {{ competition }} + + #### [Automated R&D](#_rdloops) + + - **[R (Research)](#_research)** + - Iteration of ideas and hypotheses. + - Continuous learning and knowledge construction. + + - **[D (Development)](#_development)** + - Evolving code generation, model refinement, and features generation. + - Automated implementation and testing of models/features. + + #### [Objective](#_summary) + + To automatically optimize performance metrics within the validation set, ultimately discovering the most efficient features and models through autonomous research and development. \ No newline at end of file diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml new file mode 100644 index 000000000..773eec221 --- /dev/null +++ b/rdagent/scenarios/data_science/share.yaml @@ -0,0 +1,38 @@ +describe: # some template to describe some object + # exp is a template used fo + exp: |- + ## {{ heading | default('Best solution of previous exploration of the scenario') }} + {% if exp %} + ### Code + Here is the complete code of the solution. + {{ exp.experiment_workspace.all_codes }} + + {% if exp.hypothesis is not none %} + ### Hypothesis for the experiment + the experiment is designed based on hypothesis: {{exp.hypothesis}} + {% endif %} + + ### Results + {% if exp.result is none %} + There are no according evaluation results + {% else %} + Evaluated results on validation is: + {{ exp.result }} + {% endif %} + + {% else %} + No previous complete experiment available. + {% endif %} + + feedback: |- + {% if exp_and_feedback and exp_and_feedback|length > 1 %} + ## {{heading | default('Previous trial and feedback')}} + Before current trial, a previous recent trial is listed below. 
+ {% if exp_and_feedback[0].hypothesis %} + the experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }} + {% endif %} + ### Task of previous trial + {{ exp_and_feedback[0].pending_tasks_list[0][0].get_task_information() }} + feedback decision: {{ exp_and_feedback[1].decision }} + reason: {{ exp_and_feedback[1].reason }} + {% endif %} diff --git a/rdagent/scenarios/kaggle/README.md b/rdagent/scenarios/kaggle/README.md index 940ac92dc..2ba601401 100644 --- a/rdagent/scenarios/kaggle/README.md +++ b/rdagent/scenarios/kaggle/README.md @@ -17,4 +17,14 @@ sudo mv chromedriver /usr/local/bin sudo chmod +x /usr/local/bin/chromedriver chromedriver --version -``` \ No newline at end of file +``` + +## config + +1. authentication: `~/.kaggle/kaggle.json` +2. Accept Rules in competition website. (Join Competition) + +## notebook crawler + +1. `download_notebooks()` +2. `convert_notebooks_to_text()` \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/developer/coder.py b/rdagent/scenarios/kaggle/developer/coder.py index 71c390bbb..a04054c57 100644 --- a/rdagent/scenarios/kaggle/developer/coder.py +++ b/rdagent/scenarios/kaggle/developer/coder.py @@ -68,5 +68,5 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment: .from_string(DEFAULT_SELECTION_CODE) .render(feature_index_list=chosen_index_to_list_index) ) - exp.experiment_workspace.inject_code(**{KG_SELECT_MAPPING[target_model_type]: code}) + exp.experiment_workspace.inject_files(**{KG_SELECT_MAPPING[target_model_type]: code}) return exp diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index 708c74e77..fe4effa0a 100644 --- a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -8,8 +8,8 @@ from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( + Experiment2Feedback, Hypothesis, - HypothesisExperiment2Feedback, HypothesisFeedback, Trace, ) @@ -22,7 +22,7 @@ DIRNAME = Path(__file__).absolute().resolve().parent -class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): +class KGExperiment2Feedback(Experiment2Feedback): def process_results(self, current_result, sota_result): # Convert the results to dataframes current_df = pd.DataFrame(current_result) @@ -46,7 +46,7 @@ def process_results(self, current_result, sota_result): return combined_df, evaluation_description - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. @@ -60,6 +60,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac Returns: Any: The feedback generated for the given experiment and hypothesis. 
""" + hypothesis = exp.hypothesis logger.info("Generating feedback...") current_result = exp.result @@ -105,12 +106,12 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac if hypothesis.action == "Model tuning": current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code elif hypothesis.action == "Model feature selection": - current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[ + current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.file_dict[ KG_SELECT_MAPPING[exp.sub_tasks[0].model_type] ] else: current_sub_exps_to_code = { - sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list + sub_ws.target_task.get_task_information(): sub_ws.all_codes for sub_ws in exp.sub_workspace_list } current_sub_exps_to_code_str = json.dumps(current_sub_exps_to_code, indent=2) current_result = exp.result @@ -118,7 +119,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac last_hypothesis_and_feedback = None if trace.hist and len(trace.hist) > 0: - last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2]) + last_hypothesis_and_feedback = (trace.hist[-1][0].hypothesis, trace.hist[-1][1]) # Prepare render dictionary render_dict = { @@ -156,16 +157,16 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided") reason = response_json.get("Reasoning", "No reasoning provided") decision = convert2bool(response_json.get("Replace Best Result", "no")) - leaderboard = self.scen.leaderboard - current_score = current_result.iloc[0] - sorted_scores = sorted(leaderboard, reverse=True) - import bisect - - if self.scen.evaluation_metric_direction: - insert_position = bisect.bisect_right([-score for score in sorted_scores], -current_score) - else: - insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores)) - percentile_ranking = (insert_position) / (len(sorted_scores)) * 100 + # leaderboard = self.scen.leaderboard + # current_score = current_result.iloc[0] + # sorted_scores = sorted(leaderboard, reverse=True) + # import bisect + + # if self.scen.evaluation_metric_direction: + # insert_position = bisect.bisect_right([-score for score in sorted_scores], -current_score) + # else: + # insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores)) + # percentile_ranking = (insert_position) / (len(sorted_scores)) * 100 experiment_feedback = { "hypothesis_text": current_hypothesis, diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py index 51890086b..9d407e02c 100644 --- a/rdagent/scenarios/kaggle/developer/runner.py +++ b/rdagent/scenarios/kaggle/developer/runner.py @@ -73,11 +73,11 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment: # TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list. 
model_type = sub_ws.target_task.model_type - if sub_ws.code_dict == {}: + if sub_ws.file_dict == {}: raise ModelEmptyError("No model is implemented.") else: model_file_name = f"model/model_{model_type.lower()}.py" - exp.experiment_workspace.inject_code(**{model_file_name: sub_ws.code_dict["model.py"]}) + exp.experiment_workspace.inject_files(**{model_file_name: sub_ws.file_dict["model.py"]}) else: raise ModelEmptyError("No model is implemented.") env_to_use = {"PYTHONPATH": "./"} @@ -102,14 +102,14 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment: current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py"))) implemented_factor_count = 0 for sub_ws in exp.sub_workspace_list: - if sub_ws.code_dict == {}: + if sub_ws.file_dict == {}: continue execued_df = sub_ws.execute()[1] if execued_df is None: continue implemented_factor_count += 1 target_feature_file_name = f"feature/feature_{current_feature_file_count:05d}.py" - exp.experiment_workspace.inject_code(**{target_feature_file_name: sub_ws.code_dict["factor.py"]}) + exp.experiment_workspace.inject_files(**{target_feature_file_name: sub_ws.file_dict["factor.py"]}) feature_shape = execued_df.shape[-1] exp.experiment_workspace.data_description.append((sub_ws.target_task.get_task_information(), feature_shape)) current_feature_file_count += 1 diff --git a/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile b/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile index 227969351..f82f29979 100644 --- a/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile +++ b/rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile @@ -7,6 +7,7 @@ RUN apt-get clean && apt-get update && apt-get install -y \ git \ build-essential \ git-lfs \ + unzip \ && rm -rf /var/lib/apt/lists/* RUN git clone https://github.com/openai/mle-bench.git diff --git a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py index 24975d499..207988579 100644 --- a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py +++ b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py @@ -43,7 +43,7 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: / KAGGLE_IMPLEMENT_SETTING.competition ) if len(self.based_experiments) > 0: - self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + self.experiment_workspace.inject_files(**self.based_experiments[-1].experiment_workspace.file_dict) self.experiment_workspace.data_description = deepcopy( self.based_experiments[-1].experiment_workspace.data_description ) @@ -69,7 +69,7 @@ def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None: / KAGGLE_IMPLEMENT_SETTING.competition ) if len(self.based_experiments) > 0: - self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict) + self.experiment_workspace.inject_files(**self.based_experiments[-1].experiment_workspace.file_dict) self.experiment_workspace.data_description = deepcopy( self.based_experiments[-1].experiment_workspace.data_description ) diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index b5a7f84e5..2816d78af 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -39,7 +39,7 @@ class KGScenario(Scenario): def __init__(self, competition: str) -> None: super().__init__() self.competition = 
competition - self.competition_descriptions = crawl_descriptions(competition) + self.competition_descriptions = crawl_descriptions(competition, KAGGLE_IMPLEMENT_SETTING.local_data_path) self.input_shape = None self.competition_type = None @@ -125,7 +125,7 @@ def background(self) -> str: background_template = prompt_dict["kg_background"] train_script = ( - Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py" + Path(__file__).parent / "templates" / KAGGLE_IMPLEMENT_SETTING.competition / "train.py" ).read_text() background_prompt = ( diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 1bfaaf866..5d4b0497f 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -34,7 +34,7 @@ def __init__(self, template_folder_path: Path, *args, **kwargs) -> None: @property def model_description(self) -> dict[str, str]: model_description = {} - for k, v in self.code_dict.items(): + for k, v in self.file_dict.items(): if k.startswith("model/"): model_description[k] = v return model_description @@ -62,7 +62,7 @@ def generate_preprocess_data( else None ), ) - if results is None: + if len(results) == 0: logger.error("Feature preprocess failed.") raise Exception("Feature preprocess failed.") else: diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index b6b71917d..265d4d214 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -16,10 +16,13 @@ from selenium.webdriver.common.by import By from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.core.conf import ExtendedBaseSettings from rdagent.core.exception import KaggleError from rdagent.core.prompts import Prompts from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.data_science.debug.data import create_debug_data +from rdagent.utils.agent.tpl import T from rdagent.utils.env import MLEBDockerEnv # %% @@ -31,8 +34,14 @@ service = Service("/usr/local/bin/chromedriver") -def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False) -> dict[str, str]: - if (fp := Path(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json")).exists() and not force: +def crawl_descriptions( + competition: str, local_data_path: str, wait: float = 3.0, force: bool = False +) -> dict[str, str] | str: + if (fp := Path(f"{local_data_path}/{competition}/description.md")).exists() and not force: + logger.info(f"Found {competition}/description.md, loading from it.") + return fp.read_text() + + if (fp := Path(f"{local_data_path}/{competition}.json")).exists() and not force: logger.info(f"Found {competition}.json, loading from local file.") with fp.open("r") as f: return json.load(f) @@ -94,35 +103,33 @@ def kaggle_description_css_selectors() -> tuple[str, str]: descriptions["Data Description"] = data_element.get_attribute("innerHTML") driver.quit() - with open(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json", "w") as f: + with open(f"{local_data_path}/{competition}.json", "w") as f: json.dump(descriptions, f) return descriptions -def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.local_data_path) -> None: - if KAGGLE_IMPLEMENT_SETTING.if_using_mle_data: +def download_data(competition: str, settings: ExtendedBaseSettings = KAGGLE_IMPLEMENT_SETTING) -> None: + local_path = 
settings.local_data_path + if settings.if_using_mle_data: zipfile_path = f"{local_path}/zip_files" zip_competition_path = Path(zipfile_path) / competition - if ( - not zip_competition_path.exists() - or not (Path(local_path) / competition).exists() - or list((Path(local_path) / competition).iterdir()) == [] - ): - mleb_env = MLEBDockerEnv() - mleb_env.prepare() - (Path(local_path) / "zip_files").mkdir(parents=True, exist_ok=True) - (Path(local_path) / competition).mkdir(parents=True, exist_ok=True) + mleb_env = MLEBDockerEnv() + mleb_env.prepare() + if not zip_competition_path.exists(): + (Path(zipfile_path)).mkdir(parents=True, exist_ok=True) mleb_env.run( f"mlebench prepare -c {competition} --data-dir ./zip_files", local_path=local_path, running_extra_volume={str(Path("~/.kaggle").expanduser().absolute()): "/root/.kaggle"}, ) + + if not (Path(local_path) / competition).exists() or list((Path(local_path) / competition).iterdir()) == []: + (Path(local_path) / competition).mkdir(parents=True, exist_ok=True) + + mleb_env.run(f"cp -r ./zip_files/{competition}/prepared/public/* ./{competition}", local_path=local_path) mleb_env.run( - f"/bin/sh -c 'cp -r ./zip_files/{competition}/prepared/public/* ./{competition}'", local_path=local_path - ) - mleb_env.run( - f'/bin/sh -c \'for zip_file in ./{competition}/*.zip; do dir_name="${{zip_file%.zip}}"; mkdir -p "$dir_name"; unzip -o "$zip_file" -d "$dir_name"; done\'', + f'for zip_file in ./{competition}/*.zip; do dir_name="${{zip_file%.zip}}"; mkdir -p "$dir_name"; unzip -o "$zip_file" -d "$dir_name"; done', local_path=local_path, ) # NOTE: @@ -158,6 +165,10 @@ def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.l for sub_zip_file in Path(unzip_path).rglob("*.zip"): unzip_data(sub_zip_file, unzip_target_path=unzip_path) + # sample data + if not Path(f"{local_path}/sample/{competition}").exists(): + create_debug_data(competition, dataset_path=local_path) + def unzip_data(unzip_file_path: str, unzip_target_path: str) -> None: with zipfile.ZipFile(unzip_file_path, "r") as zip_ref: @@ -219,19 +230,8 @@ def download_notebooks( def notebook_to_knowledge(notebook_text: str) -> str: - prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") - - sys_prompt = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["gen_knowledge_from_code_mini_case"]["system"]) - .render() - ) - - user_prompt = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["gen_knowledge_from_code_mini_case"]["user"]) - .render(notebook=notebook_text) - ) + sys_prompt = T(".prompts:gen_knowledge_from_code_mini_case.system").r() + user_prompt = T(".prompts:gen_knowledge_from_code_mini_case.user").r(notebook=notebook_text) response = APIBackend().build_messages_and_create_chat_completion( user_prompt=user_prompt, diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index c73ae099f..b7a7e489c 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -25,8 +25,8 @@ KG_hypothesis_gen_RAG: |- {% endif %} hypothesis_and_feedback: |- - {% for hypothesis, experiment, feedback in trace.hist[-10:] %} - Hypothesis {{ loop.index }}: {{ hypothesis }} + {% for experiment, feedback in trace.hist[-10:] %} + Hypothesis {{ loop.index }}: {{ experiment.hypothesis }} Observation on the result with the hypothesis: {{ feedback.observations }} Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} Did changing to this hypothesis work? 
(focus on the change): {{ feedback.decision }} @@ -225,7 +225,7 @@ model_tuning_feedback_generation: Your feedback should: 1. Confirm if the current result supports or refutes the hypothesis. 2. Compare with previous best results. - 3. Suggest improvements or new directions. Stay innovative and adapative. + 3. Suggest improvements or new directions. Stay innovative and adaptive. Please provide detailed and constructive feedback. Note that as hypothesis evolve, a general trend should be that the model grows larger. Example JSON Structure for Result Analysis: @@ -343,7 +343,7 @@ gen_knowledge_from_code_mini_case: You were a proficient data scientist. user: |- The following notebook (contain markdown part and code part) is a high-performing solution for a kaggle competition. - Please answer the following questions one by one and **as detailedly as possible**. + Please answer the following questions one by one and **as detailed as possible**. Make sure that another data scientist can exactly reproduce this copy of code based on your answer. Focus on the training process. diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 273860bb5..63636ee5c 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -204,7 +204,7 @@ class KGHypothesisGen(FactorAndModelHypothesisGen): .. code-block:: python class KGHypothesisGen(ModelHypothesisGen): - prompts: Prompts = a_specifc_prompt_dict + prompts: Prompts = a_specific_prompt_dict """ def __init__(self, scen: Scenario) -> Tuple[dict, bool]: @@ -233,7 +233,7 @@ def update_reward_estimates(self, trace: Trace) -> None: reward = (performance_t - performance_t_minus_1) / performance_t_minus_1 n_o = self.scen.action_counts[last_action] mu_o = self.scen.reward_estimates[last_action] - self.scen.scen.reward_estimates[last_action] += (reward - mu_o) / n_o + self.scen.reward_estimates[last_action] += (reward - mu_o) / n_o else: # First iteration, nothing to update pass @@ -276,11 +276,11 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: hypothesis_specification = f"Hypothesis should avoid being too general and vague, and should be specific and actionable. For example, hypothesis like 'tune a model' is too general, while hypothesis like 'increase the learning rate to 0.1 of the lightgbm model will improve the performance' is specific and actionable." if len(trace.hist) > 0: - sota_features = str(trace.hist[-1][1].based_experiments[-1].experiment_workspace.data_description) + sota_features = str(trace.hist[-1][0].based_experiments[-1].experiment_workspace.data_description) sota_models = json.dumps( - trace.hist[-1][1].based_experiments[-1].experiment_workspace.model_description, indent=2 + trace.hist[-1][0].based_experiments[-1].experiment_workspace.model_description, indent=2 ) - sota_result = trace.hist[-1][1].based_experiments[-1].result + sota_result = trace.hist[-1][0].based_experiments[-1].result hypothesis_specification += f"\nYour hypothesis should based on current SOTA solution. The user will conduct experiments based on the SOTA solution to test whether your hypothesis is right on this specific ecompetition. 
\n\nSOTA Features: {sota_features}\n\nSOTA Models: {sota_models}\n\nSOTA Result: {sota_result}" if self.scen.if_action_choosing_based_on_UCB: hypothesis_specification += ( @@ -340,7 +340,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b else "No previous hypothesis and feedback available since it's the first round." ) - experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] + experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist] model_list = [] for experiment in experiment_list: @@ -362,7 +362,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b ), }, True - def convert_feature_experiment(self, response: str, trace: Trace) -> KGFactorExperiment: + def convert_feature_experiment(self, response: str, hypothesis: Hypothesis, trace: Trace) -> KGFactorExperiment: response_dict = json.loads(response) tasks = [] @@ -384,12 +384,13 @@ def convert_feature_experiment(self, response: str, trace: Trace) -> KGFactorExp sub_tasks=tasks, based_experiments=( [KGFactorExperiment(sub_tasks=[], source_feature_size=trace.scen.input_shape[-1])] - + [t[1] for t in trace.hist if t[2]] + + [t[0] for t in trace.hist if t[1]] ), + hypothesis=hypothesis, ) return exp - def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperiment: + def convert_model_experiment(self, response: str, hypothesis: Hypothesis, trace: Trace) -> KGModelExperiment: response_dict = json.loads(response) tasks = [] model_type = response_dict.get("model_type", "Model type not provided") @@ -399,11 +400,11 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperi ) based_experiments = [KGModelExperiment(sub_tasks=[], source_feature_size=trace.scen.input_shape[-1])] + [ - t[1] for t in trace.hist if t[2] + t[0] for t in trace.hist if t[1] ] model_type = response_dict.get("model_type", "Model type not provided") if model_type in KG_MODEL_MAPPING: - base_code = based_experiments[-1].experiment_workspace.code_dict.get(KG_MODEL_MAPPING[model_type], None) + base_code = based_experiments[-1].experiment_workspace.file_dict.get(KG_MODEL_MAPPING[model_type], None) else: base_code = None @@ -421,14 +422,15 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperi exp = KGModelExperiment( sub_tasks=tasks, based_experiments=based_experiments, + hypothesis=hypothesis, ) return exp - def convert_response(self, response: str, trace: Trace) -> ModelExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: if self.current_action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]: - return self.convert_feature_experiment(response, trace) + return self.convert_feature_experiment(response, hypothesis, trace) elif self.current_action in [KG_ACTION_MODEL_FEATURE_SELECTION, KG_ACTION_MODEL_TUNING]: - return self.convert_model_experiment(response, trace) + return self.convert_model_experiment(response, hypothesis, trace) class KGTrace(Trace[KGScenario, KGKnowledgeGraph]): diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md index 0faf7dd2a..f5e48c7a0 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md @@ -26,7 +26,7 @@ We use a runnable concrete example to demonstrate what the project should be lik ## Step1: write the 
feature engineering code -- We can generate some file like [[feat01.py]] that match the pattern `feat.*\.py` +- We can generate some file like [[feature.py]] that match the pattern `feat.*\.py` ## Step2: Model training diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ens.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ensemble.py similarity index 100% rename from rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ens.py rename to rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ensemble.py diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feat01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feature.py similarity index 100% rename from rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feat01.py rename to rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feature.py diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py index 3f161f6d3..e110f00a7 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py @@ -32,7 +32,7 @@ def load_images_and_labels(csv_file, image_folder): return np.array(images), np.array(labels) -def load_from_raw_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, list[str]]: +def load_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, list[str]]: """ load raw data from disk to get data in uniform data diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py index f3a4bb78c..d879bcb32 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py @@ -1,14 +1,14 @@ -from load_data import load_from_raw_data +from load_data import load_data from sklearn.model_selection import train_test_split # Load data -train_images, train_labels, test_images, test_ids = load_from_raw_data() +train_images, train_labels, test_images, test_ids = load_data() # feature engineering -from feat01 import feat_eng +from feature import feat_eng -train_images, train_lables, train_param = feat_eng(train_images, train_labels) +train_images, train_lables, train_param = feat_eng(train_images, train_labels, train_images, train_labels) test_images, _, _ = feat_eng(test_images, param=train_param) @@ -21,11 +21,11 @@ # Model workflow from model01 import model_workflow -val_pred, test_pred = model_workflow(train_images, train_labels, validation_images, validation_labels, test_images) +val_pred, test_pred, _ = model_workflow(train_images, train_labels, validation_images, validation_labels, test_images) # Ensemble -from ens import ens_and_decision +from ensemble import ens_and_decision pred_binary = ens_and_decision([test_pred], [val_pred], validation_labels) diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py index 3cb54dfb6..4b4f16259 100644 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/model01.py @@ -25,7 +25,7 @@ def model_workflow( val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params, -) -> tuple[np.ndarray | None, np.ndarray | None]: +) -> tuple[np.ndarray | 
None, np.ndarray | None, dict]: """ Manages the workflow of a machine learning model, including training, validation, and testing. @@ -62,9 +62,6 @@ def model_workflow( train_datagen = ImageDataGenerator(rescale=1.0 / 255, horizontal_flip=True, vertical_flip=True) train_generator = train_datagen.flow(train_images, train_labels, batch_size=batch_size, shuffle=True) - validation_datagen = ImageDataGenerator(rescale=1.0 / 255) - validation_generator = validation_datagen.flow(validation_images, validation_labels, batch_size=batch_size) - # Get input shape from the training data input_shape = X.shape[1:] num_classes = hyper_params.get("num_classes", 2) @@ -115,27 +112,54 @@ def model_workflow( metrics=["accuracy"], ) + # Extract early_stop_round from hyper_params, default is 25 + early_stop_round = hyper_params.get("early_stop_round", 25) + callbacks = [ - EarlyStopping(monitor="val_loss", patience=hyper_params.get("patience", 25)), + EarlyStopping(monitor="val_loss", patience=early_stop_round), ModelCheckpoint(filepath="best_model.keras", monitor="val_loss", save_best_only=True), ] # Training epochs = hyper_params.get("epochs", 100) - history = model.fit( - train_generator, - validation_data=validation_generator, - epochs=epochs, - verbose=1, - shuffle=True, - callbacks=callbacks, - ) - # Predict on validation data - val_pred = model.predict(validation_datagen.flow(validation_images, batch_size=1, shuffle=False), verbose=1) - - # Load the test data and evaluate the model - test_datagen = ImageDataGenerator(rescale=1.0 / 255) - test_generator = test_datagen.flow(test_images, batch_size=1, shuffle=False) - - test_pred = model.predict(test_generator, verbose=1) - return val_pred, test_pred + if val_X is not None and val_y is not None: + validation_datagen = ImageDataGenerator(rescale=1.0 / 255) + validation_generator = validation_datagen.flow(validation_images, validation_labels, batch_size=batch_size) + history = model.fit( + train_generator, + validation_data=validation_generator, + epochs=epochs, + verbose=1, + shuffle=True, + callbacks=callbacks, + ) + # Dynamic adjustment of early_stop_round + if "early_stop_round" not in hyper_params: + val_loss = history.history["val_loss"] + best_epoch = np.argmin(val_loss) + dynamic_early_stop = max(5, int((len(val_loss) - best_epoch) * 0.5)) # 50% of remaining epochs + + print(f"Dynamic early_stop_round: {dynamic_early_stop}") + hyper_params["early_stop_round"] = dynamic_early_stop + + # Predict on validation data + val_pred = model.predict(validation_datagen.flow(validation_images, batch_size=1, shuffle=False), verbose=1) + else: + history = model.fit( + train_generator, + epochs=epochs, + verbose=1, + shuffle=True, + callbacks=callbacks, + ) + val_pred = None + + # Predict on test data + if test_X is not None: + test_datagen = ImageDataGenerator(rescale=1.0 / 255) + test_generator = test_datagen.flow(test_images, batch_size=1, shuffle=False) + test_pred = model.predict(test_generator, verbose=1) + else: + test_pred = None + + return val_pred, test_pred, hyper_params diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md deleted file mode 100644 index 69378e075..000000000 --- a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec.md +++ /dev/null @@ -1,124 +0,0 @@ -# Specification for Implementing a Kaggle Competition Project - -This document outlines the structure and interface protocols for implementing a machine learning project, similar to 
a Kaggle competition. Follow these guidelines to ensure consistency and maintainability across projects. - -## Project Structure - -The project should be organized into the following components: - -1. **Data Loading** (`load_data.py`): A module responsible for loading and preprocessing raw data. -2. **Feature Engineering**(`feat*.py`): A module for transforming raw data into features suitable for model training. -3. **Model Workflow**(`model*.py`): A module that manages the training, validation, and testing of machine learning models. -4. **Ensemble and Decision Making**(`ens.py`): A module for combining predictions from multiple models and making final decisions. -5. **Workflow**(`main.py`): A script to put the above component together to get the final submission(`submission.csv`) - -## Data Loading - -- Implement a function to load data from raw files. -- The function should return training images, training labels, test images, and test IDs. - -## Feature Engineering - -- Implement a function for feature engineering with the following signature: - -```python -def feature_eng(X: np.ndarray, y: np.ndarray | None = None, X_fit: np.ndarray | None = None, y_fit: np.ndarray | None = None, param: object | None = None) -> tuple[np.ndarray, np.ndarray | None, object]: - """ - Perform feature engineering on the input data. - - Parameters: - - X: np.ndarray - The input data to be transformed. - - y: np.ndarray | None - The target data. - - X_fit: np.ndarray | None - Data for fitting the transformation parameters. - - y_fit: np.ndarray | None - Target data for fitting. - - param: object | None - Pre-fitted parameters for transformation. - - Returns: - - transformed_data: np.ndarray - Transformed data. - - transformed_target: np.ndarray | None - Transformed target data. - - fitted_param: object - Fitted parameters. - """ -``` - -- Ensure that the feature engineering process is consistent and can be applied to both training and test data. - -## Model Workflow - -- Implement a function to manage the model workflow with the following signature: - -```python -def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params) -> tuple[np.ndarray | None, np.ndarray | None]: - """ - Manages the workflow of a machine learning model, including training, validation, and testing. - - Parameters - ---------- - X : np.ndarray - Training data features. - y : np.ndarray - Training data labels. - val_X : np.ndarray, optional - Validation data features. - val_y : np.ndarray, optional - Validation data labels. - test_X : np.ndarray, optional - Test data features. - **hyper_params - Additional hyperparameters for the model. - - Returns - ------- - tuple[np.ndarray | None, np.ndarray | None] - Predictions on the validation data, predictions on the test data - """ -``` - -- The function should handle data augmentation, model creation, training, and prediction. - -## Ensemble and Decision Making - -- Implement a function for ensemble and decision making with the following signature: - -```python -def ens_and_decision(test_pred_l: list[np.ndarray], val_pred_l: list[np.ndarray], val_label: np.ndarray) -> np.ndarray: - """ - Handle the following: - 1) Ensemble predictions using a simple average. - 2) Make final decision after ensemble (convert the predictions to final binary form). - - Parameters - ---------- - test_pred_l : list[np.ndarray] - List of predictions on the test data. 
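The `model_workflow` changes above now return the (possibly augmented) hyper-parameters so that a retrain can reuse the dynamically derived `early_stop_round`. A small sketch of that round-trip, with a toy model standing in for the Keras pipeline:

```python
from __future__ import annotations

import numpy as np


def model_workflow(X, y, val_X=None, val_y=None, test_X=None, **hyper_params):
    """Toy stand-in: trains nothing, but mimics the hyper-parameter round-trip."""
    if "early_stop_round" not in hyper_params and val_X is not None:
        # pretend the best epoch was 60 out of 100: keep 50% of the remaining epochs
        val_loss_len, best_epoch = 100, 60
        hyper_params["early_stop_round"] = max(5, int((val_loss_len - best_epoch) * 0.5))
    val_pred = None if val_X is None else np.zeros((len(val_X), 1))
    test_pred = None if test_X is None else np.zeros((len(test_X), 1))
    return val_pred, test_pred, hyper_params


X, y = np.zeros((8, 32, 32, 3)), np.zeros(8)
# first pass: validation present, early_stop_round gets derived (here: 20)
_, _, hp = model_workflow(X, y, val_X=X, val_y=y)
# retrain pass: the derived value is passed back in and must be respected
_, test_pred, hp = model_workflow(X, y, test_X=X, **hp)
print(hp, test_pred.shape)
```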
- val_pred_l : list[np.ndarray] - List of predictions on the validation data. - val_label : np.ndarray - True labels of the validation data. - - Returns - ------- - np.ndarray - Binary predictions on the test data. - """ -``` - -- The function should combine predictions and convert them to a binary format. - -## Submission - -- Implement a script to generate the submission file. -- The script should write predictions to a CSV file in the format required by the competition. - -## General Guidelines - -- Ensure that all modules and functions are well-documented. -- Follow consistent naming conventions and code style. -- Use type annotations for function signatures to improve code readability and maintainability. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/data_loader.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/data_loader.md new file mode 100644 index 000000000..0d5168f81 --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/data_loader.md @@ -0,0 +1,4 @@ +## Data Loading + +- Implement a function to load data from raw files. +- The function should return training images, training labels, test images, and test IDs. \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md new file mode 100644 index 000000000..68326d19f --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/ensemble.md @@ -0,0 +1,28 @@ +## Ensemble and Decision Making + +- Implement a function for ensemble and decision making with the following signature: + +```python +def ens_and_decision(test_pred_l: list[np.ndarray], val_pred_l: list[np.ndarray], val_label: np.ndarray) -> np.ndarray: + """ + Handle the following: + 1) Ensemble predictions using a simple average. + 2) Make final decision after ensemble (convert the predictions to final form). + + Parameters + ---------- + test_pred_l : list[np.ndarray] + List of predictions on the test data. + val_pred_l : list[np.ndarray] + List of predictions on the validation data. + val_label : np.ndarray + True labels of the validation data. + + Returns + ------- + np.ndarray + Predictions on the test data. + """ +``` + +- The function should combine predictions and convert them to a proper format. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md new file mode 100644 index 000000000..60139d6af --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/feature.md @@ -0,0 +1,33 @@ + +## Feature Engineering + +- Implement a function for feature engineering with the following signature: + +```python +def feat_eng(X: np.ndarray, y: np.ndarray | None = None, X_fit: np.ndarray | None = None, y_fit: np.ndarray | None = None, param: object | None = None) -> tuple[np.ndarray, np.ndarray | None, object]: + """ + Perform feature engineering on the input data. + + Parameters: + - X: np.ndarray + The input data to be transformed. + - y: np.ndarray | None + The target data. + - X_fit: np.ndarray | None + Data for fitting the transformation parameters. + - y_fit: np.ndarray | None + Target data for fitting. + - param: object | None + Pre-fitted parameters for transformation. + + Returns: + - transformed_data: np.ndarray + Transformed data. 
+ - transformed_target: np.ndarray | None + Transformed target data. + - fitted_param: object + Fitted parameters. + """ +``` + +- Ensure that the feature engineering process is consistent and can be applied to both training and test data. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md new file mode 100644 index 000000000..04c24dad5 --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/model.md @@ -0,0 +1,44 @@ +## Model Workflow + +- Implement a function to manage the model workflow with the following signature: + +```python +def model_workflow(X: np.ndarray, y: np.ndarray, val_X: np.ndarray = None, val_y: np.ndarray = None, test_X: np.ndarray = None, **hyper_params) -> tuple[np.ndarray | None, np.ndarray | None, dict]: + """ + Manages the workflow of a machine learning model, including training, validation + The testing&validation's inference is included, as well + + - If test/valid exist, output inference on them + - Follow the hyperparameter if exists + - Hyperparameters at least has . The code must check if it is given and use it. + - the returned hyperparameter should align with the input(except the newly generated early stop) + - Return hyperparameters for retrain if not exists. Hyperparameters should have + - If valid exist, add to update the hyperparameter + + + Parameters + ---------- + X : np.ndarray + Training data features. + y : np.ndarray + Training data labels. + val_X : np.ndarray, optional + Validation data features. + val_y : np.ndarray, optional + Validation data labels. + test_X : np.ndarray, optional + Test data features. + **hyper_params + Additional hyperparameters for the model. + + Returns + ------- + tuple[np.ndarray | None, np.ndarray | None, dict] + Predictions on the validation data, predictions on the test data + """ +``` +- In this task, the shape of input(X of train, valid and test) should be (num_samples, height, width, channels). + +- In this task, the shape of output should be (num_samples, num_class), as num_class = 1 here. + +- The function should handle data augmentation, model creation, training, and prediction. diff --git a/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md new file mode 100644 index 000000000..7ea16f693 --- /dev/null +++ b/rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/spec/workflow.md @@ -0,0 +1,24 @@ +# Specification for Implementing a Kaggle Competition Project + +This document outlines the structure and interface protocols for implementing a machine learning project, similar to a Kaggle competition. Follow these guidelines to ensure consistency and maintainability across projects. + +## Project Structure + +The project should be organized into the following components: + +1. **Data Loading** (`load_data.py`): A module responsible for loading and preprocessing raw data. +2. **Feature Engineering**(`feat*.py`): A module for transforming raw data into features suitable for model training. +3. **Model Workflow**(`model*.py`): A module that manages the training, validation, and testing of machine learning models. +4. **Ensemble and Decision Making**(`ensemble.py`): A module for combining predictions from multiple models and making final decisions. +5. 
**Workflow**(`main.py`): A script to put the above component together to get the final submission(`submission.csv`) + +## Submission + +- Implement a script to generate the submission file. +- The script should write predictions to a CSV file in the format required by the competition. + +## General Guidelines + +- Ensure that all modules and functions are well-documented. +- Follow consistent naming conventions and code style. +- Use type annotations for function signatures to improve code readability and maintainability. diff --git a/rdagent/scenarios/qlib/developer/feedback.py b/rdagent/scenarios/qlib/developer/feedback.py index c34e0e89b..1a80cfe6c 100644 --- a/rdagent/scenarios/qlib/developer/feedback.py +++ b/rdagent/scenarios/qlib/developer/feedback.py @@ -7,8 +7,8 @@ from rdagent.core.experiment import Experiment from rdagent.core.prompts import Prompts from rdagent.core.proposal import ( + Experiment2Feedback, Hypothesis, - HypothesisExperiment2Feedback, HypothesisFeedback, Trace, ) @@ -56,8 +56,8 @@ def process_results(current_result, sota_result): return filtered_combined_df.to_string() -class QlibFactorHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: +class QlibFactorExperiment2Feedback(Experiment2Feedback): + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ Generate feedback for the given experiment and hypothesis. @@ -69,6 +69,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac Returns: Any: The feedback generated for the given experiment and hypothesis. """ + hypothesis = exp.hypothesis logger.info("Generating feedback...") hypothesis_text = hypothesis.hypothesis current_result = exp.result @@ -122,15 +123,15 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac ) -class QlibModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): +class QlibModelExperiment2Feedback(Experiment2Feedback): """Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances""" - def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: + def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback: """ The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). For example: `mlflow` of Qlib will be included. 
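The spec files above only fix the signatures; a minimal pair of conforming implementations (assuming binary classification with probability outputs, as in the cactus example, and a plain max-scaling step for the feature transform) might look like:

```python
from __future__ import annotations

import numpy as np


def feat_eng(X, y=None, X_fit=None, y_fit=None, param=None):
    """Fit a simple scaling parameter on X_fit (or X) and apply it to X."""
    if param is None:
        fit_source = X if X_fit is None else X_fit
        param = {"scale": float(fit_source.max()) or 1.0}
    return X / param["scale"], y, param


def ens_and_decision(test_pred_l, val_pred_l, val_label):
    """Average the predictions and threshold them into final binary labels."""
    mean_val = np.mean(val_pred_l, axis=0)
    val_acc = float(((mean_val > 0.5).astype(int).ravel() == val_label.ravel()).mean())
    print(f"validation accuracy of the ensemble: {val_acc:.3f}")
    return (np.mean(test_pred_l, axis=0) > 0.5).astype(int)


X = np.random.rand(4, 32, 32, 3) * 255
X_scaled, _, fitted_param = feat_eng(X)
print(ens_and_decision([np.array([0.2, 0.9])], [np.array([0.4, 0.8])], np.array([0, 1])))
```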
""" - + hypothesis = exp.hypothesis logger.info("Generating feedback...") # Define the system prompt for hypothesis feedback system_prompt = feedback_prompts["model_feedback_generation"]["system"] @@ -146,7 +147,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac context=context, last_hypothesis=SOTA_hypothesis, last_task=SOTA_experiment.sub_tasks[0].get_task_information() if SOTA_hypothesis else None, - last_code=SOTA_experiment.sub_workspace_list[0].code_dict.get("model.py") if SOTA_hypothesis else None, + last_code=SOTA_experiment.sub_workspace_list[0].file_dict.get("model.py") if SOTA_hypothesis else None, last_result=SOTA_experiment.result if SOTA_hypothesis else None, hypothesis=hypothesis, exp=exp, diff --git a/rdagent/scenarios/qlib/developer/model_runner.py b/rdagent/scenarios/qlib/developer/model_runner.py index ed83ac2f3..4a7196bf7 100644 --- a/rdagent/scenarios/qlib/developer/model_runner.py +++ b/rdagent/scenarios/qlib/developer/model_runner.py @@ -19,10 +19,10 @@ class QlibModelRunner(CachedRunner[QlibModelExperiment]): @cache_with_pickle(CachedRunner.get_cache_key, CachedRunner.assign_cached_result) def develop(self, exp: QlibModelExperiment) -> QlibModelExperiment: - if exp.sub_workspace_list[0].code_dict.get("model.py") is None: + if exp.sub_workspace_list[0].file_dict.get("model.py") is None: raise ModelEmptyError("model.py is empty") # to replace & inject code - exp.experiment_workspace.inject_code(**{"model.py": exp.sub_workspace_list[0].code_dict["model.py"]}) + exp.experiment_workspace.inject_files(**{"model.py": exp.sub_workspace_list[0].file_dict["model.py"]}) env_to_use = {"PYTHONPATH": "./"} diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py index 19e19b5bf..b6c97d7f7 100644 --- a/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py +++ b/rdagent/scenarios/qlib/factor_experiment_loader/json_loader.py @@ -57,7 +57,7 @@ def load(self, json_file_path: Path) -> TestCases: ) gt = FactorFBWorkspace(task, raise_exception=False) code = {"factor.py": factor_data["gt_code"]} - gt.inject_code(**code) + gt.inject_files(**code) test_cases.test_case_l.append(TestCase(task, gt)) return test_cases diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml index dc84f8deb..b1ea9e520 100644 --- a/rdagent/scenarios/qlib/prompts.yaml +++ b/rdagent/scenarios/qlib/prompts.yaml @@ -1,7 +1,7 @@ hypothesis_and_feedback: |- - {% for hypothesis, experiment, feedback in trace.hist[-10:] %} - Hypothesis {{ loop.index }}: {{ hypothesis }} - Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].code_dict.get("model.py")}} + {% for experiment, feedback in trace.hist[-10:] %} + Hypothesis {{ loop.index }}: {{ experiment.hypothesis }} + Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].file_dict.get("model.py")}} Observation on the result with the hypothesis: {{ feedback.observations }} Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }} New Feedback for Context (For you to agree or improve upon): {{ feedback.new_hypothesis }} @@ -258,7 +258,7 @@ model_feedback_generation: Now let's come to this round. You will receive the result and you will evaluate if the performance increases or decreases. 
Hypothesis: {{hypothesis.hypothesis}} Experiment Setup: {{exp.sub_tasks[0]}} - Code Implemented: {{exp.sub_workspace_list[0].code_dict.get("model.py")}} + Code Implemented: {{exp.sub_workspace_list[0].file_dict.get("model.py")}} Relevant Reasoning: {{hypothesis.reason}} Result: {{exp.result}} diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py index 204e9bcbb..80df3eba0 100644 --- a/rdagent/scenarios/qlib/proposal/factor_proposal.py +++ b/rdagent/scenarios/qlib/proposal/factor_proposal.py @@ -65,7 +65,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict | else "No previous hypothesis and feedback available since it's the first round." ) - experiment_list: List[FactorExperiment] = [t[1] for t in trace.hist] + experiment_list: List[FactorExperiment] = [t[0] for t in trace.hist] factor_list = [] for experiment in experiment_list: @@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict | "RAG": None, }, True - def convert_response(self, response: str, trace: Trace) -> FactorExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> FactorExperiment: response_dict = json.loads(response) tasks = [] @@ -97,8 +97,8 @@ def convert_response(self, response: str, trace: Trace) -> FactorExperiment: ) ) - exp = QlibFactorExperiment(tasks) - exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]] + exp = QlibFactorExperiment(tasks, hypothesis=hypothesis) + exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[0] for t in trace.hist if t[1]] unique_tasks = [] diff --git a/rdagent/scenarios/qlib/proposal/model_proposal.py b/rdagent/scenarios/qlib/proposal/model_proposal.py index 51d92b032..81ee91ad4 100644 --- a/rdagent/scenarios/qlib/proposal/model_proposal.py +++ b/rdagent/scenarios/qlib/proposal/model_proposal.py @@ -65,7 +65,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b else "No previous hypothesis and feedback available since it's the first round." ) - experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] + experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist] model_list = [] for experiment in experiment_list: @@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b "RAG": None, }, True - def convert_response(self, response: str, trace: Trace) -> ModelExperiment: + def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment: response_dict = json.loads(response) tasks = [] for model_name in response_dict: @@ -101,6 +101,6 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment: model_type=model_type, ) ) - exp = QlibModelExperiment(tasks) - exp.based_experiments = [t[1] for t in trace.hist if t[2]] + exp = QlibModelExperiment(tasks, hypothesis=hypothesis) + exp.based_experiments = [t[0] for t in trace.hist if t[1]] return exp diff --git a/rdagent/utils/__init__.py b/rdagent/utils/__init__.py index 41a096ace..aef26ef4c 100644 --- a/rdagent/utils/__init__.py +++ b/rdagent/utils/__init__.py @@ -7,13 +7,19 @@ # TODO: split the utils in this module into different modules in the future. 
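The additions to `rdagent/utils/__init__.py` that follow tighten `get_module_by_module_path` and add helpers for cleaning captured stdout. A quick usage sketch of the ANSI-stripping helper (the regex is copied from the hunk below; the sample string is illustrative):

```python
import re


def remove_ansi_codes(s: str) -> str:
    """Strip ANSI control sequences (e.g. colored text) from captured output."""
    ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
    return ansi_escape.sub("", s)


colored = "\x1b[32mPASSED\x1b[0m 12 tests in 3.4s"
print(remove_ansi_codes(colored))  # -> "PASSED 12 tests in 3.4s"
```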
import importlib +import json import re import sys +from pathlib import Path from types import ModuleType from typing import Union +from rdagent.oai.llm_conf import LLM_SETTINGS +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.tpl import T -def get_module_by_module_path(module_path: Union[str, ModuleType]): + +def get_module_by_module_path(module_path: Union[str, ModuleType]) -> ModuleType: """Load module from path like a/b/c/d.py or a.b.c.d :param module_path: @@ -29,9 +35,14 @@ def get_module_by_module_path(module_path: Union[str, ModuleType]): if module_path.endswith(".py"): module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_"))) module_spec = importlib.util.spec_from_file_location(module_name, module_path) + if module_spec is None: + raise ModuleNotFoundError(f"Cannot find module at {module_path}") module = importlib.util.module_from_spec(module_spec) sys.modules[module_name] = module - module_spec.loader.exec_module(module) + if module_spec.loader is not None: + module_spec.loader.exec_module(module) + else: + raise ModuleNotFoundError(f"Cannot load module at {module_path}") else: module = importlib.import_module(module_path) return module @@ -53,3 +64,82 @@ def convert2bool(value: Union[str, bool]) -> bool: return value else: raise ValueError(f"Unknown value type {value} to bool") + + +def remove_ansi_codes(s: str) -> str: + """ + It is for removing ansi ctrl characters in the string(e.g. colored text) + """ + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + return ansi_escape.sub("", s) + + +def filter_progress_bar(stdout: str) -> str: + """ + Filter out progress bars from stdout using regex. + """ + # Initial progress bar regex pattern + progress_bar_re = ( + r"(\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step.*?\u0008+|" + r"\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step|" + r"\d+/\d+\s+[━]+\s+\d+s?\s+\d+ms/step.*|" + r"\d+/\d+\s+[━]+.*?\u0008+|" + r"\d+/\d+\s+[━]+.*|[ ]*\u0008+|" + r"\d+%\|[█▏▎▍▌▋▊▉]+\s+\|\s+\d+/\d+\s+\[\d{2}:\d{2}<\d{2}:\d{2},\s+\d+\.\d+it/s\]|" + r"\d+%\|[█]+\|\s+\d+/\d+\s+\[\d{2}:\d{2}<\d{2}:\d{2},\s*\d+\.\d+it/s\])" + ) + + filtered_stdout = remove_ansi_codes(stdout) + filtered_stdout = re.sub(progress_bar_re, "", filtered_stdout) + filtered_stdout = re.sub(r"\s*\n\s*", "\n", filtered_stdout) + + needs_sub = True + # Attempt further filtering up to 5 times + for _ in range(5): + filtered_stdout_shortened = filtered_stdout + system_prompt = T(".prompts:filter_progress_bar.system").r() + + for __ in range(10): + user_prompt = T(".prompts:filter_progress_bar.user").r( + stdout=filtered_stdout_shortened, + ) + stdout_token_size = APIBackend().build_messages_and_calculate_token( + user_prompt=user_prompt, + system_prompt=system_prompt, + ) + if stdout_token_size < LLM_SETTINGS.chat_token_limit * 0.1: + return filtered_stdout_shortened + elif stdout_token_size > LLM_SETTINGS.chat_token_limit * 0.6: + filtered_stdout_shortened = filtered_stdout_shortened[ + len(filtered_stdout_shortened) // 4 : len(filtered_stdout_shortened) * 3 // 4 + ] + else: + break + + response = json.loads( + APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True + ) + ) + needs_sub = response.get("needs_sub", True) + regex_patterns = response.get("regex_patterns", []) + if isinstance(regex_patterns, list): + for pattern in regex_patterns: + filtered_stdout = re.sub(pattern, "", filtered_stdout) + else: + filtered_stdout = re.sub(regex_patterns, "", filtered_stdout) + 
+ if not needs_sub: + break + filtered_stdout = re.sub(r"\s*\n\s*", "\n", filtered_stdout) + + return filtered_stdout + + +def remove_path_info_from_str(base_path: Path, target_string: str) -> str: + """ + Remove the absolute path from the target string + """ + target_string = re.sub(str(base_path), "...", target_string) + target_string = re.sub(str(base_path.absolute()), "...", target_string) + return target_string diff --git a/rdagent/utils/agent/ret.py b/rdagent/utils/agent/ret.py index 3c68354ab..6189af24a 100644 --- a/rdagent/utils/agent/ret.py +++ b/rdagent/utils/agent/ret.py @@ -4,6 +4,7 @@ We think this part can be shared. """ +import json import re from abc import abstractclassmethod from typing import Any @@ -12,6 +13,8 @@ class AgentOut: + json_mode: bool = False # To get the output, is json_mode required. + @abstractclassmethod def get_spec(cls, **context: Any) -> str: raise NotImplementedError(f"Please implement the `get_spec` method") @@ -32,3 +35,15 @@ def extract_output(cls, resp: str): if match: code = match.group(1) return code + + +class BatchEditOut(AgentOut): + json_mode: bool = True + + @classmethod + def get_spec(cls): + return T(".tpl:BatchEditOut").r() + + @classmethod + def extract_output(cls, resp: str): + return json.loads(resp) diff --git a/rdagent/utils/agent/tpl.py b/rdagent/utils/agent/tpl.py index 99df9b91d..28345c347 100644 --- a/rdagent/utils/agent/tpl.py +++ b/rdagent/utils/agent/tpl.py @@ -1,7 +1,7 @@ """ -Here are some infrastruture to build a agent +Here are some infrastructure to build a agent -The motivation of tempalte and AgentOutput Design +The motivation of template and AgentOutput Design """ import inspect @@ -12,6 +12,7 @@ from jinja2 import Environment, StrictUndefined from rdagent.core.utils import SingletonBaseClass +from rdagent.log import rdagent_logger as logger DIRNAME = Path(__file__).absolute().resolve().parent PROJ_PATH = DIRNAME.parent.parent @@ -29,16 +30,23 @@ def __init__(self, uri: str): here are some uri usages case 1) "a.b.c:x.y.z" It will load DIRNAME/a/b/c.yaml as `yaml` and load yaml[x][y][z] + + Form example, if you want to load "rdagent/scenarios/kaggle/experiment/prompts.yaml" + `a.b.c` should be "scenarios.kaggle.experiment.prompts" and "rdagent" should be exclude case 2) ".c:x.y.z" It will load c.yaml in caller's (who call `T(uri)`) directory as `yaml` and load yaml[x][y][z] the loaded content will be saved in `self.template` """ + self.uri = uri # Inspect the calling stack to get the caller's directory stack = inspect.stack() caller_frame = stack[1] caller_module = inspect.getmodule(caller_frame[0]) - caller_dir = Path(caller_module.__file__).parent + if caller_module and caller_module.__file__: + caller_dir = Path(caller_module.__file__).parent + else: + caller_dir = DIRNAME # Parse the URI path_part, yaml_path = uri.split(":") @@ -46,6 +54,7 @@ def __init__(self, uri: str): if path_part.startswith("."): yaml_file_path = caller_dir / f"{path_part[1:].replace('.', '/')}.yaml" + self.uri = f"{str(caller_dir.resolve().relative_to(PROJ_PATH)).replace('/', '.')}{uri}" else: yaml_file_path = (PROJ_PATH / path_part.replace(".", "/")).with_suffix(".yaml") @@ -59,11 +68,24 @@ def __init__(self, uri: str): self.template = yaml_content - def r(self, **context: Any): + def r(self, **context: Any) -> str: """ Render the template with the given context. 
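The `RDAT`/`T` helper above replaces the older `Prompts` plus jinja `Environment` boilerplate, as seen in the `notebook_to_knowledge` hunk earlier. A simplified, self-contained stand-in that illustrates the `"file:key.path"` URI convention (this is not the actual implementation; it assumes PyYAML and Jinja2 are installed and writes a demo YAML file next to the script):

```python
from pathlib import Path

import yaml
from jinja2 import Environment, StrictUndefined

# Demo prompts file; in RD-Agent the YAML lives next to the caller or under the project root.
Path("prompts_demo.yaml").write_text(
    "gen_knowledge:\n  user: |-\n    Summarize this notebook:\n    {{ notebook }}\n"
)


def render(uri: str, **context) -> str:
    """Load <path>.yaml, walk the dotted key path, and render the template with jinja2."""
    path_part, key_path = uri.split(":")
    content = yaml.safe_load(Path(f"{path_part}.yaml").read_text())
    for key in key_path.split("."):
        content = content[key]
    return Environment(undefined=StrictUndefined).from_string(content).render(**context)


print(render("prompts_demo:gen_knowledge.user", notebook="print('hello')"))
```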
""" - return Environment(undefined=StrictUndefined).from_string(self.template).render(**context) + rendered = Environment(undefined=StrictUndefined).from_string(self.template).render(**context).strip("\n") + while "\n\n\n" in rendered: + rendered = rendered.replace("\n\n\n", "\n\n") + rendered = "\n".join(line for line in rendered.splitlines() if line.strip()) + logger.log_object( + obj={ + "uri": self.uri, + "template": self.template, + "context": context, + "rendered": rendered, + }, + tag="debug_tpl", + ) + return rendered T = RDAT # shortcuts diff --git a/rdagent/utils/agent/tpl.yaml b/rdagent/utils/agent/tpl.yaml index 2b41013a0..d61e465f6 100644 --- a/rdagent/utils/agent/tpl.yaml +++ b/rdagent/utils/agent/tpl.yaml @@ -4,3 +4,14 @@ PythonAgentOut: |- ``` +BatchEditOut: |- + You should return a edition that applies to multiple files in a workspace in JSON. + Except for the model file, other files should not be renamed. + Files that do not need to be modified do not need to be included in the returned dict. + + For example: + Inject the code into the folder. Your file name should always contain the suffix. Your file name keys should be unique to avoid delete or replace conflicts. + { + : "", // indicate writing into (create new file or replace existing file) + : "__DEL__" // indicate removing file name2. When we want to replace a file to a new one, we usually use this + } diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index d6f817849..811aef8ad 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -10,15 +10,19 @@ import json import os import pickle +import re import subprocess +import time import uuid +import zipfile from abc import abstractmethod from pathlib import Path from typing import Generic, Optional, TypeVar -import docker -import docker.models -import docker.models.containers +import docker # type: ignore[import-untyped] +import docker.models # type: ignore[import-untyped] +import docker.models.containers # type: ignore[import-untyped] +import docker.types # type: ignore[import-untyped] from pydantic import BaseModel from rich import print from rich.console import Console @@ -27,7 +31,9 @@ from rich.table import Table from rdagent.core.conf import ExtendedBaseSettings, ExtendedSettingsConfigDict +from rdagent.core.experiment import RD_AGENT_SETTINGS from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import md5_hash ASpecificBaseModel = TypeVar("ASpecificBaseModel", bound=BaseModel) @@ -45,13 +51,13 @@ def __init__(self, conf: ASpecificBaseModel): self.conf = conf @abstractmethod - def prepare(self): + def prepare(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] """ Prepare for the environment based on it's configure """ @abstractmethod - def run(self, entry: str | None, local_path: str | None = None, env: dict | None = None) -> str: + def run(self, entry: str | None, local_path: str = ".", env: dict | None = None) -> str: """ Run the folder under the environment. 
@@ -87,7 +93,7 @@ class LocalEnv(Env[LocalConf]): Sometimes local environment may be more convinient for testing """ - def prepare(self): + def prepare(self) -> None: if not (Path("~/.qlib/qlib_data/cn_data").expanduser().resolve().exists()): self.run( entry="python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn", @@ -127,7 +133,7 @@ class DockerConf(ExtendedBaseSettings): mount_path: str # the path in the docker image to mount the folder default_entry: str # the entry point of the image - extra_volumes: dict | None = {} + extra_volumes: dict = {} # Sometime, we need maintain some extra data for the workspace. # And the extra data may be shared and the downloading can be time consuming. # So we just want to download it once. @@ -147,7 +153,7 @@ class QlibDockerConf(DockerConf): image: str = "local_qlib:latest" mount_path: str = "/workspace/qlib_workspace/" default_entry: str = "qrun conf.yaml" - extra_volumes: dict = {Path("~/.qlib/").expanduser().resolve(): "/root/.qlib/"} + extra_volumes: dict = {str(Path("~/.qlib/").expanduser().resolve().absolute()): "/root/.qlib/"} shm_size: str | None = "16g" enable_gpu: bool = True @@ -161,9 +167,12 @@ class DMDockerConf(DockerConf): mount_path: str = "/workspace/dm_workspace/" default_entry: str = "python train.py" extra_volumes: dict = { - Path("~/.rdagent/.data/physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/FIDDLE_mimic3/") - .expanduser() - .resolve(): "/root/.data/" + str( + Path("~/.rdagent/.data/physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/FIDDLE_mimic3/") + .expanduser() + .resolve() + .absolute() + ): "/root/.data/" } shm_size: str | None = "16g" @@ -188,6 +197,20 @@ class KGDockerConf(DockerConf): ) +class DSDockerConf(DockerConf): + model_config = ExtendedSettingsConfigDict(env_prefix="DS_DOCKER_") + + build_from_dockerfile: bool = False + image: str = "gcr.io/kaggle-gpu-images/python:latest" + mount_path: str = "/kaggle/workspace" + default_entry: str = "python main.py" + + running_timeout_period: int = 600 + mem_limit: str | None = ( + "48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory + ) + + class MLEBDockerConf(DockerConf): model_config = ExtendedSettingsConfigDict(env_prefix="MLEB_DOCKER_") @@ -210,12 +233,16 @@ class MLEBDockerConf(DockerConf): class DockerEnv(Env[DockerConf]): # TODO: Save the output into a specific file - def prepare(self): + def prepare(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] """ Download image if it doesn't exist """ client = docker.from_env() - if self.conf.build_from_dockerfile and self.conf.dockerfile_folder_path.exists(): + if ( + self.conf.build_from_dockerfile + and self.conf.dockerfile_folder_path is not None + and self.conf.dockerfile_folder_path.exists() + ): logger.info(f"Building the image from dockerfile: {self.conf.dockerfile_folder_path}") resp_stream = client.api.build( path=str(self.conf.dockerfile_folder_path), tag=self.conf.image, network_mode=self.conf.network @@ -272,7 +299,7 @@ def prepare(self): except docker.errors.APIError as e: raise RuntimeError(f"Error while pulling the image: {e}") - def _gpu_kwargs(self, client): + def _gpu_kwargs(self, client: docker.DockerClient) -> dict: # type: ignore[no-any-unimported] """get gpu kwargs based on its availability""" if not self.conf.enable_gpu: return {} @@ -288,15 +315,25 @@ def _gpu_kwargs(self, client): return {} return gpu_kwargs + def replace_time_info(self, input_string: str) -> str: + """To remove any time related information 
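The _gpu_kwargs helper probes for a usable GPU runtime before passing device requests to the container. The sketch below shows one way to express that probe-and-fall-back pattern with docker-py; the probe image, command, and exception handling are assumptions, not the method's exact body.

import docker
import docker.types

def gpu_kwargs_if_available(client: docker.DockerClient, probe_image: str = "ubuntu:22.04") -> dict:
    gpu_kwargs = {"device_requests": [docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])]}
    try:
        # If the NVIDIA container runtime is present, this trivial probe succeeds.
        client.containers.run(probe_image, "nvidia-smi", **gpu_kwargs, remove=True)
        return gpu_kwargs
    except docker.errors.DockerException:
        return {}  # no usable GPU runtime; fall back to CPU-only kwargs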
from the logs since it will destroy the cache mechanism""" + """We currently set this function as default, but it can be changed in the future""" + datetime_pattern = r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d+)?\b" + output_string = re.sub(datetime_pattern, "[DATETIME]", input_string) + return output_string + def __run( self, entry: str | None = None, - local_path: str | None = None, + local_path: str = ".", env: dict | None = None, running_extra_volume: dict | None = None, + remove_timestamp: bool = True, ) -> str: if env is None: env = {} + env["PYTHONWARNINGS"] = "ignore" + env["TF_CPP_MIN_LOG_LEVEL"] = "2" client = docker.from_env() volumns = {} @@ -313,7 +350,7 @@ def __run( log_output = "" try: - container: docker.models.containers.Container = client.containers.run( + container: docker.models.containers.Container = client.containers.run( # type: ignore[no-any-unimported] image=self.conf.image, command=entry, volumes=volumns, @@ -340,6 +377,7 @@ def __run( print(table) for log in logs: decoded_log = log.strip().decode() + decoded_log = self.replace_time_info(decoded_log) if remove_timestamp else decoded_log Console().print(decoded_log, markup=False) log_output += decoded_log + "\n" print(Rule("[bold green]Docker Logs End[/bold green]", style="dark_orange")) @@ -354,27 +392,89 @@ def __run( except docker.errors.APIError as e: raise RuntimeError(f"Error while running the container: {e}") + def zip_a_folder_into_a_file(self, folder_path: str, zip_file_path: str) -> None: + """ + Zip a folder into a file, use zipfile instead of subprocess + """ + with zipfile.ZipFile(zip_file_path, "w") as z: + for root, _, files in os.walk(folder_path): + for file in files: + z.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), folder_path)) + + def unzip_a_file_into_a_folder(self, zip_file_path: str, folder_path: str) -> None: + """ + Unzip a file into a folder, use zipfile instead of subprocess + """ + with zipfile.ZipFile(zip_file_path, "r") as z: + z.extractall(folder_path) + + def cached_run( + self, + entry: str | None = None, + local_path: str = ".", + env: dict | None = None, + running_extra_volume: dict | None = None, + remove_timestamp: bool = True, + ) -> str: + """ + Run the folder under the environment. + Will cache the output and the folder diff for next round of running. + Use the python codes and the parameters(entry, running_extra_volume) as key to hash the input. 
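The replace_time_info normalisation exists so that wall-clock timestamps in container logs do not defeat the run cache introduced below. Its effect is easy to check in isolation; the sample log line is invented for illustration.

import re

datetime_pattern = r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d+)?\b"
log_line = "2024-05-01 12:30:59.123 | INFO | epoch 3 finished, loss=0.412"
print(re.sub(datetime_pattern, "[DATETIME]", log_line))
# -> "[DATETIME] | INFO | epoch 3 finished, loss=0.412"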
+ """ + target_folder = Path(RD_AGENT_SETTINGS.pickle_cache_folder_path_str) / f"utils.env.run" + target_folder.mkdir(parents=True, exist_ok=True) + key = md5_hash( + json.dumps( + [ + [str(path.relative_to(Path(local_path))), path.read_text()] + for path in sorted(Path(local_path).rglob("*.py")) + ] + ) + + json.dumps({"entry": entry, "running_extra_volume": running_extra_volume}) + + json.dumps({"extra_volumes": self.conf.extra_volumes}) + ) + if Path(target_folder / f"{key}.pkl").exists() and Path(target_folder / f"{key}.zip").exists(): + with open(target_folder / f"{key}.pkl", "rb") as f: + ret: str = pickle.load(f) + self.unzip_a_file_into_a_folder(str(target_folder / f"{key}.zip"), local_path) + else: + ret = self.__run(entry, local_path, env, running_extra_volume, remove_timestamp) + with open(target_folder / f"{key}.pkl", "wb") as f: + pickle.dump(ret, f) + self.zip_a_folder_into_a_file(local_path, str(target_folder / f"{key}.zip")) + return ret + def run( self, entry: str | None = None, - local_path: str | None = None, + local_path: str = ".", env: dict | None = None, running_extra_volume: dict | None = None, - ): + ) -> str: if entry is None: entry = self.conf.default_entry - entry_add_timeout = f"timeout {self.conf.running_timeout_period} {entry}" - return self.__run(entry_add_timeout, local_path, env, running_extra_volume) + entry_add_timeout = ( + f"/bin/sh -c 'timeout {self.conf.running_timeout_period} {entry}; chmod -R 777 {self.conf.mount_path}'" + ) + + start = time.time() + out = self.cached_run(entry_add_timeout, local_path, env, running_extra_volume) + end = time.time() + + if end - start + 1 >= self.conf.running_timeout_period: + out += f"\n\nThe running time exceeds {self.conf.running_timeout_period} seconds, so the process is killed." + + return out def dump_python_code_run_and_get_results( self, code: str, dump_file_names: list[str], - local_path: str | None = None, + local_path: str, env: dict | None = None, running_extra_volume: dict | None = None, code_dump_file_py_name: Optional[str] = None, - ): + ) -> tuple[str, list]: """ Dump the code into the local path and run the code. """ @@ -390,7 +490,7 @@ def dump_python_code_run_and_get_results( results.append(pickle.load(open(os.path.join(local_path, f"{name}"), "rb"))) os.remove(os.path.join(local_path, f"{name}")) else: - return log_output, None + return log_output, [] return log_output, results @@ -400,7 +500,7 @@ class QTDockerEnv(DockerEnv): def __init__(self, conf: DockerConf = QlibDockerConf()): super().__init__(conf) - def prepare(self): + def prepare(self, *args, **kwargs) -> None: # type: ignore[explicit-override, no-untyped-def] """ Download image & data if it doesn't exist """ @@ -420,7 +520,7 @@ class DMDockerEnv(DockerEnv): def __init__(self, conf: DockerConf = DMDockerConf()): super().__init__(conf) - def prepare(self, username: str, password: str): + def prepare(self, username: str, password: str) -> None: """ Download image & data if it doesn't exist """ @@ -439,7 +539,7 @@ def prepare(self, username: str, password: str): class KGDockerEnv(DockerEnv): """Kaggle Competition Docker""" - def __init__(self, competition: str = None, conf: DockerConf = KGDockerConf()): + def __init__(self, competition: str | None = None, conf: DockerConf = KGDockerConf()): super().__init__(conf) diff --git a/rdagent/utils/fmt.py b/rdagent/utils/fmt.py new file mode 100644 index 000000000..b49c9e71c --- /dev/null +++ b/rdagent/utils/fmt.py @@ -0,0 +1,28 @@ +""" +Tools that support generating better formats. 
+""" + + +def shrink_text(text: str, context_lines: int = 200) -> str: + """ + When the context is too long, hide the part that is not important. + + text before + ... (XXXXX lines are hidden) ... + text after + """ + lines = text.splitlines() + total_lines = len(lines) + + if total_lines <= context_lines: + return text + + # Calculate how many lines to show from start and end + half_lines = context_lines // 2 + start = "\n".join(lines[:half_lines]) + end = "\n".join(lines[-half_lines:]) + + # Count the number of lines we're hiding + hidden_lines = total_lines - half_lines * 2 + + return f"{start}\n... ({hidden_lines} lines are hidden) ...\n{end}" diff --git a/rdagent/utils/prompts.yaml b/rdagent/utils/prompts.yaml new file mode 100644 index 000000000..db31a454c --- /dev/null +++ b/rdagent/utils/prompts.yaml @@ -0,0 +1,18 @@ +filter_progress_bar: + system: | + You are an assistant helping to analyze and filter training log messages and a progress bar output from a given text. Evaluate the text to determine if training log messages and a progress bar output patterns are present and, if so, generate a list of regex patterns to remove them. + Additionally, indicate whether substitution is needed. If the input exceeds a token limit, the system will provide only a shortened portion of the text. + Note: About the training log message, if the log message contains useful information like loss or accuracy and it is reported in each epoch, it should not be removed. If the log message is not useful, for example, reporting nan in each iteration or just reporting the iteration number, please remove them. + + Respond in the following JSON format and order: + ```json + { + "needs_sub": , + "regex_patterns": ["regex pattern 1", "regex pattern 2", ...] + } + user: | + The following text contains stdout: + + {{ stdout }} + + Check if the text contains training log messages and progress bar patterns. If patterns are found, provide a list of regex patterns to filter them. Otherwise, indicate that substitution is not needed. diff --git a/rdagent/utils/repo/diff.py b/rdagent/utils/repo/diff.py new file mode 100644 index 000000000..d3acf2e0d --- /dev/null +++ b/rdagent/utils/repo/diff.py @@ -0,0 +1,55 @@ +import difflib +from pathlib import Path +from typing import List + + +def generate_diff(dir1: str, dir2: str) -> List[str]: + """ + Generate a diff between two directories(from dir1 to dir2), considering only .py files. + It is mocking `diff -durN dir1 dir2` in linux. + + Args: + dir1 (str): Path to the first directory. + dir2 (str): Path to the second directory. + + Returns: + List[str]: A list of diffs for .py files that are different between the two directories. 
+ """ + + diff_files = [] + + dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob("*.py") if f.is_file()} + dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob("*.py") if f.is_file()} + + all_files = dir1_files.union(dir2_files) + + for file in all_files: + file1 = Path(dir1) / file + file2 = Path(dir2) / file + + if file1.exists() and file2.exists(): + with file1.open() as f1, file2.open() as f2: + diff = list( + difflib.unified_diff(f1.readlines(), f2.readlines(), fromfile=str(file1), tofile=str(file2)) + ) + if diff: + diff_files.extend(diff) + else: + if file1.exists(): + with file1.open() as f1: + diff = list( + difflib.unified_diff( + f1.readlines(), [], fromfile=str(file1), tofile=str(file2) + " (empty file)" + ) + ) + diff_files.extend(diff) + elif file2.exists(): + with file2.open() as f2: + diff = list( + difflib.unified_diff( + [], f2.readlines(), fromfile=str(file1) + " (empty file)", tofile=str(file2) + ) + ) + diff_files.extend(diff) + + return diff_files diff --git a/rdagent/utils/workflow.py b/rdagent/utils/workflow.py index 0c1f369b6..e2e83b7a6 100644 --- a/rdagent/utils/workflow.py +++ b/rdagent/utils/workflow.py @@ -17,7 +17,6 @@ from tqdm.auto import tqdm -from rdagent.core.exception import CoderError from rdagent.log import rdagent_logger as logger @@ -54,7 +53,7 @@ def __new__(cls, clsname, bases, attrs): """ steps = LoopMeta._get_steps(bases) # all the base classes of parents for name, attr in attrs.items(): - if not name.startswith("__") and isinstance(attr, Callable): + if not name.startswith("_") and isinstance(attr, Callable): if name not in steps: # NOTE: if we override the step in the subclass # Then it is not the new step. So we skip it. @@ -67,10 +66,16 @@ def __new__(cls, clsname, bases, attrs): class LoopTrace: start: datetime.datetime # the start time of the trace end: datetime.datetime # the end time of the trace + step_idx: int # TODO: more information about the trace class LoopBase: + """ + Assumption: + - The last step is responsible for recording information!!!! + """ + steps: list[Callable] # a list of steps to work on loop_trace: dict[int, list[LoopTrace]] @@ -78,6 +83,8 @@ class LoopBase: default_factory=tuple ) # you can define a list of error that will skip current loop + EXCEPTION_KEY = "_EXCEPTION" + def __init__(self): self.loop_idx = 0 # current loop index self.step_idx = 0 # the index of next step to be run @@ -103,30 +110,30 @@ def run(self, step_n: int | None = None): li, si = self.loop_idx, self.step_idx - start = datetime.datetime.now(datetime.timezone.utc) - name = self.steps[si] + # with logger.tag(f"Loop_{li}.{name}"): + start = datetime.datetime.now(datetime.timezone.utc) func = getattr(self, name) try: self.loop_prev_out[name] = func(self.loop_prev_out) # TODO: Fix the error logger.exception(f"Skip loop {li} due to {e}") except self.skip_loop_error as e: + # FIXME: This does not support previous demo (due to their last step is not for recording) logger.warning(f"Skip loop {li} due to {e}") - self.loop_idx += 1 - self.step_idx = 0 - continue - except CoderError as e: - logger.warning(f"Traceback loop {li} due to {e}") - self.step_idx = 0 + # NOTE: strong assumption! The last step is responsible for recording information + self.step_idx = len(self.steps) - 1 # directly jump to the last step. 
+ self.loop_prev_out[self.EXCEPTION_KEY] = e continue - - end = datetime.datetime.now(datetime.timezone.utc) - - self.loop_trace[li].append(LoopTrace(start, end)) - - # Update tqdm progress bar - pbar.set_postfix(loop_index=li, step_index=si, step_name=name) - pbar.update(1) + finally: + # make sure failure steps are displayed correclty + end = datetime.datetime.now(datetime.timezone.utc) + self.loop_trace[li].append(LoopTrace(start, end, step_idx=si)) + + # Update tqdm progress bar directly to step_idx + pbar.n = si + 1 + pbar.set_postfix( + loop_index=li, step_index=si + 1, step_name=name + ) # step_name indicate last finished step_name # index increase and save session self.step_idx = (self.step_idx + 1) % len(self.steps) diff --git a/requirements.txt b/requirements.txt index 0858575c5..3320c1d07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ loguru fire fuzzywuzzy openai +azure.identity numpy # we use numpy as default data format. So we have to install numpy pandas # we use pandas as default data format. So we have to install pandas @@ -46,4 +47,4 @@ nbformat # tool seaborn -setuptools-scm +setuptools-scm \ No newline at end of file diff --git a/test/utils/coder/test_CoSTEER.py b/test/utils/coder/test_CoSTEER.py new file mode 100644 index 000000000..47176ab15 --- /dev/null +++ b/test/utils/coder/test_CoSTEER.py @@ -0,0 +1,54 @@ +import unittest + + +class CoSTEERTest(unittest.TestCase): + + def setUp(self): + self.test_competition = "aerial-cactus-identification" + + def tearDown(self): + pass + + def to_str(self, obj): + return "".join(str(obj).split()) + + def test_data_loader(self): + from rdagent.components.coder.data_science.raw_data_loader.test import ( + develop_one_competition, + ) + + # if all tasks in exp are failed, will raise CoderError + exp = develop_one_competition(self.test_competition) + + def test_feature(self): + from rdagent.components.coder.data_science.feature.test import ( + develop_one_competition, + ) + + exp = develop_one_competition(self.test_competition) + + def test_model(self): + from rdagent.components.coder.data_science.model.test import ( + develop_one_competition, + ) + + exp = develop_one_competition(self.test_competition) + + def test_ensemble(self): + from rdagent.components.coder.data_science.ensemble.test import ( + develop_one_competition, + ) + + exp = develop_one_competition(self.test_competition) + + def test_workflow(self): + from rdagent.components.coder.data_science.workflow.test import ( + develop_one_competition, + ) + + exp = develop_one_competition(self.test_competition) + + +if __name__ == "__main__": + unittest.main() + # pytest test/utils/coder/test_CoSTEER.py diff --git a/test/utils/test_import.py b/test/utils/test_import.py index 997e86234..5a07ebe6b 100644 --- a/test/utils/test_import.py +++ b/test/utils/test_import.py @@ -23,10 +23,13 @@ def import_all_modules_from_directory(directory): continue if "model_coder" in fstr: continue + if "llm_st" in fstr: + continue if ( fstr.endswith("rdagent/log/ui/app.py") or fstr.endswith("rdagent/app/cli.py") or fstr.endswith("rdagent/app/CI/run.py") + or fstr.endswith("rdagent/app/utils/ape.py") ): # the entrance points continue
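The reworked run loop now assumes the last step is the one that records the outcome: on a skip_loop_error it jumps straight to that step and stashes the exception under EXCEPTION_KEY. The sketch below shows how a hypothetical LoopBase subclass could honour that contract; the step names are invented, and it assumes LoopBase collects public methods as steps via LoopMeta.

from typing import Any

from rdagent.log import rdagent_logger as logger
from rdagent.utils.workflow import LoopBase


class DemoLoop(LoopBase):  # hypothetical subclass, for illustration only
    def propose(self, prev_out: dict[str, Any]):
        ...  # generate an experiment

    def develop(self, prev_out: dict[str, Any]):
        ...  # implement and run it

    def record(self, prev_out: dict[str, Any]):
        # Last step: reached even when an earlier step raised a skip_loop_error,
        # so failures are recorded instead of silently dropped.
        if self.EXCEPTION_KEY in prev_out:
            logger.warning(f"loop failed before recording: {prev_out[self.EXCEPTION_KEY]}")
            return None
        return prev_out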