From 8e4871bfa3d5decc13cd38fe37fb3d1f0c537504 Mon Sep 17 00:00:00 2001
From: Tom Zayats
Date: Mon, 16 Dec 2024 13:32:44 -0800
Subject: [PATCH 1/3] added run_name

---
 journeys/evaluation.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/journeys/evaluation.py b/journeys/evaluation.py
index a51456a1..0e21bf96 100644
--- a/journeys/evaluation.py
+++ b/journeys/evaluation.py
@@ -51,6 +51,7 @@
     "SEMANTIC_MODEL_STRING": "VARCHAR",
     "EVAL_TABLE": "VARCHAR",
     "EVAL_HASH": "VARCHAR",
+    "EVAL_RUN_NAME": "VARCHAR",
 }
 
 LLM_JUDGE_PROMPT_TEMPLATE = """\
@@ -289,6 +290,7 @@ def write_eval_results(frame: pd.DataFrame) -> None:
     frame_to_write = frame.copy()
     frame_to_write["TIMESTAMP"] = st.session_state["eval_timestamp"]
     frame_to_write["EVAL_HASH"] = st.session_state["eval_hash"]
+    frame_to_write["EVAL_RUN_NAME"] = st.session_state["eval_run_name"]
     frame_to_write["EVAL_TABLE"] = st.session_state["eval_table"]
     frame_to_write["EVAL_TABLE_HASH"] = st.session_state["eval_table_hash"]
     frame_to_write["MODEL_HASH"] = st.session_state["semantic_model_hash"]
@@ -612,6 +614,7 @@ def evaluation_mode_show() -> None:
     st.write(
         "Welcome!🧪 In the evaluation mode you can evaluate your semantic model using pairs of golden queries/questions and their expected SQL statements. These pairs should be captured in an **Evaluation Table**. Accuracy metrics will be shown and the results will be stored in an **Evaluation Results Table**."
     )
+    st.text_input("Evaluation Run Name", key="eval_run_name")
 
     # TODO: find a less awkward way of specifying this.
     if any(key not in st.session_state for key in ("eval_table", "results_eval_table")):
@@ -637,6 +640,7 @@ def evaluation_mode_show() -> None:
 
     evolution_run_summary = pd.DataFrame(
         [
+            ["Evaluation Run Name", st.session_state["eval_run_name"]],
             ["Evaluation Table Hash", st.session_state["eval_table_hash"]],
             ["Semantic Model Hash", st.session_state["semantic_model_hash"]],
             ["Evaluation Run Hash", st.session_state["eval_hash"]],
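Patch 1 threads a user-visible run name through the evaluation flow: an EVAL_RUN_NAME VARCHAR column in the results-table schema, a text input bound to st.session_state["eval_run_name"], a matching column stamped onto the frame in write_eval_results, and a row in the run summary. The following minimal sketch is not code from this repository; the helper name and the metadata dict are illustrative. It only shows the general pattern write_eval_results follows: copy the results frame and broadcast one constant metadata column per run attribute before persisting it.

import pandas as pd

def stamp_run_metadata(results: pd.DataFrame, run_metadata: dict) -> pd.DataFrame:
    """Return a copy of `results` with one constant column per metadata key."""
    stamped = results.copy()
    for column, value in run_metadata.items():
        stamped[column] = value  # the scalar is broadcast to every row
    return stamped

frame = stamp_run_metadata(
    pd.DataFrame({"QUERY": ["q1", "q2"], "CORRECT": [True, False]}),
    {"EVAL_RUN_NAME": "baseline-v1", "EVAL_HASH": "abc123"},
)
print(list(frame.columns))  # ['QUERY', 'CORRECT', 'EVAL_RUN_NAME', 'EVAL_HASH']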
From 286d5e6a78f88a6270d2813fffc87ac573444478 Mon Sep 17 00:00:00 2001
From: Tom Zayats
Date: Fri, 17 Jan 2025 08:59:33 -0800
Subject: [PATCH 2/3] added eval run name

---
 journeys/evaluation.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/journeys/evaluation.py b/journeys/evaluation.py
index 0e21bf96..b77fb0a4 100644
--- a/journeys/evaluation.py
+++ b/journeys/evaluation.py
@@ -129,7 +129,7 @@ def visualize_eval_results(frame: pd.DataFrame) -> None:
         st.write(f"**Explanation**: {row['EXPLANATION']}")
 
 
-def _llm_judge(frame: pd.DataFrame) -> pd.DataFrame:
+def _llm_judge(frame: pd.DataFrame, max_frame_size = 200) -> pd.DataFrame:
     if frame.empty:
         return pd.DataFrame({"EXPLANATION": [], "CORRECT": []})
 
@@ -141,8 +141,8 @@ def _llm_judge(frame: pd.DataFrame) -> pd.DataFrame:
         axis=1,
         func=lambda x: LLM_JUDGE_PROMPT_TEMPLATE.format(
             input_question=x["QUERY"],
-            frame1_str=x["ANALYST_RESULT"].to_string(index=False),
-            frame2_str=x["GOLD_RESULT"].to_string(index=False),
+            frame1_str=x["ANALYST_RESULT"][:max_frame_size].to_string(index=False),
+            frame2_str=x["GOLD_RESULT"][:max_frame_size].to_string(index=False),
         ),
     ).to_frame(name=col_name)
     session = st.session_state["session"]
@@ -355,7 +355,9 @@ def run_sql_queries() -> None:
 
     for i, (row_id, row) in enumerate(eval_table_frame.iterrows(), start=1):
         status_text.text(f"Evaluating Analyst query {i}/{total_requests}...")
+
+
         analyst_query = analyst_results_frame.loc[row_id, "ANALYST_SQL"]
         analyst_result = execute_query(
             conn=get_snowflake_connection(), query=analyst_query
         )
@@ -588,6 +590,7 @@ def clear_evaluation_selection() -> None:
         "selected_results_eval_old_table",
         "selected_results_eval_schema",
         "use_existing_eval_results_table",
+        "selected_eval_run_name",
     )
     for feature in session_states:
         if feature in st.session_state:
@@ -600,6 +603,9 @@ def clear_evaluation_data() -> None:
         "eval_accuracy",
         "analyst_results_frame",
         "query_results_frame",
+        "eval_run_name",
+        "eval_timestamp",
+        "eval_hash",
     )
     for feature in session_states:
         if feature in st.session_state:
@@ -614,7 +620,7 @@ def evaluation_mode_show() -> None:
     st.write(
         "Welcome!🧪 In the evaluation mode you can evaluate your semantic model using pairs of golden queries/questions and their expected SQL statements. These pairs should be captured in an **Evaluation Table**. Accuracy metrics will be shown and the results will be stored in an **Evaluation Results Table**."
     )
-    st.text_input("Evaluation Run Name", key="eval_run_name")
+    st.text_input("Evaluation Run Name", key="selected_eval_run_name", value=st.session_state.get("selected_eval_run_name", ""))
 
     # TODO: find a less awkward way of specifying this.
     if any(key not in st.session_state for key in ("eval_table", "results_eval_table")):
@@ -688,8 +694,12 @@ def run_evaluation() -> None:
         return
     placeholder.write("Model validated ✅")
     clear_evaluation_data()
+    st.session_state["eval_run_name"] = st.session_state["selected_eval_run_name"]
    st.session_state["semantic_model_hash"] = current_hash
-    st.write("Running evaluation...")
+    if st.session_state["eval_run_name"] == "":
+        st.write("Running evaluation ...")
+    else:
+        st.write(f"Running evaluation for name: {st.session_state['eval_run_name']} ...")
     st.session_state["eval_timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
     st.session_state["eval_hash"] = generate_hash(st.session_state["eval_timestamp"])
     send_analyst_requests()
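Patch 2 reworks the run-name plumbing (the widget now writes to selected_eval_run_name, which run_evaluation copies into eval_run_name and the clear_* helpers reset) and, separately, caps how much of each query result is rendered into the LLM-judge prompt via the new max_frame_size parameter. Below is a minimal sketch of that truncation idea only; the prompt template, function name, and column names are simplified stand-ins, not the module's LLM_JUDGE_PROMPT_TEMPLATE or _llm_judge.

import pandas as pd

PROMPT = "Question: {input_question}\nAnalyst result:\n{frame1_str}\nGold result:\n{frame2_str}"

def build_judge_prompt(
    question: str,
    analyst: pd.DataFrame,
    gold: pd.DataFrame,
    max_frame_size: int = 200,
) -> str:
    # df[:max_frame_size] keeps at most the first N rows of each frame, so an
    # unexpectedly large query result cannot blow up the prompt sent to the judge.
    return PROMPT.format(
        input_question=question,
        frame1_str=analyst[:max_frame_size].to_string(index=False),
        frame2_str=gold[:max_frame_size].to_string(index=False),
    )

big = pd.DataFrame({"N": range(10_000)})
prompt = build_judge_prompt("How many rows?", big, big, max_frame_size=5)
print(prompt.count("\n"))  # stays small regardless of the result size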
language="sql") - - col1, col2 = st.columns(2) - with col1: - if isinstance(row["ANALYST_RESULT"], str): - st.error(row["ANALYST_RESULT"]) - else: - st.write(row["ANALYST_RESULT"]) - - with col2: - if isinstance(row["GOLD_RESULT"], str): - st.error(row["GOLD_RESULT"]) - else: - st.write(row["GOLD_RESULT"]) - - st.write(f"**Explanation**: {row['EXPLANATION']}") + results_placeholder = st.session_state.get("eval_results_placeholder") + with results_placeholder.container(): + st.markdown( + f"###### Results: {n_correct} out of {n_questions} questions correct with accuracy {accuracy:.2f}%" + ) + for row_id, row in frame.iterrows(): + match_emoji = "โœ…" if row["CORRECT"] else "โŒ" + with st.expander(f"Row ID: {row_id} {match_emoji}"): + st.write(f"Input Query: {row['QUERY']}") + st.write(row["ANALYST_TEXT"].replace("\n", " ")) + + col1, col2 = st.columns(2) + + try: + analyst_sql = sqlglot.parse_one(row["ANALYST_SQL"], dialect="snowflake") + analyst_sql = analyst_sql.sql(dialect="snowflake", pretty=True) + except Exception as e: + logger.warning(f"Error parsing analyst SQL: {e} for {row_id}") + analyst_sql = row["ANALYST_SQL"] + + try: + gold_sql = sqlglot.parse_one(row["GOLD_SQL"], dialect="snowflake") + gold_sql = gold_sql.sql(dialect="snowflake", pretty=True) + except Exception as e: + logger.warning(f"Error parsing gold SQL: {e} for {row_id}") + gold_sql = row["GOLD_SQL"] + + with col1: + st.write("Analyst SQL") + st.code(analyst_sql, language="sql") + + with col2: + st.write("Golden SQL") + st.code(gold_sql, language="sql") + + col1, col2 = st.columns(2) + with col1: + if isinstance(row["ANALYST_RESULT"], str): + st.error(row["ANALYST_RESULT"]) + else: + st.write(row["ANALYST_RESULT"]) + + with col2: + if isinstance(row["GOLD_RESULT"], str): + st.error(row["GOLD_RESULT"]) + else: + st.write(row["GOLD_RESULT"]) + + st.write(f"**Explanation**: {row['EXPLANATION']}") def _llm_judge(frame: pd.DataFrame, max_frame_size = 200) -> pd.DataFrame: @@ -640,6 +642,8 @@ def evaluation_mode_show() -> None: if st.button("Run Evaluation"): run_evaluation() + + if "total_eval_frame" in st.session_state: current_hash = generate_hash(st.session_state["working_yml"]) model_changed_test = current_hash != st.session_state["semantic_model_hash"] @@ -661,6 +665,7 @@ def evaluation_mode_show() -> None: else: st.markdown("#### Current Evaluation Run Summary") st.dataframe(evolution_run_summary, hide_index=True) + st.session_state["eval_results_placeholder"] = st.empty() visualize_eval_results(st.session_state["total_eval_frame"]) @@ -700,6 +705,9 @@ def run_evaluation() -> None: st.write("Running evaluation ...") else: st.write(f"Running evaluation for name: {st.session_state['eval_run_name']} ...") + if "eval_results_placeholder" in st.session_state: + results_placeholder = st.session_state["eval_results_placeholder"] + results_placeholder.empty() st.session_state["eval_timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S") st.session_state["eval_hash"] = generate_hash(st.session_state["eval_timestamp"]) send_analyst_requests()