diff --git a/ui/demo_streamlit/app.py b/ui/demo_streamlit/app.py
index 88f8d39..f3c8e32 100755
--- a/ui/demo_streamlit/app.py
+++ b/ui/demo_streamlit/app.py
@@ -1,17 +1,17 @@
 import streamlit as st
 from routes import ROUTES, get_page
+from utils import load_css
 
+# Configuration
+st.set_page_config(
+    page_title="EG1 - Évaluation",
+    page_icon="ui/demo_streamlit/static/images/eg1_logo.png",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
 
-pg = st.navigation([get_page(route) for route in ROUTES])
-st.set_page_config(layout="wide")
-custom_css = """
-
-"""
-st.markdown(custom_css, unsafe_allow_html=True)
+load_css("style.css")
 
+pg = st.navigation([get_page(route) for route in ROUTES])
 pg.run()
diff --git a/ui/demo_streamlit/static/css/style.css b/ui/demo_streamlit/static/css/style.css
new file mode 100644
index 0000000..1c536c8
--- /dev/null
+++ b/ui/demo_streamlit/static/css/style.css
@@ -0,0 +1,78 @@
+/* Color */
+:root {
+    --principal-title: #000091;
+    --level2-title: #313178;
+    --level3-title: #273961;
+    --button-main: #000091;
+    --background-main: #F6F6F6;
+    --text-main: #161616;
+}
+
+/* System fonts */
+body {
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif;
+    background-color: var(--background-main);
+    color: var(--text-main);
+    line-height: 1.5;
+}
+
+/* Title */
+h1 {
+    color: var(--principal-title);
+    font-weight: 600;
+}
+
+h2 {
+    color: var(--level2-title);
+    font-weight: 600;
+}
+
+h3 {
+    color: var(--level3-title);
+    font-weight: 600;
+}
+
+/* Button */
+.stButton>button {
+    background-color: var(--button-main);
+    color: white;
+    border-radius: 4px;
+    border: none;
+    padding: 0.5rem 1rem;
+    transition: background-color 0.2s ease;
+}
+
+.stButton>button:hover {
+    background-color: #1565c0;
+}
+
+/* Widgets */
+.stSelectbox, .stTextInput {
+    border-radius: 4px;
+    border: 1px solid var(--button-main);
+}
+
+/* DataFrames */
+.dataframe {
+    border-collapse: collapse;
+    width: 100%;
+    margin-bottom: 1rem;
+}
+
+.dataframe th {
+    background-color: var(--principal-title);
+    color: white;
+    padding: 0.5rem;
+    text-align: left;
+}
+
+.dataframe td {
+    border: 1px solid #e0e0e0;
+    padding: 0.5rem;
+}
+
+/* Sidebar */
+.sidebar .sidebar-content {
+    background-color: white;
+    border-right: 1px solid #e0e0e0;
+}
diff --git a/ui/demo_streamlit/static/images/eg1_logo.png b/ui/demo_streamlit/static/images/eg1_logo.png
new file mode 100644
index 0000000..1351c26
Binary files /dev/null and b/ui/demo_streamlit/static/images/eg1_logo.png differ
diff --git a/ui/demo_streamlit/utils.py b/ui/demo_streamlit/utils.py
index 1c1c1f4..5607b4c 100755
--- a/ui/demo_streamlit/utils.py
+++ b/ui/demo_streamlit/utils.py
@@ -1,3 +1,4 @@
+import os
 import requests
 import streamlit as st
 
@@ -19,3 +20,9 @@ def fetch(method, endpoint, data=None):
     else:
         st.error(f"Failed to fetch data from {endpoint}.")
         return None
+
+def load_css(file_name):
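+    # NOTE (assumption): the CSS path is resolved against the current working
+    # directory, so the app is expected to be launched from the repository root.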
st.write("""Avalaible datasets @@ -25,7 +25,6 @@ def main(): for dataset in datasets: when = datetime.fromisoformat(dataset["created_at"]).strftime("%d %B %Y") with st.container(): - # Add an anchor for navigation st.markdown( f"
", unsafe_allow_html=True, @@ -44,7 +43,6 @@ def main(): st.caption(f"Created the {when}") st.divider() - # Navigation menu with right_menu: st.markdown("###### Quick Navigation") for dataset in datasets: diff --git a/ui/demo_streamlit/views/experiments.py b/ui/demo_streamlit/views/experiments.py index 1080526..8ed09c0 100755 --- a/ui/demo_streamlit/views/experiments.py +++ b/ui/demo_streamlit/views/experiments.py @@ -1,134 +1,137 @@ import streamlit as st import pandas as pd import numpy as np +from typing import Dict, List, Tuple, Optional from utils import fetch - - -def fetch_all_experiments(): - endpoint = "/experiments" - return fetch("get", endpoint) - - -def fetch_experiment_results(exp_id): - endpoint = f"/experiment/{exp_id}" - params = {"with_results": "true"} - return fetch("get", endpoint, params) - - -def process_experiment_results(experiment): - results = experiment.get("results", []) - df_metrics = {} - - for metric_results in results: - metric_name = metric_results["metric_name"] - arr = np.array( - [x["score"] for x in metric_results["observation_table"] if pd.notna(x["score"])] - ) - - if len(arr) > 0: - df = pd.DataFrame( - [ - [ - np.mean(arr), - np.std(arr), - np.median(arr), - f"{arr.mean():.2f} ± {arr.std():.2f}", - len(arr), - ] - ], - columns=["mean", "std", "median", "mean_std", "support"], - ) - - df_metrics[metric_name] = df - +from io import StringIO + +# Constants for warning +FINISHED_STATUS = "finished" +UNKNOWN_DATASET = "Unknown Dataset" +UNKNOWN_MODEL = "Unknown Model" + +@st.cache_data +def fetch_all_experiments() -> List[Dict]: + return fetch("get", "/experiments") + +@st.cache_data +def fetch_experiment_results(exp_id: int) -> Dict: + return fetch("get", f"/experiment/{exp_id}", {"with_dataset": "true"}) + +def process_experiment_data(response: Dict) -> Tuple[Optional[pd.DataFrame], str, str]: + if not response: + return None, UNKNOWN_DATASET, UNKNOWN_MODEL + + df = pd.read_json(StringIO(response["dataset"]["df"])) + + if "answers" in response: + df["answer"] = df.index.map({answer["num_line"]: answer["answer"] for answer in response["answers"]}) + + if "results" in response: + for result in response["results"]: + metric_name = result["metric_name"] + df[f"result_{metric_name}"] = df.index.map({obs["num_line"]: obs["score"] for obs in result["observation_table"]}) + + return df, response.get("dataset", {}).get("name", UNKNOWN_DATASET), response.get("model", {}).get("name", UNKNOWN_MODEL) + +def calculate_metric_stats(arr: np.array) -> Dict[str, float]: + return { + "mean": np.mean(arr), + "std": np.std(arr), + "median": np.median(arr), + "mean_std": f"{arr.mean():.2f} ± {arr.std():.2f}", + "support": len(arr) + } + +def process_experiment_results(experiment: Dict) -> pd.DataFrame: + df_metrics = { + metric_results["metric_name"]: pd.DataFrame([calculate_metric_stats(np.array([x["score"] for x in metric_results["observation_table"] if pd.notna(x["score"])]))]) + for metric_results in experiment.get("results", []) + if len([x["score"] for x in metric_results["observation_table"] if pd.notna(x["score"])]) > 0 + } return pd.DataFrame( {metric_name: df["mean_std"].iloc[0] for metric_name, df in sorted(df_metrics.items())}, - index=[experiment["name"]], + index=[experiment["name"]] ) - -def display_all_experiments(): - experiments = fetch_all_experiments() - - if not experiments: - st.error("No experiments found.") - return - - formatted_experiments = [] - - for exp in experiments: - if exp["experiment_status"] == "finished" and exp["experiment_set_id"] is 
+            **{f"{result['metric_name']}_score": f"{sum(obs['score'] for obs in result['observation_table'] if obs['score'] is not None) / len([obs for obs in result['observation_table'] if obs['score'] is not None]):.2f}"
+               for result in exp.get("results", []) if any(obs['score'] is not None for obs in result['observation_table'])}
+        }
+        for exp in experiments
+        if exp["experiment_status"] == FINISHED_STATUS and exp["experiment_set_id"] is None
+    ]
+    return pd.DataFrame(formatted_experiments)
+
+def display_experiment_results(exp_id: int):
     experiment = fetch_experiment_results(exp_id)
     if not experiment:
+        st.error(f"No results found for experiment {exp_id}")
         return
 
-    if experiment["experiment_status"] != "finished":
+    if experiment["experiment_status"] != FINISHED_STATUS:
         st.warning(f"Experiment {exp_id} is not finished yet...")
 
     if experiment["num_success"] != experiment["num_try"]:
-        st.warning("Warning: some experiments are failed.")
+        st.warning("Warning: some experiments have failed.")
 
     if experiment["num_observation_success"] != experiment["num_observation_try"]:
-        st.warning("Warning: some metrics are failed.")
+        st.warning("Warning: some metrics have failed.")
 
     results_df = process_experiment_results(experiment)
+    df_with_results, dataset_name, model_name = process_experiment_data(experiment)
+
+    cols = st.columns(3)
+    cols[0].write(f"**Dataset:** {dataset_name}")
+    cols[1].write(f"**Model:** {model_name}")
 
     if not results_df.empty:
+        st.subheader("Aggregated Results")
         st.dataframe(results_df)
+
+        st.subheader("Detailed Results")
+        st.dataframe(df_with_results)
     else:
         st.info("No results available for this experiment.")
 
-
 def main():
     st.title("Experiments (not in a Set)")
-    st.info("Here, you can see the experiments that are not in evaluation sets. ")
-
-    options_button = ["View All Experiments (finished)", "View Experiment by ID"]
-    view_option = st.radio("Select View Option", options_button)
+    st.info("Here, you can see the experiments that are not in evaluation sets.")
 
-    if view_option == "View All Experiments (finished)":
-        display_all_experiments()
+    st.subheader("All Experiments (finished)")
+    experiments = fetch_all_experiments()
+
+    if not experiments:
+        st.error("No experiments found.")
     else:
-        exp_id = st.number_input("Enter Experiment ID", min_value=1, step=1)
-        if st.button("Show Results"):
-            display_experiment_results(exp_id)
+        df = preprocess_experiments(experiments)
+
+        metric_columns = [col for col in df.columns if col.endswith("_score")]
+        df = df[df[metric_columns].notna().any(axis=1)]
+
+        st.dataframe(df)
+        st.divider()
+
+        if not df.empty:
+            st.markdown("### Select a finished experiment to view details:")
+            selected_exp_id = st.selectbox(
+                label="",
+                options=df["id"].tolist(),
+                format_func=lambda x: f"Experiment {x}",
+                label_visibility="collapsed"
+            )
+            if st.button("Show Selected Experiment Results"):
+                display_experiment_results(selected_exp_id)
+        else:
+            st.info("No finished experiments found.")
 
 main()
+
diff --git a/ui/demo_streamlit/views/experiments_set.py b/ui/demo_streamlit/views/experiments_set.py
index 5afccdf..6c37004 100755
--- a/ui/demo_streamlit/views/experiments_set.py
+++ b/ui/demo_streamlit/views/experiments_set.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 from datetime import datetime
 import streamlit as st
@@ -32,7 +33,10 @@ def _get_expset_status(expset: dict) -> tuple[dict, dict]:
     return status, counts
 
 
-def get_experiment_data(exp_id):
+def _get_experiment_data(exp_id):
+    """
+    For a given exp_id, return the query, the true answer, the LLM answer and the metrics.
+    """
     response = fetch("get", f"/experiment/{exp_id}", {"with_dataset": "true"})
     if not response:
         return None
@@ -56,10 +60,14 @@ def _get_experiment_data(exp_id):
 
 
 def display_experiment_set_overview(expset, experiments_df):
+    """
+    Display the experiments of the set and their associated status.
+    """
+    status, counts = _get_expset_status(expset)
-    st.write(f"## Overview of experiment set: {expset['name']}")
-    st.write(f"experiment_set id: {expset['id']}")
-    finished_ratio = int(counts["total_observation_successes"] / counts["observation_length"] * 100)
+    st.markdown(f"## Overview of experiment set: {expset['name']}")
+    st.markdown(f"experiment_set id: {expset['id']}")
+    finished_ratio = int(counts["total_observation_successes"] / counts["observation_length"] * 100)
     st.markdown(f"Finished: {finished_ratio}%", unsafe_allow_html=True)
     failure_ratio = int(
         (counts["total_observation_tries"] - counts["total_observation_successes"])
@@ -70,6 +78,9 @@ def display_experiment_set_overview(expset, experiments_df):
     st.markdown(
         f"Failure: {failure_ratio}%", unsafe_allow_html=True
     )
+
+
+    st.markdown(expset.get("readme", "No description available"))
 
     row_height = 35
     header_height = 35
@@ -85,17 +96,10 @@ def display_experiment_set_overview(expset, experiments_df):
     )
 
 
-def display_experiment_set_result(expset, experiments_df):
-    st.write("## Results of the Experiment Set")
-
-    total_experiments = len(experiments_df)
-    total_success = experiments_df["Num success"].sum()
-
-    st.write(f"Total Experiments: {total_experiments}")
-    st.write(f"Total Successful Experiments: {total_success}")
-
-
 def display_experiment_sets(experiment_sets):
+    """
+    Display the list of experiment sets with their status/info.
+    """
     cols = st.columns(3)
     for idx, exp_set in enumerate(experiment_sets):
@@ -157,16 +161,137 @@ def display_experiment_details(experimentset, experiments_df):
     experiment_ids = experiments_df["Id"].tolist()
     selected_exp_id = st.selectbox("Select Experiment ID", experiment_ids)
     if selected_exp_id:
-        df_with_results, dataset_name, model_name = get_experiment_data(selected_exp_id)
+        df_with_results, dataset_name, model_name = _get_experiment_data(selected_exp_id)
         if df_with_results is not None:
-            st.write(f"### Detailed results of the experiment id={selected_exp_id} ")
-            st.write(f"**Dataset:** {dataset_name}")
-            st.write(f"**Model:** {model_name}")
+            cols = st.columns(4)
+            with cols[0]:
+                st.write(f"**experiment_id** n° {selected_exp_id}")
+            with cols[1]:
+                st.write(f"**Dataset:** {dataset_name}")
+            with cols[2]:
+                st.write(f"**Model:** {model_name}")
             st.dataframe(df_with_results)
         else:
             st.error("Failed to fetch experiment data")
 
+
+def process_experiment_results(experimentset):
+    """
+    Process experiment results dynamically across different experiment types.
+    """
+    rows = []
+    metrics = set()
+    experiment_names = [exp["name"] for exp in experimentset.get("experiments", [])]
+
+    is_repeat_mode = _check_repeat_mode(experiment_names)
+
+    for exp in experimentset.get("experiments", []):
+        if exp["experiment_status"] != "finished":
+            st.warning(f"Warning: experiment {exp['id']} is not finished yet...")
+            continue
+
+        response = fetch("get", f"/experiment/{exp['id']}?with_results=true")
+        if not response:
+            continue
+
+        model_name = response["model"]["name"]
+        extra_params = response["model"].get("extra_params", {})
+        variant = _extract_experiment_variant(extra_params)
+        row = {"model": f"{model_name}_{variant}" if variant else model_name}
+
+        for metric_results in response.get("results", []):
+            metric = metric_results["metric_name"]
+            metrics.add(metric)
+            scores = [x["score"] for x in metric_results["observation_table"] if pd.notna(x.get("score"))]
+            if scores:
+                row[f"{metric}_mean"] = np.mean(scores)
+                row[f"{metric}_std"] = np.std(scores)
+                row[f"{metric}_support"] = len(scores)
+
+        rows.append(row)
+
+    if not rows:
+        st.error("No valid experiment results found")
+        return None
+
+    df = pd.DataFrame(rows)
+
+    if is_repeat_mode:
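+        # Repeat mode: the same experiment run several times; aggregate mean/std across runs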
") - - options_button = ["View All Experiments (finished)", "View Experiment by ID"] - view_option = st.radio("Select View Option", options_button) + st.info("Here, you can see the experiments that are not in evaluation sets.") - if view_option == "View All Experiments (finished)": - display_all_experiments() + st.subheader("All Experiments (finished)") + experiments = fetch_all_experiments() + + if not experiments: + st.error("No experiments found.") else: - exp_id = st.number_input("Enter Experiment ID", min_value=1, step=1) - if st.button("Show Results"): - display_experiment_results(exp_id) + df = preprocess_experiments(experiments) + + metric_columns = [col for col in df.columns if col.endswith("_score")] + df = df[df[metric_columns].notna().any(axis=1)] + + st.dataframe(df) + st.divider() + + if not df.empty: + st.markdown("### Select a finished experiment to view details:") + selected_exp_id = st.selectbox( + label="", + options=df["id"].tolist(), + format_func=lambda x: f"Experiment {x}", + label_visibility="collapsed" + ) + if st.button("Show Selected Experiment Results"): + display_experiment_results(selected_exp_id) + else: + st.info("No finished experiments found.") main() + diff --git a/ui/demo_streamlit/views/experiments_set.py b/ui/demo_streamlit/views/experiments_set.py index 5afccdf..6c37004 100755 --- a/ui/demo_streamlit/views/experiments_set.py +++ b/ui/demo_streamlit/views/experiments_set.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd from datetime import datetime import streamlit as st @@ -32,7 +33,10 @@ def _get_expset_status(expset: dict) -> tuple[dict, dict]: return status, counts -def get_experiment_data(exp_id): +def _get_experiment_data(exp_id): + """ + for each exp_id, returns query, answer true, answer llm and metrics + """ response = fetch("get", f"/experiment/{exp_id}", {"with_dataset": "true"}) if not response: return None @@ -56,10 +60,14 @@ def get_experiment_data(exp_id): def display_experiment_set_overview(expset, experiments_df): + """ + returns a dataframe with the list of Experiments and the associated status + """ + status, counts = _get_expset_status(expset) - st.write(f"## Overview of experiment set: {expset['name']}") - st.write(f"experiment_set id: {expset['id']}") - finished_ratio = int(counts["total_observation_successes"] / counts["observation_length"] * 100) + st.markdown(f"## Overview of experiment set: ~~ {expset['name']} ~~") + st.markdown(f"experiment_set id: {expset['id']}") + finished_ratio = int(counts["total_observation_successes"] // counts["observation_length"] * 100) st.markdown(f"Finished: {finished_ratio}%", unsafe_allow_html=True) failure_ratio = int( (counts["total_observation_tries"] - counts["total_observation_successes"]) @@ -70,6 +78,9 @@ def display_experiment_set_overview(expset, experiments_df): st.markdown( f"Failure: {failure_ratio}%", unsafe_allow_html=True ) + + + st.markdown(expset.get("readme", "No description available")) row_height = 35 header_height = 35 @@ -85,17 +96,10 @@ def display_experiment_set_overview(expset, experiments_df): ) -def display_experiment_set_result(expset, experiments_df): - st.write("## Results of the Experiment Set") - - total_experiments = len(experiments_df) - total_success = experiments_df["Num success"].sum() - - st.write(f"Total Experiments: {total_experiments}") - st.write(f"Total Successful Experiments: {total_success}") - - def display_experiment_sets(experiment_sets): + """ + returns the list of experiments set, with their status/info + """ cols = st.columns(3) for 
+    if len(experiment_names) <= 1:
+        return False
+
+    base_names = [name.rsplit('_', 1)[0] for name in experiment_names]
+
+    if len(set(base_names)) == 1:
+        suffixes = [name.split('_')[-1] for name in experiment_names]
+        return all(suffix.isdigit() for suffix in suffixes)
+
+    return False
+
+def _extract_experiment_variant(extra_params: dict):
+    """
+    Extract a meaningful variant identifier from the extra parameters.
+    """
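+    # e.g. {"rag": {"limit": 5}} -> "limit_5"; otherwise fall back to the first key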
+ """ + preferred_metrics = ['judge_exactness', 'contextual_relevancy'] + for metric in preferred_metrics: + if metric in metrics: + return f"{metric}_mean" + + return list(metrics)[0] + "_mean" if metrics else None + +def display_experiment_results(experimentset): + results_df = process_experiment_results(experimentset) + + if results_df is not None: + st.write("### Experiment Results") + st.dataframe(results_df) + +def display_experiment_set_result(experimentset, experiments_df): + st.write("## Results of the Experiment Set") + + total_experiments = len(experiments_df) + all_successful = (experiments_df["Num try"] == experiments_df["Num success"]).all() + + if all_successful: + display_experiment_results(experimentset) + else: + st.error("Detailed results cannot be displayed as not all experiments are successful") + cols = st.columns(6) + with cols[0]: + st.write(f"Total Experiments: {total_experiments}") + with cols[1]: + st.write(f"Failure Experiments: {total_experiments - (experiments_df['Num success'] > 0).sum()}") + + def main(): if st.session_state.get("experimentset"): # Get the expet