fixed leaderboard formatting

lm-sys · Sep 6, 2024 · 5063c01 · 5063c01
1 parent 4ea8f48
commit 5063c01
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 9 deletions.
diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py
@@ -601,6 +601,14 @@ def bot_response(
     font-size: 105%;
 }
 
+#overview_leaderboard_dataframe table th {
+    font-size: 90%;
+}
+
+#overview_leaderboard_dataframe table td {
+    font-size: 105%;
+}
+
 .tab-nav button {
     font-size: 18px;
 }

diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
@@ -233,7 +233,7 @@ def post_process(self, judgment):
             "is_counting": "counting" in score,
             "is_ocr": "optical character recognition" in score,
             "is_entity_recognition": "entity recognition" in score,
-            "is_creative_composition": "creative composition" in score,
+            "is_creative_composition": "creative writing" in score,
             "is_spatial_reasoning": "spatial reasoning" in score,
             "response": judgment,
         }
diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
@@ -707,7 +707,6 @@ def pretty_print_elo_rating(rating):
     if args.clean_battle_file:
         # Read data from a cleaned battle file
         battles = pd.read_json(args.clean_battle_file)
-        print(battles.columns)
     else:
         # Read data from all log files
         log_files = get_log_files(args.max_num_files)
@@ -718,14 +717,31 @@ def pretty_print_elo_rating(rating):
         "chinese": lambda x: x["language"] == "Chinese",
         "english": lambda x: x["language"] == "English",
         "russian": lambda x: x["language"] == "Russian",
+        "vietnamese": lambda x: x["language"] == "Vietnamese",
         "multiturn": lambda x: x["turn"] > 1,
         "exclude_preset": lambda x: not x["preset"],
+        "no_refusal": lambda x: not x["is_refusal"],
+        "is_captioning": lambda x: x["category_tag"]["vision_v0.1"]["is_captioning"],
+        "is_entity_recognition": lambda x: x["category_tag"]["vision_v0.1"]["is_entity_recognition"],
+        "is_ocr": lambda x: x["category_tag"]["vision_v0.1"]["is_ocr"],
+        "is_counting": lambda x: x["category_tag"]["vision_v0.1"]["is_counting"],
+        "is_creative_composition": lambda x: x["category_tag"]["vision_v0.1"]["is_creative_composition"],
+        "is_spatial_reasoning": lambda x: x["category_tag"]["vision_v0.1"]["is_spatial_reasoning"],
+        "if": lambda x: x["category_tag"]["if_v0.1"]["if"],
+        "math": lambda x: x["category_tag"]["math_v0.1"]["math"],
     }
     assert all(
         [cat in filter_func_map for cat in args.category]
     ), f"Invalid category: {args.category}"
 
     results = {}
+    for cat in args.category:
+        values = battles.apply(filter_func_map[cat], axis=1)
+        # if all values are False, skip
+        print(f"Category {cat} has {values.sum()} battles")
+        if not any(values):
+            print(f"Skipping category {cat}")
+            continue
     for cat in args.category:
         filter_func = filter_func_map[cat]
         results[cat] = report_elo_analysis_results(

diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
@@ -674,12 +674,13 @@ def highlight_top_3(s):
         style = style.apply(highlight_top_3, subset=[category])
 
     if metric == "rating":
-        style = style.background_gradient(
-            cmap="Blues",
-            subset=category_names,
-            vmin=1150,
-            vmax=category_df[category_names].max().max(),
-        )
+        for category in category_names:
+            style = style.background_gradient(
+                cmap="Blues",
+                subset=[category],
+                vmin=category_df[category].max() - 150,
+                vmax=category_df[category].max(),
+            )
 
     return style
 
@@ -705,7 +706,7 @@ def build_category_leaderboard_tab(
         headers=["Model"] + [key_to_category_name[k] for k in categories],
         datatype=["markdown"] + ["str" for k in categories],
         value=full_table_vals,
-        elem_id="full_leaderboard_dataframe",
+        elem_id="overview_leaderboard_dataframe",
         column_widths=[250]
         + categories_width,  # IMPORTANT: THIS IS HARDCODED WITH THE CURRENT CATEGORIES
         height=800,
@@ -731,6 +732,17 @@ def build_category_leaderboard_tab(
 ]
 selected_categories_width = [95, 85, 130, 75, 150, 100, 95, 100]
 
+vision_categories = [
+    "full",
+    "is_captioning",
+    "is_entity_recognition",
+    "is_ocr",
+    "is_creative_composition",
+    "if",
+    "no_refusal",
+]
+vision_categories_width = [90, 90, 90, 50, 80, 95, 90, 80]
+
 language_categories = [
     "english",
     "chinese",
@@ -838,6 +850,14 @@ def build_leaderboard_tab(
                     language_categories,
                     language_categories_width,
                 )
+                if elo_results_vision is not None:
+                    vision_combined_table = get_combined_table(elo_results_vision, model_table_df)
+                    build_category_leaderboard_tab(
+                        vision_combined_table,
+                        "Vision",
+                        vision_categories,
+                        vision_categories_width,
+                    )
                 gr.Markdown(
                     f"""
             ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.

diff --git a/fastchat/serve/monitor/monitor_md.py b/fastchat/serve/monitor/monitor_md.py
@@ -30,6 +30,12 @@
     "full_style_control": "Overall (Style Control)",
     "hard_6_style_control": "Hard Prompts (Overall) (Style Control)",
     "exclude_preset": "Exclude Preset",
+    "is_captioning": "Captioning",
+    "is_entity_recognition": "Entity Recognition",
+    "is_ocr": "OCR",
+    "is_counting": "Counting",
+    "is_creative_composition": "Creative Writing",
+    "is_spatial_reasoning": "Spatial Reasoning",
 }
 cat_name_to_explanation = {
     "Overall": "Overall Questions",
@@ -55,6 +61,12 @@
     "overall_limit_5_user_vote": "overall_limit_5_user_vote",
     "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
     "Exclude Preset": "Exclude Preset Images",
+    "Captioning": "Open-Ended Captioning",
+    "Entity Recognition": "Entity Recognition (e.g. who is in the image)",
+    "OCR": "Optical Character Recognition",
+    "Counting": "Counting",
+    "Creative Writing": "Creative Writing (e.g. write a story about this image)",
+    "Spatial Reasoning": "Spatial Reasoning",
 }
 cat_name_to_baseline = {
     "Hard Prompts (English)": "English",