Skip to content

Commit

Permalink
fixed leaderboard formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
lisadunlap committed Sep 6, 2024
1 parent 4ea8f48 commit 5063c01
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 9 deletions.
8 changes: 8 additions & 0 deletions fastchat/serve/gradio_web_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,14 @@ def bot_response(
font-size: 105%;
}
#overview_leaderboard_dataframe table th {
font-size: 90%;
}
#overview_leaderboard_dataframe table td {
font-size: 105%;
}
.tab-nav button {
font-size: 18px;
}
Expand Down
2 changes: 1 addition & 1 deletion fastchat/serve/monitor/classify/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def post_process(self, judgment):
"is_counting": "counting" in score,
"is_ocr": "optical character recognition" in score,
"is_entity_recognition": "entity recognition" in score,
"is_creative_composition": "creative composition" in score,
"is_creative_composition": "creative writing" in score,
"is_spatial_reasoning": "spatial reasoning" in score,
"response": judgment,
}
18 changes: 17 additions & 1 deletion fastchat/serve/monitor/elo_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,6 @@ def pretty_print_elo_rating(rating):
if args.clean_battle_file:
# Read data from a cleaned battle file
battles = pd.read_json(args.clean_battle_file)
print(battles.columns)
else:
# Read data from all log files
log_files = get_log_files(args.max_num_files)
Expand All @@ -718,14 +717,31 @@ def pretty_print_elo_rating(rating):
"chinese": lambda x: x["language"] == "Chinese",
"english": lambda x: x["language"] == "English",
"russian": lambda x: x["language"] == "Russian",
"vietnamese": lambda x: x["language"] == "Vietnamese",
"multiturn": lambda x: x["turn"] > 1,
"exclude_preset": lambda x: not x["preset"],
"no_refusal": lambda x: not x["is_refusal"],
"is_captioning": lambda x: x["category_tag"]["vision_v0.1"]["is_captioning"],
"is_entity_recognition": lambda x: x["category_tag"]["vision_v0.1"]["is_entity_recognition"],
"is_ocr": lambda x: x["category_tag"]["vision_v0.1"]["is_ocr"],
"is_counting": lambda x: x["category_tag"]["vision_v0.1"]["is_counting"],
"is_creative_composition": lambda x: x["category_tag"]["vision_v0.1"]["is_creative_composition"],
"is_spatial_reasoning": lambda x: x["category_tag"]["vision_v0.1"]["is_spatial_reasoning"],
"if": lambda x: x["category_tag"]["if_v0.1"]["if"],
"math": lambda x: x["category_tag"]["math_v0.1"]["math"],
}
assert all(
[cat in filter_func_map for cat in args.category]
), f"Invalid category: {args.category}"

results = {}
for cat in args.category:
values = battles.apply(filter_func_map[cat], axis=1)
# if all values are False, skip
print(f"Category {cat} has {values.sum()} battles")
if not any(values):
print(f"Skipping category {cat}")
continue
for cat in args.category:
filter_func = filter_func_map[cat]
results[cat] = report_elo_analysis_results(
Expand Down
34 changes: 27 additions & 7 deletions fastchat/serve/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,12 +674,13 @@ def highlight_top_3(s):
style = style.apply(highlight_top_3, subset=[category])

if metric == "rating":
style = style.background_gradient(
cmap="Blues",
subset=category_names,
vmin=1150,
vmax=category_df[category_names].max().max(),
)
for category in category_names:
style = style.background_gradient(
cmap="Blues",
subset=[category],
vmin=category_df[category].max() - 150,
vmax=category_df[category].max(),
)

return style

Expand All @@ -705,7 +706,7 @@ def build_category_leaderboard_tab(
headers=["Model"] + [key_to_category_name[k] for k in categories],
datatype=["markdown"] + ["str" for k in categories],
value=full_table_vals,
elem_id="full_leaderboard_dataframe",
elem_id="overview_leaderboard_dataframe",
column_widths=[250]
+ categories_width, # IMPORTANT: THIS IS HARDCODED WITH THE CURRENT CATEGORIES
height=800,
Expand All @@ -731,6 +732,17 @@ def build_category_leaderboard_tab(
]
selected_categories_width = [95, 85, 130, 75, 150, 100, 95, 100]

vision_categories = [
"full",
"is_captioning",
"is_entity_recognition",
"is_ocr",
"is_creative_composition",
"if",
"no_refusal",
]
vision_categories_width = [90, 90, 90, 50, 80, 95, 90, 80]

language_categories = [
"english",
"chinese",
Expand Down Expand Up @@ -838,6 +850,14 @@ def build_leaderboard_tab(
language_categories,
language_categories_width,
)
if elo_results_vision is not None:
vision_combined_table = get_combined_table(elo_results_vision, model_table_df)
build_category_leaderboard_tab(
vision_combined_table,
"Vision",
vision_categories,
vision_categories_width,
)
gr.Markdown(
f"""
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
Expand Down
12 changes: 12 additions & 0 deletions fastchat/serve/monitor/monitor_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@
"full_style_control": "Overall (Style Control)",
"hard_6_style_control": "Hard Prompts (Overall) (Style Control)",
"exclude_preset": "Exclude Preset",
"is_captioning": "Captioning",
"is_entity_recognition": "Entity Recognition",
"is_ocr": "OCR",
"is_counting": "Counting",
"is_creative_composition": "Creative Writing",
"is_spatial_reasoning": "Spatial Reasoning",
}
cat_name_to_explanation = {
"Overall": "Overall Questions",
Expand All @@ -55,6 +61,12 @@
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
"Exclude Preset": "Exclude Preset Images",
"Captioning": "Open-Ended Captioning",
"Entity Recognition": "Entity Recognition (e.g. who is in the image)",
"OCR": "Optical Character Recognition",
"Counting": "Counting",
"Creative Writing": "Creative Writing (e.g. write a story about this image)",
"Spatial Reasoning": "Spatial Reasoning",
}
cat_name_to_baseline = {
"Hard Prompts (English)": "English",
Expand Down

0 comments on commit 5063c01

Please sign in to comment.