yuchenlin committed on
Commit
1757118
•
1 Parent(s): 1c919b3

formatting

Browse files
Files changed (6) hide show
  1. .gitignore +2 -5
  2. app.py +31 -34
  3. data_utils.py +2 -5
  4. model_info.json +1 -0
  5. update_data.sh +4 -40
  6. utils_display.py +6 -1
.gitignore CHANGED
@@ -1,6 +1,3 @@
1
 
2
- *.pyc
3
- ZeroEval-main/.DS_Store
4
- ZeroEval-main/result_dirs/.DS_Store
5
- ZeroEval-main/result_dirs/zebra-grid/.DS_Store
6
- .DS_Store
 
1
 
2
+ *.pyc
3
+ *.DS_Store
 
 
 
app.py CHANGED
@@ -37,59 +37,56 @@ with open("_metrics.md", "r") as f:
37
  original_df = None
38
  # available_models = [] # to be filled in later
39
  available_models = list(model_info.keys())
40
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def _tab_leaderboard():
42
- global original_df, available_models, gpt4t_dfs, haiku_dfs, llama_dfs, score_df
43
  with gr.TabItem("πŸ“Š Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
44
  default_main_df = original_df.copy()
45
- default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
46
- default_main_df_no_task = default_main_df.copy()
47
- # default_main_df_no_task = hide_task_column(default_main_df)
48
- # default_main_df_no_task = rerank(default_main_df_no_task, rank_column=WB_ELO_COLUMN)
49
- # default_main_df_no_task = rerank(default_main_df_no_task, rank_column=HYBRID_AVG_COLUMN)
50
- with gr.Row():
51
- # with gr.Column(scale=5):
52
- # gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small top-left-LP")
53
- # with gr.Row():
54
- # with gr.Column(scale=2):
55
- # md = gr.Markdown(" ### πŸ‘€ More presentation options ⬇️", elem_classes="markdown-text")
56
-
57
- # with gr.Column(scale=3):
58
- # with gr.Column(scale=2):
59
- # gr.Markdown(f"""**__πŸͺ§ Default options:__** K={DEFAULT_K}; Hybrid-Macro; for best corr w/ LMSYS Elo.""", elem_classes="markdown-text")
60
-
61
-
62
- # gr.Markdown(LENGTH_MARGIN_DESC_MD, elem_classes="markdown-text-tiny no_margin")
63
- with gr.Column(scale=5):
64
- with gr.Accordion("πŸ’¬ Metric explanations", open=False, elem_classes="accordion-label"):
65
- gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small no_margin")
66
- rank_column_radio = gr.Radio(["πŸ†š+πŸ’― Hybrid", "πŸ†š Reward-Mix (Pairwise)", "πŸ’― Score (Individual)", "🌟 WB Elo (beta)" ], show_label=False, elem_id="rank-column-radio",
67
- value="🌟 WB Elo (beta)"
68
- # value="πŸ†š+πŸ’― Hybrid"
69
- )
70
- with gr.Column(scale=2):
71
- with gr.Row():
72
- checkbox_show_task_categorized = gr.Checkbox(label="πŸ†š by Task Type", elem_id="show-task-categorized", value=False)
73
- show_open_source_model_only = gr.Checkbox(label="πŸ”‘ Open Models", elem_id="show-open-source-models", value=False)
74
  # with gr.Row():
75
  # with gr.Column(scale=2):
76
 
77
  leaderboard_table = gr.components.Dataframe(
78
- value=default_main_df_no_task,
79
  datatype= ["number", "markdown", "markdown", "number"],
80
  # max_rows=None,
81
  height=6000,
82
  elem_id="leaderboard-table",
83
  interactive=False,
84
  visible=True,
85
- column_widths=[50, 260,120, 120, 120, 130,100,100,110,100],
86
  wrap=True
87
  # min_width=60,
88
  )
89
  # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
90
  # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
91
  # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
92
-
 
93
 
94
 
95
  def _tab_submit():
 
37
  original_df = None
38
  # available_models = [] # to be filled in later
39
  available_models = list(model_info.keys())
40
+
41
def df_filters(mode_selection_radio, show_open_source_model_only):
    """Return the leaderboard rows for the selected decoding mode, ranked from 1.

    Args:
        mode_selection_radio: One of "greedy", "sampling (Temp=0.5)" or "all".
        show_open_source_model_only: Accepted so the signature matches the
            event wiring; it is not consulted by this function.

    Returns:
        A new DataFrame containing only rows whose "Mode" matches the
        selection, with rows whose "Model" text contains the hidden marker
        removed, and an unnamed first column numbering the rows 1..N.

    Raises:
        KeyError: If ``mode_selection_radio`` is not one of the known modes.

    Side effects:
        Rebinds the module-level ``original_df`` to the frame without the
        marked rows, so later calls start from the already-filtered data.
    """
    global original_df

    # Marker text that flags rows to drop.
    # NOTE(review): this literal looks mis-encoded in this view (probably the
    # "cross mark" emoji) -- confirm against the real file before merging.
    hidden_marker = "❌"

    # na=False: a missing model name is treated as "not hidden" instead of
    # producing NaN in the mask, which would make the `~` below raise.
    # regex=False: the marker is a plain substring, not a pattern.
    is_hidden = original_df["Model"].str.contains(hidden_marker, regex=False, na=False)
    original_df = original_df[~is_hidden]

    modes = {
        "greedy": ["greedy"],
        "sampling (Temp=0.5)": ["sampling"],
        "all": ["greedy", "sampling"],
    }

    # Copy before inserting so the rank column is added to an independent
    # frame rather than to a slice derived from the shared global.
    selected = original_df[original_df["Mode"].isin(modes[mode_selection_radio])].copy()
    selected.insert(0, "", range(1, 1 + len(selected)))
    return selected
55
+
56
def _gstr(text):
    """Build a ``gr.Text`` component holding ``text`` with ``visible=False``."""
    hidden_text = gr.Text(text, visible=False)
    return hidden_text
58
+
59
  def _tab_leaderboard():
60
+ global original_df, available_models
61
  with gr.TabItem("πŸ“Š Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
62
  default_main_df = original_df.copy()
63
+ # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
64
+ # default_main_df_no_task = default_main_df.copy()
65
+ default_mode = "greedy"
66
+ default_main_df = df_filters(default_mode, False)
67
+ with gr.Row():
68
+ with gr.Column(scale=5):
69
+ mode_selection_radio = gr.Radio(["greedy", "sampling (Temp=0.5)", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # with gr.Row():
71
  # with gr.Column(scale=2):
72
 
73
  leaderboard_table = gr.components.Dataframe(
74
+ value=default_main_df,
75
  datatype= ["number", "markdown", "markdown", "number"],
76
  # max_rows=None,
77
  height=6000,
78
  elem_id="leaderboard-table",
79
  interactive=False,
80
  visible=True,
81
+ column_widths=[50, 260, 100, 100, 120, 120, 100,100,110,100],
82
  wrap=True
83
  # min_width=60,
84
  )
85
  # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
86
  # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
87
  # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
88
+ mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
89
+
90
 
91
 
92
  def _tab_submit():
data_utils.py CHANGED
@@ -32,11 +32,8 @@ def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_column
32
  if col == "Model" and click_url:
33
  df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
34
  else:
35
- df[col] = df[col].apply(formatter) # For numerical values
36
- if "Elo" in col:
37
- df[col] = df[col].replace('-', np.nan).astype(float)
38
-
39
-
40
  df.rename(columns=column_names, inplace=True)
41
  list_columns = [col for col in ordered_columns if col in df.columns]
42
  df = df[list_columns]
 
32
  if col == "Model" and click_url:
33
  df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
34
  else:
35
+ df[col] = df[col].apply(formatter) # For numerical values
36
+
 
 
 
37
  df.rename(columns=column_names, inplace=True)
38
  list_columns = [col for col in ordered_columns if col in df.columns]
39
  df = df[list_columns]
model_info.json CHANGED
@@ -53,6 +53,7 @@
53
  "deepseek-coder": {"pretty_name": "DeepSeek-Coder-V2", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
54
  "gemma-2-27b-it@nvidia": {"pretty_name": "Gemma-2-27B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-27b-it"},
55
  "gemma-2-9b-it@nvidia": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it"},
 
56
  "neo_7b_instruct_v0.1": {"pretty_name": "Neo-7B-Instruct", "hf_model_id": "m-a-p/neo_7b_instruct_v0.1"},
57
  "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-chat"},
58
  "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
 
53
  "deepseek-coder": {"pretty_name": "DeepSeek-Coder-V2", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
54
  "gemma-2-27b-it@nvidia": {"pretty_name": "Gemma-2-27B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-27b-it"},
55
  "gemma-2-9b-it@nvidia": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it"},
56
+ "gemma-2-9b-it": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it", "hidden": true},
57
  "neo_7b_instruct_v0.1": {"pretty_name": "Neo-7B-Instruct", "hf_model_id": "m-a-p/neo_7b_instruct_v0.1"},
58
  "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-chat"},
59
  "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
update_data.sh CHANGED
@@ -1,40 +1,4 @@
1
- TARGET_DIR="ZeroEval-main"
2
-
3
- rm -r $TARGET_DIR
4
- # Download the ZIP file
5
- curl -L -o zeroeval.zip https://github.com/yuchenlin/ZeroEval/archive/refs/heads/main.zip
6
- unzip zeroeval.zip
7
- rm zeroeval.zip
8
-
9
- #!/bin/bash
10
-
11
- # Define the target directory and the exception folder
12
- EXCEPTION_FOLDER="result_dirs"
13
-
14
- # Ensure the target directory exists
15
- if [ -d "$TARGET_DIR" ]; then
16
- # Loop through each item in the target directory
17
- for item in "$TARGET_DIR"/*; do
18
- # Check if it is not the exception folder
19
- if [ "$(basename "$item")" != "$EXCEPTION_FOLDER" ]; then
20
- # Remove the item (file or directory)
21
- rm -rf "$item"
22
- echo "Removed: $item"
23
- fi
24
- done
25
- else
26
- echo "Target directory does not exist: $TARGET_DIR"
27
- fi
28
-
29
- # only keep the result_dirs/zebra-grid under result_dirs folder; remove all other sub-folders under result_dirs
30
- # Remove all subdirectories in result_dirs except zebra-grid
31
- find "$TARGET_DIR/result_dirs" -maxdepth 1 -type d ! -name 'zebra-grid' ! -name 'result_dirs' -exec rm -rf {} +
32
-
33
- rm -rf $TARGET_DIR/.github
34
- rm -rf $TARGET_DIR/.gitignore
35
-
36
-
37
- # tables
38
-
39
-
40
- # bash update_table.sh
 
1
+ # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
2
+ # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
3
+ mkdir -p ZeroEval-main/result_dirs
4
+ wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils_display.py CHANGED
@@ -7,7 +7,9 @@ def make_clickable_model(model_name):
7
  global model_info
8
  modified_model_name = model_name
9
  if model_name in model_info:
10
- if model_info[model_name]["hf_model_id"].startswith("http"):
 
 
11
  link = model_info[model_name]["hf_model_id"]
12
  modified_model_name = f'πŸ”’ <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
13
  else:
@@ -16,6 +18,9 @@ def make_clickable_model(model_name):
16
  if "Neo-7B" in modified_model_name:
17
  # models that are fully open source
18
  modified_model_name = modified_model_name.replace("πŸ”‘", "πŸ’ŽπŸ”‘")
 
 
 
19
 
20
  if "🚨</a>" in modified_model_name:
21
  modified_model_name = modified_model_name.replace(' 🚨</a>', '</a> 🚨')
 
7
  global model_info
8
  modified_model_name = model_name
9
  if model_name in model_info:
10
+ is_open_model = model_info[model_name]["hf_model_id"].startswith("http")
11
+ is_open_model = model_info[model_name].get("open", False)
12
+ if not is_open_model:
13
  link = model_info[model_name]["hf_model_id"]
14
  modified_model_name = f'πŸ”’ <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
15
  else:
 
18
  if "Neo-7B" in modified_model_name:
19
  # models that are fully open source
20
  modified_model_name = modified_model_name.replace("πŸ”‘", "πŸ’ŽπŸ”‘")
21
+ hidden = model_info[model_name].get("hidden", False)
22
+ if hidden:
23
+ modified_model_name = f'❌ {modified_model_name}'
24
 
25
  if "🚨</a>" in modified_model_name:
26
  modified_model_name = modified_model_name.replace(' 🚨</a>', '</a> 🚨')