import logging
import os

import pandas as pd  # type: ignore[import]
from datasets import (get_dataset_config_names,  # type: ignore[import]
                      load_dataset)

from .leaderboard_formatting import (COLUMNS_PRETTY, METRICS_PER_TASK,
                                     SORT_COLUMN_PER_TASK,
                                     get_columns_per_task)
from .tasks import TASKS_PRETTY_REVERSE

# Dataset configs double as task ids, so the available tasks are the config names.
# DATASET_ID must be set in the environment before this module is imported.
AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])


def _get_results_stub() -> pd.DataFrame:
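    """Build a placeholder leaderboard with metric values masked as "X"."""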
    stub_df = pd.DataFrame(
        [
            {
                "Model Name": "GPT-4",
                "Availability": "Proprietary",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
            {
                "Model Name": "CodeLlama-7b (instruct)",
                "Availability": "Llama 2 license",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
        ]
    )
    return stub_df


def _get_results_dataset(task_id: str) -> pd.DataFrame:
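    """Load the results for `task_id` from the dataset and format them for display."""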
    results_df = load_dataset(
        os.environ["DATASET_ID"], task_id, split="test"
    ).to_pandas()
    results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
    results_df["Context Size"] = results_df["Context Size"].map(
        lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x
    )

    # Sort rows by the task-specific sort column, best scores first.
    results_df = results_df.sort_values(
        by=SORT_COLUMN_PER_TASK[task_id], ascending=False
    )

    # Render every metric with two decimal places for display.
    for metric_column in METRICS_PER_TASK[task_id]:
        results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")

    # Keep only the columns relevant to this task, in their display order.
    results_df = results_df[get_columns_per_task(task_id)]
    return results_df


def get_results_for_task(task_pretty: str) -> pd.DataFrame:
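    """Return the leaderboard for a task given its pretty (display) name.

    Falls back to a stub table when the task has no results in the dataset yet.
    """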
    task_id = TASKS_PRETTY_REVERSE[task_pretty]
    if task_id in AVAILABLE_TASKS:
        logging.info(f"Retrieving results for {task_pretty}...")
        return _get_results_dataset(task_id)
    logging.info(f"Generating leaderboard stub for {task_pretty}...")
    return _get_results_stub()
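

# Hypothetical usage sketch (not part of this module): it assumes DATASET_ID is
# exported before the module is imported (AVAILABLE_TASKS is resolved at import
# time) and that the pretty task name is a key of TASKS_PRETTY_REVERSE; the
# dataset id, package path, and task name below are placeholders.
#
#   $ export DATASET_ID="<org>/<results-dataset>"
#   >>> from <this_package>.<this_module> import get_results_for_task
#   >>> get_results_for_task("<Task Pretty Name>").head()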