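"""Leaderboard table construction for the 🏟 Long Code Arena benchmarks.

Results are read from the Hugging Face dataset named by the ``DATASET_ID``
environment variable; tasks without published results fall back to a static
stub table.
"""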
import logging
import os
import re

import pandas as pd  # type: ignore[import]
from datasets import get_dataset_config_names, load_dataset  # type: ignore[import]

from .formatting import model_hyperlink
from .leaderboard_formatting import (
    COLUMNS_PRETTY,
    METRICS_PER_TASK,
    SORT_COLUMN_PER_TASK,
    get_columns_per_task,
)
from .tasks_content import TASKS_PRETTY_REVERSE
from .utils import MD_LINK_PATTERN

try:
    AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])
except (KeyError, FileNotFoundError):
    # Fall back to stub leaderboards when DATASET_ID is unset or the dataset cannot be found.
    AVAILABLE_TASKS = []


def _get_results_stub() -> pd.DataFrame:
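    """Build a placeholder leaderboard with "X" in place of real metric values."""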
    stub_df = pd.DataFrame(
        [
            {
                "Model Name": "GPT-4",
                "Availability": "Proprietary",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
                "Resources": "",
            },
            {
                "Model Name": "CodeLlama-7b (instruct)",
                "Availability": "Llama 2 license",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
                "Resources": "",
            },
        ]
    )
    return stub_df


def _process_urls(raw_urls: str) -> str:
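    """Render a comma-separated string of Markdown links as HTML hyperlinks."""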
    if not raw_urls:
        return raw_urls
    html_urls = []
    for url in raw_urls.split(","):
        match = re.search(MD_LINK_PATTERN, url.strip())
        # Keep entries that are not valid Markdown links as-is instead of raising AttributeError.
        html_urls.append(model_hyperlink(*match.groups()) if match else url.strip())
    return ", ".join(html_urls)


def _get_results_dataset(task_id: str) -> pd.DataFrame:
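    """Load the results for ``task_id`` from the Hub and format them for display."""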
    results_df = load_dataset(
        os.environ["DATASET_ID"], task_id, split="test", download_mode="force_redownload"
    ).to_pandas()
    results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
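    # Show context sizes of 1,000+ tokens in "k" notation, e.g. 16000 -> "16k".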
    results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)

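    # Order rows by the task's primary metric, best first.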
    results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)

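    # BERTScore columns get five decimal places; all other metrics are rounded to two.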
    for metric_column in METRICS_PER_TASK[task_id]:
        if "BERTScore" in metric_column:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.5f}")
        else:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")

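    # Render model names as hyperlinks when a model URL is provided.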
    results_df["Model Name"] = [
        model_hyperlink(link=link, model_name=model_name) if link else model_name
        for link, model_name in zip(results_df["model_url"], results_df["Model Name"])
    ]
    results_df["Resources"] = [_process_urls(urls) for urls in results_df["Resources"]]
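    # Keep only the columns configured for this task, in display order.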
    results_df = results_df[get_columns_per_task(task_id)]
    return results_df


def get_results_for_task(task_pretty: str) -> pd.DataFrame:
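    """Return the leaderboard for a task, falling back to a stub when no results are available."""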
    task_id = TASKS_PRETTY_REVERSE[task_pretty]
    if task_id in AVAILABLE_TASKS:
        logging.info(f"Retrieving results for {task_pretty}...")
        return _get_results_dataset(task_id)
    logging.info(f"Generating leaderboard stub for {task_pretty}...")
    return _get_results_stub()