update space
- app.py +23 -21
- src/about.py +2 -2
- src/leaderboard/read_evals.py +6 -5
app.py
CHANGED
@@ -36,7 +36,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)

-### Space initialisation
+# Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -54,6 +54,7 @@ except Exception:


 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+# print("Before calling init_leaderboard:", LEADERBOARD_DF)

 (
     finished_eval_queue_df,
@@ -61,6 +62,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -74,20 +76,20 @@ def init_leaderboard(dataframe):
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
@@ -97,7 +99,7 @@ demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    gr.Markdown(INTRODUCTION_TEXT_ZH, elem_classes="markdown-text")
+    # gr.Markdown(INTRODUCTION_TEXT_ZH, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
@@ -106,16 +108,16 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
-                gr.Markdown(LLM_BENCHMARKS_TEXT_ZH, elem_classes="markdown-text")
+            # with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
+            #     gr.Markdown(LLM_BENCHMARKS_TEXT_ZH, elem_classes="markdown-text")

         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                    with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
-                        gr.Markdown(EVALUATION_QUEUE_TEXT_ZH, elem_classes="markdown-text")
+                    # with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
+                    #     gr.Markdown(EVALUATION_QUEUE_TEXT_ZH, elem_classes="markdown-text")

                 with gr.Column():
                     with gr.Accordion(
@@ -221,4 +223,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(share=True)
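The tail of app.py touched here follows the stock leaderboard pattern: an APScheduler background job restarts the Space every 30 minutes so freshly pushed evaluation files are reloaded, and the Gradio app is then launched, now with share=True. A minimal, self-contained sketch of that pattern is shown below; it assumes the usual gradio, apscheduler, and huggingface_hub APIs, and "my-org/my-space" is only a placeholder for the real REPO_ID.

# Sketch of the scheduled-restart + launch pattern at the bottom of app.py.
# "my-org/my-space" is a placeholder; the real Space reads REPO_ID from its config
# and needs a write token for restart_space() to succeed.
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

API = HfApi()
REPO_ID = "my-org/my-space"  # placeholder


def restart_space():
    # Ask the Hub to restart this Space so newly uploaded results are re-read on boot.
    API.restart_space(repo_id=REPO_ID)


demo = gr.Blocks()
with demo:
    gr.Markdown("Leaderboard placeholder")

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()

# share=True also requests a temporary public gradio.live link, which mainly
# matters when running the app locally rather than on a Space.
demo.queue(default_concurrency_limit=40).launch(share=True)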
src/about.py
CHANGED
@@ -13,8 +13,8 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("mmmlu", "acc", "MMMLU")
-    task1 = Task("mmlu", "acc", "MMLU")
-    task2 = Task("cmmlu", "acc", "CMMLU")
+    # task1 = Task("mmlu", "acc", "MMLU")
+    # task2 = Task("cmmlu", "acc", "CMMLU")
     task3 = Task("mmmlu_ar", "acc", "MMMLU_AR")
     task4 = Task("mmmlu_bn", "acc", "MMMLU_BN")
     task5 = Task("mmmlu_de", "acc", "MMMLU_DE")
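The Tasks enum above is what the rest of the Space iterates to build benchmark columns and to size the average, so commenting out task1 and task2 drops MMLU and CMMLU from both. A small sketch of that relationship, assuming the Task field names from the stock leaderboard template (benchmark, metric, col_name), which may differ slightly in this repo:

# Sketch of how a Tasks enum is consumed downstream (template-style field names).
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task_key in the results json
    metric: str     # metric_key in the results json
    col_name: str   # column name shown on the leaderboard


class Tasks(Enum):
    task0 = Task("mmmlu", "acc", "MMMLU")
    # task1 = Task("mmlu", "acc", "MMLU")      # disabled in this commit
    # task2 = Task("cmmlu", "acc", "CMMLU")    # disabled in this commit
    task3 = Task("mmmlu_ar", "acc", "MMMLU_AR")


# Downstream code iterates the enum, so a commented-out member disappears from
# the displayed columns and from len(Tasks), which the average divides by.
benchmark_cols = [task.value.col_name for task in Tasks]
print(benchmark_cols)  # ['MMMLU', 'MMMLU_AR']
print(len(Tasks))      # 2 in this reduced sketch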
src/leaderboard/read_evals.py
CHANGED
@@ -96,7 +96,7 @@ class EvalResult:
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model.split("/")[-1], self.precision.value.name)
-        # print("########",
+        # print("########",requests_path,self.full_model.split("/")[-1])

         try:
             with open(request_file, "r") as f:
@@ -112,9 +112,10 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        keys_to_average = ['mmmlu', 'mmlu', 'cmmlu']
-        average = sum([self.results[key] for key in keys_to_average if self.results.get(key) is not None]) / len(
-            keys_to_average)
+        # keys_to_average = ['mmmlu', 'mmlu', 'cmmlu']
+        # average = sum([self.results[key] for key in keys_to_average if self.results.get(key) is not None]) / len(
+        #     keys_to_average)
+        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -182,6 +183,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        print(results_path)
         eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
@@ -198,5 +200,4 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             results.append(v)
         except KeyError: # not all eval values present
             continue
-
     return results
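The to_dict change above replaces a hard-coded three-benchmark average with an average over every non-None score divided by len(Tasks). A standalone sketch of the difference, using made-up scores in place of EvalResult.results and a placeholder task count in place of len(Tasks):

# Contrast of the old and new averaging strategies in EvalResult.to_dict().
# `results` and NUM_TASKS are illustrative stand-ins, not values from the repo.
results = {"mmmlu": 0.61, "mmmlu_ar": 0.48, "mmmlu_bn": None}
NUM_TASKS = 3  # stands in for len(Tasks)

# Old (now commented out): average only a hard-coded subset of benchmarks,
# dividing by the size of that subset even when some keys are missing.
keys_to_average = ["mmmlu", "mmlu", "cmmlu"]
present = [results[k] for k in keys_to_average if results.get(k) is not None]
old_average = sum(present) / len(keys_to_average)

# New: average every non-None score but divide by the full task count,
# so a missing benchmark lowers the average instead of being ignored.
new_average = sum(v for v in results.values() if v is not None) / NUM_TASKS

print(round(old_average, 3))  # 0.203
print(round(new_average, 3))  # 0.363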