Spaces:
Running
Running
| import csv | |
| from collections import defaultdict | |
| import gradio as gr | |
| import pandas as pd | |
def strip_colname(x):
    """Return column name *x* without a leading ``"score_"`` prefix.

    Names that do not start with the prefix are returned unchanged.
    """
    prefix = "score_"
    # Slice by the prefix length instead of a hard-coded 6 so the two
    # cannot drift apart if the prefix is ever changed.
    if x.startswith(prefix):
        return x[len(prefix):]
    return x
# Markdown shown right under the page title of the leaderboard tab.
INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.
A smarter leaderboard and the code for reproducing the evaluations will be published soon!
"""

# Markdown explaining how the language codes used in the leaderboard are built.
LANGS_EXPLANATION = """## Languages
Below, we give a brief description of each language variety participating in the leaderboard.
Each language variety is identified by
an [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) (the first 3 letters) for the language,
an [ISO 15924](https://en.wikipedia.org/wiki/ISO_15924) code (the next 4 letters) for the writing system,
and optionally, a [Glottolog code](https://glottolog.org/) for the dialect.
The varieties with a secondary language code (Egyptian Arabic, Colloquial Malay) use code-switching,
i.e. the speakers switch between the two languages (a colloquial and a standardized variety)
depending on the context (e.g. the formality level).
For a fuller description of the languages and the codes used to represent them, please refer
to https://huggingface.co/datasets/facebook/bouquet#languages and the [BOUQuET paper](https://arxiv.org/abs/2502.04314).
"""

# Markdown describing the metric columns of the leaderboard.
# TODO: link the exact xCOMET checkpoint used for `xcomet_both`; the original
# text had an empty markdown link (`[]()`) here, which rendered as nothing.
METRICS_EXPLANATION = """## Metrics
1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. **Attention: lower is better!**
2. `xcomet_both`: xCOMET score based on both source and reference.
3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
"""

# Placeholder markdown for the (not yet published) system descriptions.
SYSTEMS_EXPLANATION = """## Systems
Descriptions of the implementation of the systems will come out later.
"""
def leaderboard_tab():
    """Build the "Leaderboard" tab of the BOUQuET translation leaderboard.

    Reads two tab-separated files relative to the working directory:

    * ``data/benchmark_stats.tsv`` -- one row per evaluated configuration; the
      columns ``system``, ``level``, ``src_lang``, ``tgt_lang`` and the four
      metric columns (optionally prefixed with ``score_``) are used.
    * ``data/language_metadata.tsv`` -- language descriptions; the columns
      ``ISO 639-3``, ``ISO 15924``, ``Glottocode``, ``Secondary ISO 639-3``,
      ``Language`` and ``Class`` are used.

    The tab contains a system ranking table (filterable by level and
    translation direction), a per-language bar plot, and explanatory markdown.

    NOTE(review): presumably called inside an active ``gr.Blocks`` context by
    code outside this view -- confirm against the caller.
    """
    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    stats.columns = [strip_colname(c) for c in stats.columns]

    metrics = ["metricx_both", "xcomet_both", "CHRFpp", "glotlid_ref"]
    systems = sorted(set(stats["system"]))
    levels = ["sentence_level", "paragraph_level"]

    # Sentinel dropdown values.
    ALL = "ALL"
    MEAN = "Average"
    BEST = "Best"
    XX2EN = "Everything-into-English"
    EN2XX = "English-into-Everything"

    # Which translation directions actually occur in the benchmark data.
    lang_src2tgt = defaultdict(set)
    lang_tgt2src = defaultdict(set)
    for src_lang, tgt_lang in stats[["src_lang", "tgt_lang"]].drop_duplicates().values:
        lang_src2tgt[src_lang].add(tgt_lang)
        lang_tgt2src[tgt_lang].add(src_lang)
    langs_src = set(lang_src2tgt)
    langs_tgt = set(lang_tgt2src)

    # Map language codes ("<iso639-3>_<iso15924>[_<glottocode>]") to names.
    langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
    lang2name = {}
    for _, row in langs_df.iterrows():
        code = row["ISO 639-3"] + "_" + row["ISO 15924"]
        glottocode = row["Glottocode"]
        # Missing cells are read as NaN (a float), hence the isinstance check.
        if isinstance(glottocode, str) and glottocode:
            code = code + "_" + glottocode
        lang2name[code] = row["Language"]
        secondary = row["Secondary ISO 639-3"]
        if isinstance(secondary, str) and secondary:
            # Register the same variety under its secondary language code:
            # the first three characters of the code are swapped out.
            lang2name[secondary + code[3:]] = row["Language"]

    # Sorted so that the diagnostic output is stable between runs.
    for lang in sorted(langs_src | langs_tgt):
        if lang not in lang2name:
            print(f"Name not found for {lang}")

    def named_langs(langs_list):
        """Turn codes into dropdown choices, labelled with names when known."""
        return [
            (f"{lang} — {lang2name[lang]}", lang) if lang in lang2name else lang
            for lang in langs_list
        ]

    with gr.Tab("Leaderboard"):
        gr.Markdown("# BOUQuET translation leaderboard")
        gr.Markdown(INTRO)

        gr.Markdown("## Systems ranking")
        # Inputs
        gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
        gr_src_lang = gr.Dropdown(
            [ALL] + named_langs(sorted(langs_src)), value=ALL, label="Source lang"
        )
        gr_tgt_lang = gr.Dropdown(
            [ALL] + named_langs(sorted(langs_tgt)), value=ALL, label="Target lang"
        )

        # Interactivity
        inputs = [gr_level, gr_src_lang, gr_tgt_lang]

        def get_lb(level, src_lang, tgt_lang):
            """Return per-system metric means for the selected level/direction.

            The result is sorted by ``metricx_both`` in ascending order and
            returned as a styled frame; when the selection matches no rows,
            the plain empty frame is returned instead.
            """
            mask = stats["level"].eq(level)
            if src_lang != ALL:
                mask = mask & stats["src_lang"].eq(src_lang)
            if tgt_lang != ALL:
                mask = mask & stats["tgt_lang"].eq(tgt_lang)
            means = (
                stats[mask]
                .groupby("system")[metrics]
                .mean()
                .reset_index()
                .sort_values("metricx_both")
            )
            if means.empty:
                # A colour gradient cannot be computed over zero rows, so
                # skip the styling rather than fail on an empty selection.
                return means
            return means.style.background_gradient().format(precision=4)

        df_all = get_lb(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(df_all)
        for inp in inputs:
            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)

        # Interdependency of the controls

        def restrict_choices(anchor, current, all_langs, reachable):
            """Limit one language dropdown to partners of the other one.

            ``anchor`` is the value of the dropdown that was just changed,
            ``current`` the value of the dropdown being updated. The current
            value is kept only if it is still offered; otherwise it falls
            back to ``ALL`` so the dropdown never holds a value outside its
            own choices.
            """
            allowed = all_langs if anchor == ALL else reachable.get(anchor, set())
            value = current if current == ALL or current in allowed else ALL
            return gr.update(choices=[ALL] + named_langs(sorted(allowed)), value=value)

        def src2tgt(src_lang, tgt_lang):
            """Update the target dropdown after the source language changed."""
            return restrict_choices(src_lang, tgt_lang, langs_tgt, lang_src2tgt)

        def tgt2src(src_lang, tgt_lang):
            """Update the source dropdown after the target language changed."""
            return restrict_choices(tgt_lang, src_lang, langs_src, lang_tgt2src)

        # NOTE(review): `.input` is used here rather than `.change`, presumably
        # so that a programmatic update of one dropdown does not re-trigger the
        # other one -- confirm against the Gradio event documentation.
        gr_src_lang.input(
            fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang
        )
        gr_tgt_lang.input(
            fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang
        )

        gr.Markdown("## Languages difficulty")
        gr_system = gr.Dropdown(
            [MEAN, BEST] + systems, value=MEAN, label="Translation system"
        )
        gr_direction = gr.Dropdown(
            [XX2EN, EN2XX], value=XX2EN, label="Translation direction"
        )
        gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
        gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
        bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]

        def get_hist(system, direction, metric, level):
            """Return a bar-plot update with one bar per non-English language.

            ``system`` is a concrete system name, ``MEAN`` (average over all
            systems) or ``BEST`` (best score over all systems). ``direction``
            decides whether English is the source or the target side.
            """
            # Decide on the data to process.
            if direction == EN2XX:
                english_col, lang_col = "src_lang", "tgt_lang"
            else:
                english_col, lang_col = "tgt_lang", "src_lang"
            mask = stats[english_col].eq("eng_Latn") & stats["level"].eq(level)
            if system not in (MEAN, BEST):
                mask = mask & stats["system"].eq(system)
            # Rows without a score for this metric cannot be ranked; dropping
            # them keeps idxmin/idxmax from yielding NaN labels for groups
            # that contain no valid score at all.
            subset = stats[mask].dropna(subset=[metric])

            # Compute the per-language values and the best system.
            grouped = subset.groupby(lang_col)[metric]
            lower_is_better = metric == "metricx_both"
            if lower_is_better:
                bests = grouped.min()
                best_sys = grouped.idxmin()
            else:
                bests = grouped.max()
                best_sys = grouped.idxmax()
            values = bests if system == BEST else grouped.mean()

            report = (
                pd.DataFrame(
                    {
                        metric: values,
                        # best_sys holds row labels of `subset`, in the same
                        # group order as `values`.
                        "best_system": subset.loc[best_sys]["system"].values,
                    }
                )
                .sort_values(metric, ascending=lower_is_better)
                .reset_index()
            )
            report["lang_name"] = [lang2name.get(lang, "") for lang in report[lang_col]]
            tooltip_columns = ["lang_name", "best_system"]
            return gr.update(
                value=report,
                x=lang_col,
                y=metric,
                x_label_angle=-90,
                height=500,
                sort="y",
                tooltip=tooltip_columns,
            )

        default_bar = get_hist(*[x.value for x in bar_controls])
        gr_barplot = gr.BarPlot(**default_bar)
        for inp in bar_controls:
            inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)

        gr.Markdown(METRICS_EXPLANATION)
        gr.Markdown(SYSTEMS_EXPLANATION)
        gr.Markdown(LANGS_EXPLANATION)
        gr.Dataframe(langs_df.drop(columns=["Class"]).style.format(na_rep=""))