Spaces:

facebook
/

bouquet

Running

bouquet / leaderboard.py

David Dale

add language names and best systems in the leaderboard; code linting

baeda9f 28 days ago

8.82 kB

	import csv
	from collections import defaultdict

	import gradio as gr
	import pandas as pd


	def strip_colname(x):
	    """Return column label *x* without a leading ``score_`` prefix.

	    Labels that do not start with ``score_`` are returned unchanged.
	    Non-string labels (pandas column labels may be ints or tuples) are
	    also passed through untouched instead of failing on ``startswith``.
	    """
	    prefix = "score_"
	    if isinstance(x, str) and x.startswith(prefix):
	        # Slice by the prefix length so the two can never drift apart.
	        return x[len(prefix):]
	    return x


	# Markdown shown right below the leaderboard title.
	INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.

	A smarter leaderboard and the code for reproducing the evaluations will be published soon!
	"""

	# Markdown shown above the language metadata table at the end of the tab.
	LANGS_EXPLANATION = """## Languages
	Below, we give a brief description of each language variety participating in the leaderboard.
	Each language variety is identified by
	an [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) (the first 3 letters) for the language,
	an [ISO 15924](https://en.wikipedia.org/wiki/ISO_15924) code (the next 4 letters) for the writing system,
	and optionally, a [Glottolog code](https://glottolog.org/) for the dialect.

	The varieties with a secondary language code (Egyptian Arabic, Colloquial Malay) use code-switching,
	i.e. the speakers switch between the two languages (a colloquial and a standardized variety)
	depending on the context (e.g. the formality level).

	For a fuller description of the languages and the codes used to represent them, please refer
	to https://huggingface.co/datasets/facebook/bouquet#languages and the [BOUQuET paper](https://arxiv.org/abs/2502.04314).
	"""

	# Markdown describing the metric columns used by the leaderboard.
	# NOTE(review): the `xcomet_both` entry used to carry an empty link (`[]()`),
	# which rendered as nothing. It now points at the COMET repository, which
	# distributes the XCOMET models; the exact checkpoint used for scoring is not
	# recorded in this file - confirm and link to it if desired.
	METRICS_EXPLANATION = """## Metrics
	1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. Attention: lower is better!
	2. `xcomet_both`: [XCOMET](https://github.com/Unbabel/COMET) score based on both source and reference.
	3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
	4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
	"""

	# Placeholder markdown for the systems section.
	SYSTEMS_EXPLANATION = """## Systems
	Descriptions of the implementation of the systems will come out later.
	"""


	def leaderboard_tab():
	    """Build the "Leaderboard" Gradio tab.

	    Reads ``data/benchmark_stats.tsv`` (one row per system / level / language
	    pair with metric columns) and ``data/language_metadata.tsv`` (language
	    descriptions), then creates inside a ``gr.Tab``:

	    * a systems-ranking table filterable by level, source and target language;
	    * a per-language bar plot for a chosen system, direction, metric and level;
	    * the explanatory markdown sections and the language metadata table.

	    Returns nothing; the components are registered as a side effect.
	    NOTE(review): presumably called from within a ``gr.Blocks`` context -
	    confirm against the caller.
	    """
	    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
	    stats.columns = [strip_colname(c) for c in stats.columns]

	    metrics = ["metricx_both", "xcomet_both", "CHRFpp", "glotlid_ref"]
	    systems = sorted(set(stats["system"]))
	    levels = ["sentence_level", "paragraph_level"]
	    # Sentinel values offered in the dropdowns next to real languages/systems.
	    ALL = "ALL"
	    MEAN = "Average"
	    BEST = "Best"
	    XX2EN = "Everything-into-English"
	    EN2XX = "English-into-Everything"

	    # Which targets exist for each source (and vice versa), so that each
	    # language dropdown can be narrowed to pairs present in the data.
	    lang_src2tgt = defaultdict(set)
	    lang_tgt2src = defaultdict(set)
	    langs_src = set()
	    langs_tgt = set()
	    for src_lang, tgt_lang in stats[["src_lang", "tgt_lang"]].drop_duplicates().values:
	        lang_src2tgt[src_lang].add(tgt_lang)
	        lang_tgt2src[tgt_lang].add(src_lang)
	        langs_src.add(src_lang)
	        langs_tgt.add(tgt_lang)

	    # Map language codes (language_script[_glottocode]) to readable names.
	    langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
	    lang2name = {}
	    for _, row in langs_df.iterrows():
	        code = row["ISO 639-3"] + "_" + row["ISO 15924"]
	        # pandas reads empty cells as NaN (a float), hence the isinstance check.
	        if isinstance(row["Glottocode"], str) and len(row["Glottocode"]) > 0:
	            code = code + "_" + row["Glottocode"]
	        lang2name[code] = row["Language"]

	        if isinstance(row["Secondary ISO 639-3"], str) and len(
	            row["Secondary ISO 639-3"]
	        ):
	            # Register the same variety under its secondary language code by
	            # swapping the 3-letter language prefix of the primary code.
	            code = row["Secondary ISO 639-3"] + code[3:]
	            lang2name[code] = row["Language"]
	    for lang in langs_src.union(langs_tgt):
	        if lang not in lang2name:
	            print(f"Name not found for {lang}")

	    def named_langs(langs_list):
	        """Turn codes into dropdown choices: ``(label, code)`` when a name is known, else the bare code."""
	        return [
	            (f"{lang} — {lang2name[lang]}", lang) if lang in lang2name else lang
	            for lang in langs_list
	        ]

	    with gr.Tab("Leaderboard"):
	        gr.Markdown("# BOUQuET translation leaderboard")
	        gr.Markdown(INTRO)

	        gr.Markdown("## Systems ranking")
	        # Inputs
	        gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
	        gr_src_lang = gr.Dropdown(
	            [ALL] + named_langs(sorted(langs_src)), value=ALL, label="Source lang"
	        )
	        gr_tgt_lang = gr.Dropdown(
	            [ALL] + named_langs(sorted(langs_tgt)), value=ALL, label="Target lang"
	        )

	        # Interactivity
	        inputs = [gr_level, gr_src_lang, gr_tgt_lang]

	        def get_lb(level, src_lang, tgt_lang):
	            """Return a styled table of per-system metric means for the selected filters.

	            ``ALL`` for a language disables that filter. Rows are sorted by
	            ``metricx_both`` in ascending order.
	            """
	            filtered = stats[stats["level"].eq(level)]
	            if src_lang != ALL:
	                filtered = filtered[filtered["src_lang"].eq(src_lang)]
	            if tgt_lang != ALL:
	                filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
	            means = (
	                filtered.groupby(["system"])[metrics]
	                .mean()
	                .reset_index()
	                .sort_values("metricx_both")
	            )
	            means.columns = [strip_colname(c) for c in means.columns]
	            styler = means.style.background_gradient().format(precision=4)
	            return styler

	        df_all = get_lb(*[inp.value for inp in inputs])
	        gr_df = gr.Dataframe(df_all)

	        for inp in inputs:
	            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)

	        # Interdependency of the controls
	        def src2tgt(src_lang, tgt_lang):
	            """Limit the target choices to those paired with ``src_lang``, keeping the current target."""
	            if src_lang == ALL:
	                choices = [ALL] + named_langs(sorted(langs_tgt))
	            else:
	                choices = [ALL] + named_langs(sorted(lang_src2tgt[src_lang]))

	            return gr.update(choices=choices, value=tgt_lang)

	        def tgt2src(src_lang, tgt_lang):
	            """Limit the source choices to those paired with ``tgt_lang``, keeping the current source."""
	            if tgt_lang == ALL:
	                choices = [ALL] + named_langs(sorted(langs_src))
	            else:
	                choices = [ALL] + named_langs(sorted(lang_tgt2src[tgt_lang]))
	            return gr.update(choices=choices, value=src_lang)

	        gr_src_lang.input(
	            fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang
	        )
	        gr_tgt_lang.input(
	            fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang
	        )

	        gr.Markdown("## Languages difficulty")
	        gr_system = gr.Dropdown(
	            [MEAN, BEST] + systems, value=MEAN, label="Translation system"
	        )
	        gr_direction = gr.Dropdown(
	            [XX2EN, EN2XX], value=XX2EN, label="Translation direction"
	        )
	        gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
	        gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
	        bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]

	        def get_hist(system, direction, metric, level):
	            """Return the bar-plot update with one bar per non-English language.

	            The bar height is the mean of ``metric`` over the selected system(s),
	            or the best value when ``system`` is ``BEST``. The name of the
	            best-scoring system per language is added for the tooltip.
	            """
	            # decide on the data to process
	            if direction == EN2XX:
	                direction_filter = stats["src_lang"].eq("eng_Latn")
	                lang_col = "tgt_lang"
	            else:
	                direction_filter = stats["tgt_lang"].eq("eng_Latn")
	                lang_col = "src_lang"
	            if system in (MEAN, BEST):
	                # Truthiness of the system name is used as a keep-(nearly)-all mask.
	                system_filter = stats["system"].astype(bool)
	            else:
	                system_filter = stats["system"].eq(system)
	            subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
	            # Rows without a score for this metric cannot contribute to a mean
	            # or be "best". Dropping them keeps idxmin/idxmax from yielding a
	            # NaN label for an all-NaN language, which would make the `.loc`
	            # lookup below fail with a KeyError.
	            subset = subset.dropna(subset=[metric])

	            # Compute the means and update the plot
	            grouped = subset.groupby(lang_col)[metric]
	            if metric == "metricx_both":
	                # metricx is the only metric where lower is better.
	                bests = grouped.min()
	                best_sys = grouped.idxmin()
	            else:
	                bests = grouped.max()
	                best_sys = grouped.idxmax()
	            if system == BEST:
	                means = bests
	            else:
	                means = grouped.mean()
	            report = (
	                pd.DataFrame(
	                    {
	                        metric: means,
	                        # Positional match: both come from the same groupby,
	                        # hence share the same group order.
	                        "best_system": subset.loc[best_sys]["system"].values,
	                    }
	                )
	                .sort_values(metric, ascending=(metric == "metricx_both"))
	                .reset_index()
	            )
	            report["lang_name"] = [lang2name.get(lang, "") for lang in report[lang_col]]
	            tooltip_columns = ["lang_name", "best_system"]

	            return gr.update(
	                value=report,
	                x=lang_col,
	                y=metric,
	                x_label_angle=-90,
	                height=500,
	                sort="y",
	                tooltip=tooltip_columns,
	            )

	        # The same update payload is used as the initial plot configuration.
	        default_bar = get_hist(*[x.value for x in bar_controls])
	        gr_barplot = gr.BarPlot(**default_bar)

	        for inp in bar_controls:
	            inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)

	        gr.Markdown(METRICS_EXPLANATION)
	        gr.Markdown(SYSTEMS_EXPLANATION)
	        gr.Markdown(LANGS_EXPLANATION)
	        gr.Dataframe(langs_df.drop(columns=["Class"]).style.format(na_rep=""))