# bouquet / data_samples.py
# David Dale
# Add a data browser
# 8d96c36
import datasets
import pandas as pd
import gradio as gr
import csv
from collections import defaultdict
import random
# Introductory blurb describing the data table. It is not referenced by the
# code visible in this file -- presumably rendered elsewhere (TODO confirm).
INTRO = """
The table below demonstrates a sample paragraph from the dev split of BOUQuET.
"""
# Sentinel entry of the "Domain" dropdown meaning "do not filter by domain".
ALL = "All"
def data_browse_tab():
    """Build the "Data samples" tab of the BOUQuET data browser.

    Loads the ``dev`` split of the ``sentence_level`` configuration of
    ``facebook/bouquet``, splits it by source language, and creates a Gradio
    tab with:

    * dropdowns for the source language, target language, domain and
      paragraph ID;
    * a button that samples a random paragraph (within the selected domain);
    * a table showing the sentences of the selected paragraph in both
      languages.

    The function creates Gradio components as a side effect and returns
    nothing. NOTE(review): it presumably has to be called inside an active
    ``gr.Blocks`` context -- confirm against the caller.
    """
    # Load the data
    ds = datasets.load_dataset("facebook/bouquet", "sentence_level", split="dev")
    long_df = ds.to_pandas()

    # One frame per source language, re-indexed from 0 so that rows of
    # different languages can be matched by position (see build_table).
    lang2df = {
        lang: part.drop(columns=["tgt_text", "tgt_lang"]).reset_index(drop=True)
        for lang, part in long_df.groupby("src_lang")
    }
    eng_df = lang2df["eng_Latn"]
    langs = sorted(lang2df.keys())
    domains = sorted(set(eng_df["domain"]))
    paragraph_ids = sorted(set(eng_df["par_id"]))
    domain2par_ids = {
        domain: sorted(set(group["par_id"]))
        for domain, group in eng_df.groupby("domain")
    }

    def par_ids_for(domain):
        """Return the paragraph IDs selectable for ``domain``.

        ``ALL`` (or any unknown / empty domain value) means "no filter".
        """
        if domain == ALL:
            return paragraph_ids
        return domain2par_ids.get(domain, paragraph_ids)

    def build_table(src_lang, tgt_lang, par_id):
        """Return a DataFrame with the sentences of one paragraph.

        ``src_text`` comes from ``src_lang`` and ``tgt_text`` from the
        ``src_text`` column of ``tgt_lang``.
        """
        src_df = lang2df[src_lang]
        tgt_df = lang2df[tgt_lang]
        par = src_df[src_df["par_id"].eq(par_id)].copy()
        # Column assignment aligns on the (reset) row index.
        # NOTE(review): this assumes every language has the same sentences in
        # the same order -- confirm against the dataset.
        par["tgt_text"] = tgt_df["src_text"]
        # TODO: add 'par_comment' in a text field below
        return par[
            ["domain", "uniq_id", "orig_text", "src_text", "tgt_text", "tags", "register"]
        ]

    def select_data(src_lang, tgt_lang, par_id):
        """Event callback: refresh the table for the current selection."""
        return gr.update(value=build_table(src_lang, tgt_lang, par_id), wrap=True)

    def sample_par_id(domain):
        """Event callback: pick a random paragraph within ``domain``.

        Sampling respects the domain filter so that the new value is always
        one of the current choices of the paragraph dropdown.
        """
        return random.choice(par_ids_for(domain))

    def change_domain(domain, par_id):
        """Event callback: restrict the paragraph choices to ``domain``.

        The current paragraph is kept when it belongs to the new domain;
        otherwise a random paragraph of that domain is selected.
        """
        par_ids = par_ids_for(domain)
        if par_id not in par_ids:
            par_id = random.choice(par_ids)
        return gr.Dropdown(choices=par_ids, value=par_id)

    with gr.Tab("Data samples"):
        gr.Markdown("# BOUQuET data browser")

        # Define the controls
        with gr.Row():
            gr_src_lang = gr.Dropdown(langs, label="Source lang", value=random.choice(langs))
            gr_tgt_lang = gr.Dropdown(langs, label="Target lang", value=random.choice(langs))
            gr_domain = gr.Dropdown([ALL] + domains, label="Domain", value=ALL)
            gr_par_id = gr.Dropdown(
                paragraph_ids,
                label="Paragraph ID",
                value=random.choice(paragraph_ids),
            )
        inputs = [gr_src_lang, gr_tgt_lang, gr_par_id]

        gr_sample_btn = gr.Button(value="Sample a paragraph")
        gr_sample_btn.click(fn=sample_par_id, inputs=gr_domain, outputs=gr_par_id)

        # Define the data. The initial value must be the table itself, not a
        # gr.update(...) payload, which is only meaningful as a callback result.
        gr_df = gr.Dataframe(
            build_table(*[inp.value for inp in inputs]),
            wrap=True,
            show_fullscreen_button=True,
            column_widths=["10%", "5%", "20%", "20%", "20%", "15%", "6%"],
            elem_classes=["small-font"],
        )

        # Interactivity: any change of language or paragraph refreshes the table.
        for inp in inputs:
            inp.change(fn=select_data, inputs=inputs, outputs=gr_df)

        gr_domain.change(
            fn=change_domain,
            inputs=[gr_domain, gr_par_id],
            outputs=[gr_par_id],
        )