Spaces:
Running
Running
| import datasets | |
| import pandas as pd | |
| import gradio as gr | |
| import csv | |
| from collections import defaultdict | |
| import random | |
# Introductory blurb for the sample table (not referenced in this section of the file).
INTRO = (
    "\n"
    "The table below demonstrates a sample paragraph from the dev split of BOUQuET."
    "\n"
)
# Label of the extra "Domain" dropdown entry that means "do not filter by domain".
ALL = "All"
def data_browse_tab() -> None:
    """Build the "Data samples" tab for browsing BOUQuET paragraphs.

    Loads the ``sentence_level`` config of ``facebook/bouquet`` (``dev`` split),
    splits it into one frame per source language, and creates the Gradio
    controls (source/target language, domain, paragraph ID, sample button)
    together with a table showing the selected paragraph in both languages.

    NOTE(review): presumably called inside an enclosing ``gr.Blocks()``
    context -- confirm against the caller.
    """
    # Load the data
    ds = datasets.load_dataset("facebook/bouquet", "sentence_level", split="dev")
    long_df = ds.to_pandas()

    # One frame per source language. The target columns are dropped and the
    # index is reset so that every frame is indexed 0..n-1.
    lang2df = {
        lang: part.drop(columns=["tgt_text", "tgt_lang"]).reset_index(drop=True)
        for lang, part in long_df.groupby("src_lang")
    }
    # The English frame is the reference for the domain / paragraph inventory.
    eng_df = lang2df["eng_Latn"]
    langs = sorted(lang2df)
    domains = sorted(set(eng_df["domain"]))
    paragraph_ids = sorted(set(eng_df["par_id"]))
    domain2par_ids = {
        domain: sorted(set(group["par_id"]))
        for domain, group in eng_df.groupby("domain")
    }

    # Columns shown in the table, in display order (matches `column_widths`).
    shown_columns = [
        "domain", "uniq_id", "orig_text", "src_text", "tgt_text", "tags", "register",
    ]

    def par_ids_for(domain):
        """Return the paragraph IDs selectable for `domain` (all IDs for ALL)."""
        return paragraph_ids if domain == ALL else domain2par_ids[domain]

    def select_frame(src_lang, tgt_lang, par_id):
        """Return the rows of paragraph `par_id` as a plain DataFrame.

        The target text is taken from the `src_text` column of the target
        language's frame.

        NOTE(review): the pairing is by row position (all frames have a reset
        index), so it assumes every language frame lists the same sentences
        in the same order -- confirm against the dataset.
        """
        src_df = lang2df[src_lang]
        # Filter first, then attach the target column: the assignment aligns
        # on the index, so the result equals filtering a fully populated copy
        # while copying only the rows of one paragraph.
        par = src_df[src_df["par_id"].eq(par_id)].copy()
        par["tgt_text"] = lang2df[tgt_lang]["src_text"]
        # TODO: add 'par_comment' in a text field below
        return par[shown_columns]

    def select_data(src_lang, tgt_lang, par_id):
        """Event handler: refresh the table for the current selection."""
        return gr.update(value=select_frame(src_lang, tgt_lang, par_id), wrap=True)

    def sample_par_id(domain):
        """Event handler: pick a random paragraph ID within the chosen domain.

        Sampling from the domain's own IDs keeps the sampled value inside the
        current choices of the "Paragraph ID" dropdown.
        """
        return random.choice(par_ids_for(domain))

    def change_domain(domain, par_id):
        """Event handler: restrict the paragraph choices to `domain`.

        The current paragraph is kept when it belongs to the domain;
        otherwise a random paragraph of that domain is selected.
        """
        par_ids = par_ids_for(domain)
        if par_id not in par_ids:
            par_id = random.choice(par_ids)
        print(f"par_id: {par_id} is one of {par_ids}")
        return gr.Dropdown(choices=par_ids, value=par_id)

    with gr.Tab("Data samples"):
        gr.Markdown("# BOUQuET data browser")

        # Define the controls
        with gr.Row():
            gr_src_lang = gr.Dropdown(langs, label="Source lang", value=random.choice(langs))
            gr_tgt_lang = gr.Dropdown(langs, label="Target lang", value=random.choice(langs))
            gr_domain = gr.Dropdown([ALL] + domains, label="Domain", value=ALL)
            gr_par_id = gr.Dropdown(paragraph_ids, label="Paragraph ID", value=random.choice(paragraph_ids))
        inputs = [gr_src_lang, gr_tgt_lang, gr_par_id]
        gr_sample_btn = gr.Button(value="Sample a paragraph")
        gr_sample_btn.click(fn=sample_par_id, inputs=gr_domain, outputs=gr_par_id)

        # Define the data
        # The initial value is the DataFrame itself; update payloads are only
        # returned from event handlers.
        df_all = select_frame(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(
            df_all,
            wrap=True,
            show_fullscreen_button=True,
            column_widths=["10%", "5%", "20%", "20%", "20%", "15%", "6%"],
            elem_classes=["small-font"],
        )

        # Interactivity
        # Any change of language or paragraph re-renders the table.
        for inp in inputs:
            inp.change(fn=select_data, inputs=inputs, outputs=gr_df)
        gr_domain.change(fn=change_domain, inputs=[gr_domain, gr_par_id], outputs=[gr_par_id])