He Yingxu
committed on
Commit
·
c6dfdb5
1
Parent(s):
742e9d0
add meralion2
Browse files
- app.py +4 -0
- app/content.py +241 -41
- app/draw_diagram.py +2 -32
- app/pages.py +57 -193
- app/summarization.py +2 -2
- model_information.py +24 -1
- results_organized/bleu/st.csv +14 -9
- results_organized/llama3_70b_judge/accent_recognition.csv +11 -6
- results_organized/llama3_70b_judge/audio_captioning.csv +9 -4
- results_organized/llama3_70b_judge/audio_scene_question_answering.csv +9 -4
- results_organized/llama3_70b_judge/emotion_recognition.csv +9 -4
- results_organized/llama3_70b_judge/gender_recognition.csv +17 -12
- results_organized/llama3_70b_judge/music_understanding.csv +9 -4
- results_organized/llama3_70b_judge/sds_singlish.csv +10 -5
- results_organized/llama3_70b_judge/speech_instruction.csv +11 -6
- results_organized/llama3_70b_judge/sqa_english.csv +15 -12
- results_organized/llama3_70b_judge/sqa_singlish.csv +11 -6
- results_organized/llama3_70b_judge/under_development_llama3_70b_judge.csv +2 -2
- results_organized/meteor/audio_captioning.csv +7 -7
- results_organized/wer/asr_english.csv +15 -9
- results_organized/wer/asr_mandarin.csv +18 -12
- results_organized/wer/asr_private.csv +12 -0
- results_organized/wer/asr_sea.csv +12 -0
- results_organized/wer/asr_singlish.csv +15 -9
- results_organized/wer/under_development_wer.csv +14 -14
app.py
CHANGED
|
@@ -19,6 +19,8 @@ pages = {
|
|
| 19 |
'ASR-English' : asr_english,
|
| 20 |
'ASR-Mandarin' : asr_mandarin,
|
| 21 |
'ASR-Singlish' : asr_singlish,
|
|
|
|
|
|
|
| 22 |
'Speech Translation' : speech_translation,
|
| 23 |
'SQA-English' : speech_question_answering_english,
|
| 24 |
'SQA-Singlish' : speech_question_answering_singlish,
|
|
@@ -47,6 +49,8 @@ menu_items = [
|
|
| 47 |
sac.MenuItem(label='ASR-English', icon='mic'),
|
| 48 |
sac.MenuItem(label='ASR-Mandarin', icon='mic'),
|
| 49 |
sac.MenuItem(label='ASR-Singlish', icon='mic'),
|
|
|
|
|
|
|
| 50 |
]
|
| 51 |
),
|
| 52 |
|
|
|
|
| 19 |
'ASR-English' : asr_english,
|
| 20 |
'ASR-Mandarin' : asr_mandarin,
|
| 21 |
'ASR-Singlish' : asr_singlish,
|
| 22 |
+
'ASR-SEA' : asr_sea,
|
| 23 |
+
'ASR-Private' : asr_private,
|
| 24 |
'Speech Translation' : speech_translation,
|
| 25 |
'SQA-English' : speech_question_answering_english,
|
| 26 |
'SQA-Singlish' : speech_question_answering_singlish,
|
|
|
|
| 49 |
sac.MenuItem(label='ASR-English', icon='mic'),
|
| 50 |
sac.MenuItem(label='ASR-Mandarin', icon='mic'),
|
| 51 |
sac.MenuItem(label='ASR-Singlish', icon='mic'),
|
| 52 |
+
sac.MenuItem(label='ASR-SEA', icon='mic'),
|
| 53 |
+
sac.MenuItem(label='ASR-Private', icon='mic'),
|
| 54 |
]
|
| 55 |
),
|
| 56 |
|
app/content.py
CHANGED
|
@@ -1,5 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
'LibriSpeech-Clean' : 'librispeech_test_clean',
|
| 4 |
'LibriSpeech-Other' : 'librispeech_test_other',
|
| 5 |
'CommonVoice-15-EN' : 'common_voice_15_en_test',
|
|
@@ -9,65 +161,102 @@ displayname2datasetname = {
|
|
| 9 |
'Earnings-22' : 'earnings22_test',
|
| 10 |
'TED-LIUM-3' : 'tedlium3_test',
|
| 11 |
'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
'CoVoST2-EN-ID' : 'covost2_en_id_test',
|
| 14 |
'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
|
| 15 |
'CoVoST2-EN-TA' : 'covost2_en_ta_test',
|
| 16 |
'CoVoST2-ID-EN' : 'covost2_id_en_test',
|
| 17 |
'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
|
| 18 |
'CoVoST2-TA-EN' : 'covost2_ta_en_test',
|
|
|
|
| 19 |
'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
|
| 20 |
'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
|
| 21 |
'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
|
| 22 |
'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
|
| 23 |
'Spoken-SQuAD' : 'spoken_squad_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
'OpenHermes-Audio' : 'openhermes_audio_test',
|
| 25 |
'ALPACA-Audio' : 'alpaca_audio_test',
|
|
|
|
| 26 |
'WavCaps' : 'wavcaps_test',
|
| 27 |
'AudioCaps' : 'audiocaps_test',
|
|
|
|
| 28 |
'Clotho-AQA' : 'clotho_aqa_test',
|
| 29 |
'WavCaps-QA' : 'wavcaps_qa_test',
|
| 30 |
'AudioCaps-QA' : 'audiocaps_qa_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
'VoxCeleb-Accent' : 'voxceleb_accent_test',
|
| 32 |
'MNSC-AR-Sentence' : 'imda_ar_sentence',
|
| 33 |
'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
|
|
|
|
| 34 |
'VoxCeleb-Gender' : 'voxceleb_gender_test',
|
| 35 |
'IEMOCAP-Gender' : 'iemocap_gender_test',
|
| 36 |
-
|
| 37 |
-
'MELD-Sentiment' : 'meld_sentiment_test',
|
| 38 |
-
'MELD-Emotion' : 'meld_emotion_test',
|
| 39 |
'MuChoMusic' : 'muchomusic_test',
|
| 40 |
-
'MNSC-PART1-ASR' : 'imda_part1_asr_test',
|
| 41 |
-
'MNSC-PART2-ASR' : 'imda_part2_asr_test',
|
| 42 |
-
'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
|
| 43 |
-
'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
|
| 44 |
-
'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
|
| 45 |
-
'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
|
| 46 |
-
'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
|
| 47 |
-
'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
|
| 48 |
-
'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
|
| 49 |
-
'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
|
| 50 |
-
'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
|
| 51 |
-
'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
|
| 52 |
-
'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
|
| 53 |
-
'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
|
| 54 |
|
| 55 |
-
'
|
| 56 |
-
'
|
| 57 |
-
'
|
| 58 |
-
'UKUS-News' : 'ukusnews_test',
|
| 59 |
-
'Mediacorp' : 'mediacorp_test',
|
| 60 |
-
'IDPC-Short' : 'idpc_short_test',
|
| 61 |
-
'Parliament-Short': 'parliament_short_test',
|
| 62 |
-
'UKUS-News-Short' : 'ukusnews_short_test',
|
| 63 |
-
'Mediacorp-Short' : 'mediacorp_short_test',
|
| 64 |
-
|
| 65 |
-
'YouTube ASR: English Singapore Content': 'ytb_asr_batch1',
|
| 66 |
-
'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
|
| 67 |
-
'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_malay',
|
| 68 |
-
'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
|
| 69 |
-
'YouTube ASR: Chinese with English Prompt': 'ytb_asr_batch3_chinese',
|
| 70 |
-
'YouTube ASR: Chinese with Chinese Prompt': 'ytb_asr_batch3_zh_zh_prompt',
|
| 71 |
|
| 72 |
'YouTube SQA: Malay': 'ytb_sqa_batch3_malay',
|
| 73 |
'YouTube SQA: Chinese': 'ytb_sqa_batch3_chinese',
|
|
@@ -76,15 +265,13 @@ displayname2datasetname = {
|
|
| 76 |
'YouTube SDS: Malay': 'ytb_sds_batch3_malay',
|
| 77 |
'YouTube SDS: Chinese': 'ytb_sds_batch3_chinese',
|
| 78 |
'YouTube SDS: Tamil': 'ytb_sds_batch3_tamil',
|
| 79 |
-
|
| 80 |
-
'SEAME-Dev-Mandarin' : 'seame_dev_man',
|
| 81 |
-
'SEAME-Dev-Singlish' : 'seame_dev_sge',
|
| 82 |
|
| 83 |
-
'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
|
| 84 |
-
'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
|
| 85 |
-
'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
|
| 86 |
|
| 87 |
-
}
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
|
| 90 |
|
|
@@ -152,6 +339,19 @@ dataset_diaplay_information = {
|
|
| 152 |
'Parliament-Short': 'Under Development',
|
| 153 |
'UKUS-News-Short' : 'Under Development',
|
| 154 |
'Mediacorp-Short' : 'Under Development',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
'YouTube ASR: English Singapore Content' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 2.5 hours of audio, with individual clips ranging from 2 seconds to 30 seconds in length.',
|
| 157 |
|
|
|
|
| 1 |
+
asr_english_datasets = [
|
| 2 |
+
'LibriSpeech-Clean',
|
| 3 |
+
'LibriSpeech-Other',
|
| 4 |
+
'CommonVoice-15-EN',
|
| 5 |
+
'Peoples-Speech',
|
| 6 |
+
'GigaSpeech-1',
|
| 7 |
+
'Earnings-21',
|
| 8 |
+
'Earnings-22',
|
| 9 |
+
'TED-LIUM-3',
|
| 10 |
+
'TED-LIUM-3-LongForm',
|
| 11 |
+
]
|
| 12 |
|
| 13 |
+
|
| 14 |
+
asr_singlish_datasets = [
|
| 15 |
+
'MNSC-PART1-ASR',
|
| 16 |
+
'MNSC-PART2-ASR',
|
| 17 |
+
'MNSC-PART3-ASR',
|
| 18 |
+
'MNSC-PART4-ASR',
|
| 19 |
+
'MNSC-PART5-ASR',
|
| 20 |
+
'MNSC-PART6-ASR',
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
asr_mandarin_datasets = [
|
| 25 |
+
'AISHELL-ASR-ZH',
|
| 26 |
+
'CommonVoice-ZH'
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
asr_sea_datasets = [
|
| 31 |
+
'CommonVoice-17-Indonesian',
|
| 32 |
+
'CommonVoice-17-Tamil',
|
| 33 |
+
# 'CommonVoice-17-Thai',
|
| 34 |
+
'CommonVoice-17-Vietnamese',
|
| 35 |
+
'GigaSpeech-2-Indonesain',
|
| 36 |
+
'GigaSpeech-2-Thai',
|
| 37 |
+
'GigaSpeech-2-Vietnamese',
|
| 38 |
+
'Fleurs-Tamil',
|
| 39 |
+
'Lotus-Thai'
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
asr_private_datasets = [
|
| 44 |
+
'CNA',
|
| 45 |
+
'IDPC',
|
| 46 |
+
'Parliament',
|
| 47 |
+
'UKUS-News',
|
| 48 |
+
'Mediacorp',
|
| 49 |
+
'IDPC-Short',
|
| 50 |
+
'Parliament-Short',
|
| 51 |
+
'UKUS-News-Short',
|
| 52 |
+
'Mediacorp-Short',
|
| 53 |
+
'YouTube ASR: English Singapore Content',
|
| 54 |
+
'YouTube ASR: English with Strong Emotion',
|
| 55 |
+
'YouTube ASR: Malay with English Prompt',
|
| 56 |
+
'YouTube ASR: Chinese with English Prompt',
|
| 57 |
+
'YouTube ASR: Tamil with English Prompt'
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
speech_translation_datasets = [
|
| 62 |
+
'CoVoST2-EN-ID',
|
| 63 |
+
'CoVoST2-EN-ZH',
|
| 64 |
+
'CoVoST2-EN-TA',
|
| 65 |
+
'CoVoST2-ID-EN',
|
| 66 |
+
'CoVoST2-ZH-EN',
|
| 67 |
+
'CoVoST2-TA-EN'
|
| 68 |
+
]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
speech_qa_english_datasets = [
|
| 72 |
+
'CN-College-Listen-MCQ',
|
| 73 |
+
'DREAM-TTS-MCQ',
|
| 74 |
+
'SLUE-P2-SQA5',
|
| 75 |
+
'Public-SG-Speech-QA',
|
| 76 |
+
'Spoken-SQuAD',
|
| 77 |
+
'MMAU-mini'
|
| 78 |
+
]
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
speech_qa_singlish_datasets = [
|
| 82 |
+
'MNSC-PART3-SQA',
|
| 83 |
+
'MNSC-PART4-SQA',
|
| 84 |
+
'MNSC-PART5-SQA',
|
| 85 |
+
'MNSC-PART6-SQA',
|
| 86 |
+
]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
sds_datasets = [
|
| 90 |
+
'MNSC-PART3-SDS',
|
| 91 |
+
'MNSC-PART4-SDS',
|
| 92 |
+
'MNSC-PART5-SDS',
|
| 93 |
+
'MNSC-PART6-SDS',
|
| 94 |
+
]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
si_datasets = [
|
| 98 |
+
'OpenHermes-Audio',
|
| 99 |
+
'ALPACA-Audio',
|
| 100 |
+
]
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
ac_datasets = [
|
| 104 |
+
'WavCaps',
|
| 105 |
+
'AudioCaps',
|
| 106 |
+
]
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
asqa_datasets = [
|
| 110 |
+
'Clotho-AQA',
|
| 111 |
+
'WavCaps-QA',
|
| 112 |
+
'AudioCaps-QA'
|
| 113 |
+
]
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
er_datasets = [
|
| 117 |
+
'IEMOCAP-Emotion',
|
| 118 |
+
'MELD-Sentiment',
|
| 119 |
+
'MELD-Emotion',
|
| 120 |
+
]
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
ar_datasets = [
|
| 124 |
+
'VoxCeleb-Accent',
|
| 125 |
+
'MNSC-AR-Sentence',
|
| 126 |
+
'MNSC-AR-Dialogue',
|
| 127 |
+
]
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
gr_datasets = [
|
| 131 |
+
'VoxCeleb-Gender',
|
| 132 |
+
'IEMOCAP-Gender'
|
| 133 |
+
]
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
music_datasets = ['MuChoMusic']
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
wer_development_datasets = [
|
| 140 |
+
'YouTube ASR: Malay with Malay Prompt',
|
| 141 |
+
'YouTube ASR: Chinese with Chinese Prompt',
|
| 142 |
+
'SEAME-Dev-Mandarin',
|
| 143 |
+
'SEAME-Dev-Singlish',
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
non_wer_development_datasets = [
|
| 148 |
+
'YouTube SQA: English with Singapore Content',
|
| 149 |
+
'YouTube SDS: English with Singapore Content',
|
| 150 |
+
'YouTube PQA: English with Singapore Content',
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
wer_displayname2datasetname = {
|
| 155 |
'LibriSpeech-Clean' : 'librispeech_test_clean',
|
| 156 |
'LibriSpeech-Other' : 'librispeech_test_other',
|
| 157 |
'CommonVoice-15-EN' : 'common_voice_15_en_test',
|
|
|
|
| 161 |
'Earnings-22' : 'earnings22_test',
|
| 162 |
'TED-LIUM-3' : 'tedlium3_test',
|
| 163 |
'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
|
| 164 |
+
|
| 165 |
+
'MNSC-PART1-ASR' : 'imda_part1_asr_test',
|
| 166 |
+
'MNSC-PART2-ASR' : 'imda_part2_asr_test',
|
| 167 |
+
'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
|
| 168 |
+
'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
|
| 169 |
+
'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
|
| 170 |
+
'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
|
| 171 |
+
|
| 172 |
'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
|
| 173 |
+
'CommonVoice-ZH' : 'commonvoice_zh_asr',
|
| 174 |
+
|
| 175 |
+
'CommonVoice-17-Indonesian' : 'commonvoice_17_id_asr',
|
| 176 |
+
'CommonVoice-17-Tamil' : 'commonvoice_17_ta_asr',
|
| 177 |
+
'CommonVoice-17-Thai' : 'commonvoice_17_th_asr',
|
| 178 |
+
'CommonVoice-17-Vietnamese' : 'commonvoice_17_vi_asr',
|
| 179 |
+
'GigaSpeech-2-Indonesain' : 'gigaspeech2_id_test',
|
| 180 |
+
'GigaSpeech-2-Thai' : 'gigaspeech2_th_test',
|
| 181 |
+
'GigaSpeech-2-Vietnamese' : 'gigaspeech2_vi_test',
|
| 182 |
+
'Fleurs-Tamil' : 'fleurs_tamil_ta_30_asr',
|
| 183 |
+
'Lotus-Thai' : 'lotus_thai_th_30_asr',
|
| 184 |
+
|
| 185 |
+
'CNA' : 'cna_test',
|
| 186 |
+
'IDPC' : 'idpc_test',
|
| 187 |
+
'Parliament' : 'parliament_test',
|
| 188 |
+
'UKUS-News' : 'ukusnews_test',
|
| 189 |
+
'Mediacorp' : 'mediacorp_test',
|
| 190 |
+
'IDPC-Short' : 'idpc_short_test',
|
| 191 |
+
'Parliament-Short': 'parliament_short_test',
|
| 192 |
+
'UKUS-News-Short' : 'ukusnews_short_test',
|
| 193 |
+
'Mediacorp-Short' : 'mediacorp_short_test',
|
| 194 |
+
|
| 195 |
+
'YouTube ASR: English Singapore Content': 'ytb_asr_batch1',
|
| 196 |
+
'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
|
| 197 |
+
'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_malay',
|
| 198 |
+
'YouTube ASR: Chinese with English Prompt': 'ytb_asr_batch3_chinese',
|
| 199 |
+
'YouTube ASR: Tamil with English Prompt': 'ytb_asr_batch3_tamil',
|
| 200 |
+
|
| 201 |
+
'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
|
| 202 |
+
'YouTube ASR: Chinese with Chinese Prompt': 'ytb_asr_batch3_zh_zh_prompt',
|
| 203 |
+
|
| 204 |
+
'SEAME-Dev-Mandarin' : 'seame_dev_man',
|
| 205 |
+
'SEAME-Dev-Singlish' : 'seame_dev_sge',
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
non_wer_displayname2datasetname = {
|
| 210 |
'CoVoST2-EN-ID' : 'covost2_en_id_test',
|
| 211 |
'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
|
| 212 |
'CoVoST2-EN-TA' : 'covost2_en_ta_test',
|
| 213 |
'CoVoST2-ID-EN' : 'covost2_id_en_test',
|
| 214 |
'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
|
| 215 |
'CoVoST2-TA-EN' : 'covost2_ta_en_test',
|
| 216 |
+
|
| 217 |
'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
|
| 218 |
'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
|
| 219 |
'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
|
| 220 |
'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
|
| 221 |
'Spoken-SQuAD' : 'spoken_squad_test',
|
| 222 |
+
'MMAU-mini' : 'mmau_mini',
|
| 223 |
+
|
| 224 |
+
'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
|
| 225 |
+
'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
|
| 226 |
+
'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
|
| 227 |
+
'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
|
| 228 |
+
|
| 229 |
+
'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
|
| 230 |
+
'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
|
| 231 |
+
'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
|
| 232 |
+
'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
|
| 233 |
+
|
| 234 |
'OpenHermes-Audio' : 'openhermes_audio_test',
|
| 235 |
'ALPACA-Audio' : 'alpaca_audio_test',
|
| 236 |
+
|
| 237 |
'WavCaps' : 'wavcaps_test',
|
| 238 |
'AudioCaps' : 'audiocaps_test',
|
| 239 |
+
|
| 240 |
'Clotho-AQA' : 'clotho_aqa_test',
|
| 241 |
'WavCaps-QA' : 'wavcaps_qa_test',
|
| 242 |
'AudioCaps-QA' : 'audiocaps_qa_test',
|
| 243 |
+
|
| 244 |
+
'IEMOCAP-Emotion' : 'iemocap_emotion_test',
|
| 245 |
+
'MELD-Sentiment' : 'meld_sentiment_test',
|
| 246 |
+
'MELD-Emotion' : 'meld_emotion_test',
|
| 247 |
+
|
| 248 |
'VoxCeleb-Accent' : 'voxceleb_accent_test',
|
| 249 |
'MNSC-AR-Sentence' : 'imda_ar_sentence',
|
| 250 |
'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
|
| 251 |
+
|
| 252 |
'VoxCeleb-Gender' : 'voxceleb_gender_test',
|
| 253 |
'IEMOCAP-Gender' : 'iemocap_gender_test',
|
| 254 |
+
|
|
|
|
|
|
|
| 255 |
'MuChoMusic' : 'muchomusic_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
+
'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
|
| 258 |
+
'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
|
| 259 |
+
'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
'YouTube SQA: Malay': 'ytb_sqa_batch3_malay',
|
| 262 |
'YouTube SQA: Chinese': 'ytb_sqa_batch3_chinese',
|
|
|
|
| 265 |
'YouTube SDS: Malay': 'ytb_sds_batch3_malay',
|
| 266 |
'YouTube SDS: Chinese': 'ytb_sds_batch3_chinese',
|
| 267 |
'YouTube SDS: Tamil': 'ytb_sds_batch3_tamil',
|
| 268 |
+
}
|
|
|
|
|
|
|
| 269 |
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
+
displayname2datasetname = {}
|
| 272 |
+
displayname2datasetname.update(wer_displayname2datasetname)
|
| 273 |
+
displayname2datasetname.update(non_wer_displayname2datasetname)
|
| 274 |
+
|
| 275 |
|
| 276 |
datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
|
| 277 |
|
|
|
|
| 339 |
'Parliament-Short': 'Under Development',
|
| 340 |
'UKUS-News-Short' : 'Under Development',
|
| 341 |
'Mediacorp-Short' : 'Under Development',
|
| 342 |
+
|
| 343 |
+
'CommonVoice-ZH' : 'Under Development',
|
| 344 |
+
'CommonVoice-17-Indonesian' : 'Under Development',
|
| 345 |
+
'CommonVoice-17-Tamil' : 'Under Development',
|
| 346 |
+
'CommonVoice-17-Thai' : 'Under Development',
|
| 347 |
+
'CommonVoice-17-Vietnamese' : 'Under Development',
|
| 348 |
+
'GigaSpeech-2-Indonesain' : 'Under Development',
|
| 349 |
+
'GigaSpeech-2-Thai' : 'Under Development',
|
| 350 |
+
'GigaSpeech-2-Vietnamese' : 'Under Development',
|
| 351 |
+
'Fleurs-Tamil' : 'Under Development',
|
| 352 |
+
'Lotus-Thai' : 'Under Development',
|
| 353 |
+
'MMAU-mini' : 'Under Development',
|
| 354 |
+
|
| 355 |
|
| 356 |
'YouTube ASR: English Singapore Content' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 2.5 hours of audio, with individual clips ranging from 2 seconds to 30 seconds in length.',
|
| 357 |
|
app/draw_diagram.py
CHANGED
|
@@ -7,6 +7,7 @@ from app.content import *
|
|
| 7 |
|
| 8 |
import pandas as pd
|
| 9 |
|
|
|
|
| 10 |
from model_information import get_dataframe
|
| 11 |
info_df = get_dataframe()
|
| 12 |
|
|
@@ -81,38 +82,7 @@ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
|
|
| 81 |
|
| 82 |
return df_style
|
| 83 |
|
| 84 |
-
if cur_dataset_name in
|
| 85 |
-
'LibriSpeech-Clean',
|
| 86 |
-
'LibriSpeech-Other',
|
| 87 |
-
'CommonVoice-15-EN',
|
| 88 |
-
'Peoples-Speech',
|
| 89 |
-
'GigaSpeech-1',
|
| 90 |
-
'Earnings-21',
|
| 91 |
-
'Earnings-22',
|
| 92 |
-
'TED-LIUM-3',
|
| 93 |
-
'TED-LIUM-3-LongForm',
|
| 94 |
-
'AISHELL-ASR-ZH',
|
| 95 |
-
'MNSC-PART1-ASR',
|
| 96 |
-
'MNSC-PART2-ASR',
|
| 97 |
-
'MNSC-PART3-ASR',
|
| 98 |
-
'MNSC-PART4-ASR',
|
| 99 |
-
'MNSC-PART5-ASR',
|
| 100 |
-
'MNSC-PART6-ASR',
|
| 101 |
-
'CNA',
|
| 102 |
-
'IDPC',
|
| 103 |
-
'Parliament',
|
| 104 |
-
'UKUS-News',
|
| 105 |
-
'Mediacorp',
|
| 106 |
-
'IDPC-Short',
|
| 107 |
-
'Parliament-Short',
|
| 108 |
-
'UKUS-News-Short',
|
| 109 |
-
'Mediacorp-Short',
|
| 110 |
-
'YTB-ASR-Batch1',
|
| 111 |
-
'YTB-ASR-Batch2',
|
| 112 |
-
'SEAME-Dev-Man',
|
| 113 |
-
'SEAME-Dev-Sge',
|
| 114 |
-
]:
|
| 115 |
-
|
| 116 |
chart_data_table = chart_data_table.sort_values(
|
| 117 |
by=chart_data_table.columns[1],
|
| 118 |
ascending=True
|
|
|
|
| 7 |
|
| 8 |
import pandas as pd
|
| 9 |
|
| 10 |
+
from app.content import wer_displayname2datasetname
|
| 11 |
from model_information import get_dataframe
|
| 12 |
info_df = get_dataframe()
|
| 13 |
|
|
|
|
| 82 |
|
| 83 |
return df_style
|
| 84 |
|
| 85 |
+
if cur_dataset_name in wer_displayname2datasetname:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
chart_data_table = chart_data_table.sort_values(
|
| 87 |
by=chart_data_table.columns[1],
|
| 88 |
ascending=True
|
app/pages.py
CHANGED
|
@@ -120,28 +120,12 @@ def dashboard():
|
|
| 120 |
""")
|
| 121 |
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
def asr_english():
|
| 129 |
st.title("Task: Automatic Speech Recognition - English")
|
| 130 |
|
| 131 |
sum = ['Overall']
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
'LibriSpeech-Other',
|
| 135 |
-
'CommonVoice-15-EN',
|
| 136 |
-
'Peoples-Speech',
|
| 137 |
-
'GigaSpeech-1',
|
| 138 |
-
'Earnings-21',
|
| 139 |
-
'Earnings-22',
|
| 140 |
-
'TED-LIUM-3',
|
| 141 |
-
'TED-LIUM-3-LongForm',
|
| 142 |
-
]
|
| 143 |
-
|
| 144 |
-
filters_levelone = sum + dataset_lists
|
| 145 |
|
| 146 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 147 |
|
|
@@ -156,23 +140,12 @@ def asr_english():
|
|
| 156 |
draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
|
| 157 |
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
def asr_singlish():
|
| 163 |
st.title("Task: Automatic Speech Recognition - Singlish")
|
| 164 |
|
| 165 |
sum = ['Overall']
|
| 166 |
-
dataset_lists = [
|
| 167 |
-
'MNSC-PART1-ASR',
|
| 168 |
-
'MNSC-PART2-ASR',
|
| 169 |
-
'MNSC-PART3-ASR',
|
| 170 |
-
'MNSC-PART4-ASR',
|
| 171 |
-
'MNSC-PART5-ASR',
|
| 172 |
-
'MNSC-PART6-ASR',
|
| 173 |
-
]
|
| 174 |
|
| 175 |
-
filters_levelone = sum +
|
| 176 |
|
| 177 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 178 |
|
|
@@ -187,17 +160,12 @@ def asr_singlish():
|
|
| 187 |
draw('su', 'asr_singlish', filter_1, 'wer')
|
| 188 |
|
| 189 |
|
| 190 |
-
|
| 191 |
-
|
| 192 |
def asr_mandarin():
|
| 193 |
st.title("Task: Automatic Speech Recognition - Mandarin")
|
| 194 |
|
| 195 |
sum = ['Overall']
|
| 196 |
-
dataset_lists = [
|
| 197 |
-
'AISHELL-ASR-ZH',
|
| 198 |
-
]
|
| 199 |
|
| 200 |
-
filters_levelone = sum +
|
| 201 |
|
| 202 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 203 |
|
|
@@ -211,22 +179,53 @@ def asr_mandarin():
|
|
| 211 |
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
| 212 |
draw('su', 'asr_mandarin', filter_1, 'wer')
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
|
| 217 |
def speech_translation():
|
| 218 |
st.title("Task: Speech Translation")
|
| 219 |
|
| 220 |
sum = ['Overall']
|
| 221 |
-
dataset_lists = [
|
| 222 |
-
'CoVoST2-EN-ID',
|
| 223 |
-
'CoVoST2-EN-ZH',
|
| 224 |
-
'CoVoST2-EN-TA',
|
| 225 |
-
'CoVoST2-ID-EN',
|
| 226 |
-
'CoVoST2-ZH-EN',
|
| 227 |
-
'CoVoST2-TA-EN']
|
| 228 |
|
| 229 |
-
filters_levelone = sum +
|
| 230 |
|
| 231 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 232 |
|
|
@@ -241,22 +240,12 @@ def speech_translation():
|
|
| 241 |
draw('su', 'ST', filter_1, 'bleu')
|
| 242 |
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
def speech_question_answering_english():
|
| 247 |
st.title("Task: Spoken Question Answering - English")
|
| 248 |
|
| 249 |
sum = ['Overall']
|
| 250 |
|
| 251 |
-
|
| 252 |
-
'CN-College-Listen-MCQ',
|
| 253 |
-
'DREAM-TTS-MCQ',
|
| 254 |
-
'SLUE-P2-SQA5',
|
| 255 |
-
'Public-SG-Speech-QA',
|
| 256 |
-
'Spoken-SQuAD',
|
| 257 |
-
]
|
| 258 |
-
|
| 259 |
-
filters_levelone = sum + dataset_lists
|
| 260 |
|
| 261 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 262 |
|
|
@@ -276,22 +265,12 @@ def speech_question_answering_english():
|
|
| 276 |
draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
|
| 277 |
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
| 281 |
def speech_question_answering_singlish():
|
| 282 |
st.title("Task: Spoken Question Answering - Singlish")
|
| 283 |
|
| 284 |
sum = ['Overall']
|
| 285 |
|
| 286 |
-
|
| 287 |
-
'MNSC-PART3-SQA',
|
| 288 |
-
'MNSC-PART4-SQA',
|
| 289 |
-
'MNSC-PART5-SQA',
|
| 290 |
-
'MNSC-PART6-SQA',
|
| 291 |
-
]
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
filters_levelone = sum + dataset_lists
|
| 295 |
|
| 296 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 297 |
|
|
@@ -312,15 +291,7 @@ def spoken_dialogue_summarization_singlish():
|
|
| 312 |
|
| 313 |
sum = ['Overall']
|
| 314 |
|
| 315 |
-
|
| 316 |
-
'MNSC-PART3-SDS',
|
| 317 |
-
'MNSC-PART4-SDS',
|
| 318 |
-
'MNSC-PART5-SDS',
|
| 319 |
-
'MNSC-PART6-SDS',
|
| 320 |
-
]
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
filters_levelone = sum + dataset_lists
|
| 324 |
|
| 325 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 326 |
|
|
@@ -336,18 +307,12 @@ def spoken_dialogue_summarization_singlish():
|
|
| 336 |
draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
|
| 337 |
|
| 338 |
|
| 339 |
-
|
| 340 |
-
|
| 341 |
def speech_instruction():
|
| 342 |
st.title("Task: Speech Instruction")
|
| 343 |
|
| 344 |
sum = ['Overall']
|
| 345 |
-
|
| 346 |
-
dataset_lists = ['OpenHermes-Audio',
|
| 347 |
-
'ALPACA-Audio',
|
| 348 |
-
]
|
| 349 |
|
| 350 |
-
filters_levelone = sum +
|
| 351 |
|
| 352 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 353 |
|
|
@@ -362,14 +327,11 @@ def speech_instruction():
|
|
| 362 |
draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
|
| 363 |
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
| 367 |
def audio_captioning():
|
| 368 |
st.title("Task: Audio Captioning")
|
| 369 |
|
| 370 |
-
filters_levelone =
|
| 371 |
-
|
| 372 |
-
]
|
| 373 |
filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
|
| 374 |
|
| 375 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
|
@@ -384,18 +346,12 @@ def audio_captioning():
|
|
| 384 |
draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
|
| 385 |
|
| 386 |
|
| 387 |
-
|
| 388 |
-
|
| 389 |
def audio_scene_question_answering():
|
| 390 |
st.title("Task: Audio Scene Question Answering")
|
| 391 |
|
| 392 |
sum = ['Overall']
|
| 393 |
-
|
| 394 |
-
dataset_lists = ['Clotho-AQA',
|
| 395 |
-
'WavCaps-QA',
|
| 396 |
-
'AudioCaps-QA']
|
| 397 |
|
| 398 |
-
filters_levelone = sum +
|
| 399 |
|
| 400 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 401 |
|
|
@@ -410,20 +366,12 @@ def audio_scene_question_answering():
|
|
| 410 |
draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
|
| 411 |
|
| 412 |
|
| 413 |
-
|
| 414 |
-
|
| 415 |
def emotion_recognition():
|
| 416 |
st.title("Task: Emotion Recognition")
|
| 417 |
|
| 418 |
sum = ['Overall']
|
| 419 |
|
| 420 |
-
|
| 421 |
-
'IEMOCAP-Emotion',
|
| 422 |
-
'MELD-Sentiment',
|
| 423 |
-
'MELD-Emotion',
|
| 424 |
-
]
|
| 425 |
-
|
| 426 |
-
filters_levelone = sum + dataset_lists
|
| 427 |
|
| 428 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 429 |
|
|
@@ -438,20 +386,12 @@ def emotion_recognition():
|
|
| 438 |
draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
|
| 439 |
|
| 440 |
|
| 441 |
-
|
| 442 |
-
|
| 443 |
def accent_recognition():
|
| 444 |
st.title("Task: Accent Recognition")
|
| 445 |
|
| 446 |
sum = ['Overall']
|
| 447 |
-
dataset_lists = [
|
| 448 |
-
'VoxCeleb-Accent',
|
| 449 |
-
'MNSC-AR-Sentence',
|
| 450 |
-
'MNSC-AR-Dialogue',
|
| 451 |
-
]
|
| 452 |
-
|
| 453 |
|
| 454 |
-
filters_levelone = sum +
|
| 455 |
|
| 456 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 457 |
|
|
@@ -467,19 +407,12 @@ def accent_recognition():
|
|
| 467 |
draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
|
| 468 |
|
| 469 |
|
| 470 |
-
|
| 471 |
-
|
| 472 |
def gender_recognition():
|
| 473 |
st.title("Task: Gender Recognition")
|
| 474 |
|
| 475 |
sum = ['Overall']
|
| 476 |
|
| 477 |
-
|
| 478 |
-
'VoxCeleb-Gender',
|
| 479 |
-
'IEMOCAP-Gender'
|
| 480 |
-
]
|
| 481 |
-
|
| 482 |
-
filters_levelone = sum + dataset_lists
|
| 483 |
|
| 484 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 485 |
|
|
@@ -494,17 +427,12 @@ def gender_recognition():
|
|
| 494 |
draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
|
| 495 |
|
| 496 |
|
| 497 |
-
|
| 498 |
-
|
| 499 |
def music_understanding():
|
| 500 |
st.title("Task: Music Understanding - MCQ Questions")
|
| 501 |
|
| 502 |
sum = ['Overall']
|
| 503 |
|
| 504 |
-
|
| 505 |
-
]
|
| 506 |
-
|
| 507 |
-
filters_levelone = sum + dataset_lists
|
| 508 |
|
| 509 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 510 |
|
|
@@ -519,43 +447,10 @@ def music_understanding():
|
|
| 519 |
draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
|
| 520 |
|
| 521 |
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
def under_development():
|
| 526 |
st.title("Task: Under Development")
|
| 527 |
-
|
| 528 |
|
| 529 |
-
|
| 530 |
-
'YouTube ASR: English Singapore Content',
|
| 531 |
-
'YouTube ASR: English with Strong Emotion',
|
| 532 |
-
'YouTube ASR: Malay with English Prompt',
|
| 533 |
-
'YouTube ASR: Malay with Malay Prompt',
|
| 534 |
-
'YouTube ASR: Chinese with English Prompt',
|
| 535 |
-
'YouTube ASR: Chinese with Chinese Prompt',
|
| 536 |
-
|
| 537 |
-
'YouTube SQA: English with Singapore Content',
|
| 538 |
-
'YouTube SDS: English with Singapore Content',
|
| 539 |
-
'YouTube PQA: English with Singapore Content',
|
| 540 |
-
|
| 541 |
-
'CNA',
|
| 542 |
-
'IDPC',
|
| 543 |
-
'Parliament',
|
| 544 |
-
'UKUS-News',
|
| 545 |
-
'Mediacorp',
|
| 546 |
-
'IDPC-Short',
|
| 547 |
-
'Parliament-Short',
|
| 548 |
-
'UKUS-News-Short',
|
| 549 |
-
'Mediacorp-Short',
|
| 550 |
-
|
| 551 |
-
'SEAME-Dev-Mandarin',
|
| 552 |
-
'SEAME-Dev-Singlish',
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
]
|
| 557 |
-
|
| 558 |
-
filters_levelone = dataset_lists
|
| 559 |
|
| 560 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 561 |
|
|
@@ -592,39 +487,8 @@ def under_development():
|
|
| 592 |
st.markdown('To be implemented')
|
| 593 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
| 594 |
|
| 595 |
-
if filter_1 in
|
| 596 |
-
'CNA',
|
| 597 |
-
'IDPC',
|
| 598 |
-
'Parliament',
|
| 599 |
-
'UKUS-News',
|
| 600 |
-
'Mediacorp',
|
| 601 |
-
'IDPC-Short',
|
| 602 |
-
'Parliament-Short',
|
| 603 |
-
'UKUS-News-Short',
|
| 604 |
-
'Mediacorp-Short',
|
| 605 |
-
|
| 606 |
-
'YouTube ASR: English Singapore Content',
|
| 607 |
-
'YouTube ASR: English with Strong Emotion',
|
| 608 |
-
'YouTube ASR: Malay with English Prompt',
|
| 609 |
-
'YouTube ASR: Malay with Malay Prompt',
|
| 610 |
-
|
| 611 |
-
'YouTube ASR: Chinese with English Prompt',
|
| 612 |
-
'YouTube ASR: Chinese with Chinese Prompt',
|
| 613 |
-
|
| 614 |
-
'SEAME-Dev-Mandarin',
|
| 615 |
-
'SEAME-Dev-Singlish',
|
| 616 |
-
]:
|
| 617 |
-
|
| 618 |
draw('vu', 'under_development_wer', filter_1, 'wer')
|
| 619 |
|
| 620 |
-
elif filter_1 in
|
| 621 |
-
'YouTube SQA: English with Singapore Content',
|
| 622 |
-
'YouTube SDS: English with Singapore Content',
|
| 623 |
-
'YouTube PQA: English with Singapore Content',
|
| 624 |
-
]:
|
| 625 |
draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
|
|
|
| 120 |
""")
|
| 121 |
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
def asr_english():
|
| 124 |
st.title("Task: Automatic Speech Recognition - English")
|
| 125 |
|
| 126 |
sum = ['Overall']
|
| 127 |
+
|
| 128 |
+
filters_levelone = sum + asr_english_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 131 |
|
|
|
|
| 140 |
draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
|
| 141 |
|
| 142 |
|
|
|
|
|
|
|
|
|
|
| 143 |
def asr_singlish():
|
| 144 |
st.title("Task: Automatic Speech Recognition - Singlish")
|
| 145 |
|
| 146 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
+
filters_levelone = sum + asr_singlish_datasets
|
| 149 |
|
| 150 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 151 |
|
|
|
|
| 160 |
draw('su', 'asr_singlish', filter_1, 'wer')
|
| 161 |
|
| 162 |
|
|
|
|
|
|
|
| 163 |
def asr_mandarin():
|
| 164 |
st.title("Task: Automatic Speech Recognition - Mandarin")
|
| 165 |
|
| 166 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
+
filters_levelone = sum + asr_mandarin_datasets
|
| 169 |
|
| 170 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 171 |
|
|
|
|
| 179 |
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
| 180 |
draw('su', 'asr_mandarin', filter_1, 'wer')
|
| 181 |
|
| 182 |
+
|
| 183 |
+
def asr_sea():
|
| 184 |
+
st.title("Task: Automatic Speech Recognition - SEA Languages")
|
| 185 |
+
|
| 186 |
+
sum = ['Overall']
|
| 187 |
+
|
| 188 |
+
filters_levelone = sum + asr_sea_datasets
|
| 189 |
+
|
| 190 |
+
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 191 |
+
|
| 192 |
+
with left:
|
| 193 |
+
filter_1 = st.selectbox('Dataset', filters_levelone)
|
| 194 |
+
|
| 195 |
+
if filter_1:
|
| 196 |
+
if filter_1 in sum:
|
| 197 |
+
sum_table_mulit_metrix('asr_sea', ['wer'])
|
| 198 |
+
else:
|
| 199 |
+
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
| 200 |
+
draw('su', 'asr_sea', filter_1, 'wer')
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def asr_private():
|
| 204 |
+
st.title("Task: Automatic Speech Recognition - Private Datasets")
|
| 205 |
+
|
| 206 |
+
sum = ['Overall']
|
| 207 |
+
|
| 208 |
+
filters_levelone = sum + asr_private_datasets
|
| 209 |
|
| 210 |
+
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 211 |
+
|
| 212 |
+
with left:
|
| 213 |
+
filter_1 = st.selectbox('Dataset', filters_levelone)
|
| 214 |
+
|
| 215 |
+
if filter_1:
|
| 216 |
+
if filter_1 in sum:
|
| 217 |
+
sum_table_mulit_metrix('asr_private', ['wer'])
|
| 218 |
+
else:
|
| 219 |
+
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
| 220 |
+
draw('su', 'asr_private', filter_1, 'wer')
|
| 221 |
|
| 222 |
|
| 223 |
def speech_translation():
|
| 224 |
st.title("Task: Speech Translation")
|
| 225 |
|
| 226 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
+
filters_levelone = sum + speech_translation_datasets
|
| 229 |
|
| 230 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 231 |
|
|
|
|
| 240 |
draw('su', 'ST', filter_1, 'bleu')
|
| 241 |
|
| 242 |
|
|
|
|
|
|
|
| 243 |
def speech_question_answering_english():
|
| 244 |
st.title("Task: Spoken Question Answering - English")
|
| 245 |
|
| 246 |
sum = ['Overall']
|
| 247 |
|
| 248 |
+
filters_levelone = sum + speech_qa_english_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 251 |
|
|
|
|
| 265 |
draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
|
| 266 |
|
| 267 |
|
|
|
|
|
|
|
| 268 |
def speech_question_answering_singlish():
|
| 269 |
st.title("Task: Spoken Question Answering - Singlish")
|
| 270 |
|
| 271 |
sum = ['Overall']
|
| 272 |
|
| 273 |
+
filters_levelone = sum + speech_qa_singlish_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 276 |
|
|
|
|
| 291 |
|
| 292 |
sum = ['Overall']
|
| 293 |
|
| 294 |
+
filters_levelone = sum + sds_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 297 |
|
|
|
|
| 307 |
draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
|
| 308 |
|
| 309 |
|
|
|
|
|
|
|
| 310 |
def speech_instruction():
|
| 311 |
st.title("Task: Speech Instruction")
|
| 312 |
|
| 313 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
+
filters_levelone = sum + si_datasets
|
| 316 |
|
| 317 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 318 |
|
|
|
|
| 327 |
draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
|
| 328 |
|
| 329 |
|
|
|
|
|
|
|
| 330 |
def audio_captioning():
|
| 331 |
st.title("Task: Audio Captioning")
|
| 332 |
|
| 333 |
+
filters_levelone = ac_datasets
|
| 334 |
+
|
|
|
|
| 335 |
filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
|
| 336 |
|
| 337 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
|
|
|
| 346 |
draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
|
| 347 |
|
| 348 |
|
|
|
|
|
|
|
| 349 |
def audio_scene_question_answering():
|
| 350 |
st.title("Task: Audio Scene Question Answering")
|
| 351 |
|
| 352 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
|
| 354 |
+
filters_levelone = sum + asqa_datasets
|
| 355 |
|
| 356 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 357 |
|
|
|
|
| 366 |
draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
|
| 367 |
|
| 368 |
|
|
|
|
|
|
|
| 369 |
def emotion_recognition():
|
| 370 |
st.title("Task: Emotion Recognition")
|
| 371 |
|
| 372 |
sum = ['Overall']
|
| 373 |
|
| 374 |
+
filters_levelone = sum + er_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
|
| 376 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 377 |
|
|
|
|
| 386 |
draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
|
| 387 |
|
| 388 |
|
|
|
|
|
|
|
| 389 |
def accent_recognition():
|
| 390 |
st.title("Task: Accent Recognition")
|
| 391 |
|
| 392 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
+
filters_levelone = sum + ar_datasets
|
| 395 |
|
| 396 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 397 |
|
|
|
|
| 407 |
draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
|
| 408 |
|
| 409 |
|
|
|
|
|
|
|
| 410 |
def gender_recognition():
|
| 411 |
st.title("Task: Gender Recognition")
|
| 412 |
|
| 413 |
sum = ['Overall']
|
| 414 |
|
| 415 |
+
filters_levelone = sum + gr_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 418 |
|
|
|
|
| 427 |
draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
|
| 428 |
|
| 429 |
|
|
|
|
|
|
|
| 430 |
def music_understanding():
|
| 431 |
st.title("Task: Music Understanding - MCQ Questions")
|
| 432 |
|
| 433 |
sum = ['Overall']
|
| 434 |
|
| 435 |
+
filters_levelone = sum + music_datasets
|
|
|
|
|
|
|
|
|
|
| 436 |
|
| 437 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 438 |
|
|
|
|
| 447 |
draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
|
| 448 |
|
| 449 |
|
|
|
|
|
|
|
|
|
|
| 450 |
def under_development():
|
| 451 |
st.title("Task: Under Development")
|
|
|
|
| 452 |
|
| 453 |
+
filters_levelone = non_wer_development_datasets + wer_development_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
| 456 |
|
|
|
|
| 487 |
st.markdown('To be implemented')
|
| 488 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
| 489 |
|
| 490 |
+
if filter_1 in wer_development_datasets:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
draw('vu', 'under_development_wer', filter_1, 'wer')
|
| 492 |
|
| 493 |
+
elif filter_1 in non_wer_development_datasets:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/summarization.py
CHANGED
|
@@ -27,7 +27,6 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
|
|
| 27 |
chart_data = one_chart_data
|
| 28 |
else:
|
| 29 |
chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
|
| 30 |
-
|
| 31 |
|
| 32 |
selected_columns = [i for i in chart_data.columns if i != 'Model']
|
| 33 |
chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
|
|
@@ -79,7 +78,8 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
|
|
| 79 |
|
| 80 |
|
| 81 |
# Format numeric columns to 2 decimal places
|
| 82 |
-
|
|
|
|
| 83 |
|
| 84 |
if metrics in ['wer']:
|
| 85 |
ascend = True
|
|
|
|
| 27 |
chart_data = one_chart_data
|
| 28 |
else:
|
| 29 |
chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
|
|
|
|
| 30 |
|
| 31 |
selected_columns = [i for i in chart_data.columns if i != 'Model']
|
| 32 |
chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
# Format numeric columns to 2 decimal places
|
| 81 |
+
target_column = chart_data_table.columns[1]
|
| 82 |
+
chart_data_table.loc[:, target_column] = chart_data_table[target_column].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
|
| 83 |
|
| 84 |
if metrics in ['wer']:
|
| 85 |
ascend = True
|
model_information.py
CHANGED
|
@@ -36,6 +36,30 @@ data['Original Name'].append('MERaLiON-AudioLLM-Whisper-SEA-LION')
|
|
| 36 |
data['Proper Display Name'].append('Fusion: MERaLiON-AudioLLM-Whisper-SEA-LION')
|
| 37 |
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
data['Original Name'].append('cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct')
|
| 40 |
data['Proper Display Name'].append('Cascade: Whisper-Large-v2 / SEA-LIONv3')
|
| 41 |
data['Link'].append('https://github.com/aisingapore/sealion')
|
|
@@ -44,7 +68,6 @@ data['Original Name'].append('whisper_large_v3')
|
|
| 44 |
data['Proper Display Name'].append('Whisper-large-v3')
|
| 45 |
data['Link'].append('https://huggingface.co/openai/whisper-large-v3')
|
| 46 |
|
| 47 |
-
|
| 48 |
data['Original Name'].append('gemini-1.5-flash')
|
| 49 |
data['Proper Display Name'].append('Gemini-1.5-Flash')
|
| 50 |
data['Link'].append('https://ai.google.dev/gemini-api/docs/models/gemini')
|
|
|
|
| 36 |
data['Proper Display Name'].append('Fusion: MERaLiON-AudioLLM-Whisper-SEA-LION')
|
| 37 |
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
| 38 |
|
| 39 |
+
data['Original Name'].append('MERaLiON-AudioLLM-v2-2b')
|
| 40 |
+
data['Proper Display Name'].append('Fusion: MERaLiON-2-3B')
|
| 41 |
+
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
| 42 |
+
|
| 43 |
+
data['Original Name'].append('MERaLiON-AudioLLM-v2-9b')
|
| 44 |
+
data['Proper Display Name'].append('Fusion: MERaLiON-2-10B')
|
| 45 |
+
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
| 46 |
+
|
| 47 |
+
data['Original Name'].append('MERaLiON-AudioLLM-v2-9b-asr')
|
| 48 |
+
data['Proper Display Name'].append('Fusion: MERaLiON-2-10B-ASR')
|
| 49 |
+
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
| 50 |
+
|
| 51 |
+
data['Original Name'].append('phi_4_multimodal_instruct')
|
| 52 |
+
data['Proper Display Name'].append('Fusion: Phi-4-multimodal-instruct')
|
| 53 |
+
data['Link'].append('https://huggingface.co/microsoft/Phi-4-multimodal-instruct')
|
| 54 |
+
|
| 55 |
+
data['Original Name'].append('Qwen2.5-Omni-3B')
|
| 56 |
+
data['Proper Display Name'].append('Fusion: Qwen2.5-Omni-3B')
|
| 57 |
+
data['Link'].append('https://huggingface.co/Qwen/Qwen2.5-Omni-3B')
|
| 58 |
+
|
| 59 |
+
data['Original Name'].append('Qwen2.5-Omni-7B')
|
| 60 |
+
data['Proper Display Name'].append('Fusion: Qwen2.5-Omni-7B')
|
| 61 |
+
data['Link'].append('https://huggingface.co/Qwen/Qwen2.5-Omni-7B')
|
| 62 |
+
|
| 63 |
data['Original Name'].append('cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct')
|
| 64 |
data['Proper Display Name'].append('Cascade: Whisper-Large-v2 / SEA-LIONv3')
|
| 65 |
data['Link'].append('https://github.com/aisingapore/sealion')
|
|
|
|
| 68 |
data['Proper Display Name'].append('Whisper-large-v3')
|
| 69 |
data['Link'].append('https://huggingface.co/openai/whisper-large-v3')
|
| 70 |
|
|
|
|
| 71 |
data['Original Name'].append('gemini-1.5-flash')
|
| 72 |
data['Proper Display Name'].append('Gemini-1.5-Flash')
|
| 73 |
data['Link'].append('https://ai.google.dev/gemini-api/docs/models/gemini')
|
results_organized/bleu/st.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
|
| 2 |
-
Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
whisper_large_v3,1.600581653970121,0.16408986541757878,0.02107778621423822,46.01512198258627,14.673689493155793,2.451098639578599
|
| 7 |
old_models,,,,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.620150160643625,35.274306071307024,8.433062902024755,46.80524126004861,15.209998552437538,2.8327095799289337
|
| 9 |
gemini-1.5-flash,,,,,,
|
| 10 |
-
WavLLM_fairseq,13.841886973016162,31.96381187282953,0.
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
|
| 2 |
+
Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.0345148380723629,0.4564861971472884,9.898238298955656,0.0169914430109318
|
| 3 |
+
hy_whisper_local_cs,1.0869208512565696,0.1057326962921535,0.0089505165494316,22.267131378964944,7.31707791416422,2.8610263518826757
|
| 4 |
+
Qwen2-Audio-7B-Instruct,16.325186897428104,25.765420247070075,0.0324597207187291,6.326113431899141,16.466557744958333,0.0442583814605029
|
| 5 |
+
whisper_large_v3,1.600581653970121,0.1640898654175787,0.0210777862142382,46.01512198258627,14.673689493155791,2.451098639578599
|
|
|
|
| 6 |
old_models,,,,,,
|
|
|
|
| 7 |
gemini-1.5-flash,,,,,,
|
| 8 |
+
WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994,5.933522277713613,2.368659001743569,0.1695522548322915
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,37.058238343330466,43.96331874536172,13.808713343771569,43.37364836260576,19.55610418584389,4.758175879451736
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,30.658188021678257,40.02820084309168,5.601731502002274,37.77329494766737,16.777825775562142,1.9423083468131173
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,36.242124109428445,43.747307981166834,10.885517678613343,47.85937752036512,22.133726547487697,3.4786390367027833
|
| 12 |
+
Qwen2.5-Omni-3B,3.2577143149506815,10.28866767786604,0.020665917336912663,15.00712601210481,8.98152195711894,0.04161842995351044
|
| 13 |
+
Qwen2.5-Omni-7B,2.612412992528698,12.429229982446326,0.05482974047730791,12.471476026200369,9.974234734341179,0.02999794683579762
|
| 14 |
+
SALMONN_7B,14.193483776951359,33.255550227097565,0.0005121531999434492,27.88515689237341,5.175547389931541,0.40577007761551664
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.59161630015759,28.71368811388653,7.474730798912167,46.80524126004861,15.209998552437538,2.8327095799289337
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,10.753313930099422,6.089840198985321,1.0029597453865848,46.79744652156276,14.156349261775734,2.4177196689141547
|
| 17 |
+
phi_4_multimodal_instruct,14.553644350540432,45.48015814069248,0.14817117451495013,0.37716244197757426,22.330318273444895,0.07320611681035753
|
results_organized/llama3_70b_judge/accent_recognition.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
|
| 2 |
Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,47.01682396389003,7.816666666666666,77.83333333333333
|
| 4 |
hy_whisper_local_cs,,,
|
| 5 |
-
Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.
|
| 6 |
whisper_large_v3,,,
|
| 7 |
old_models,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.640951990151827,26.016666666666666,7.633333333333334
|
| 9 |
gemini-1.5-flash,,,
|
| 10 |
-
WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
|
| 2 |
Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
|
|
|
|
| 3 |
hy_whisper_local_cs,,,
|
| 4 |
+
Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.9666666666666668
|
| 5 |
whisper_large_v3,,,
|
| 6 |
old_models,,,
|
|
|
|
| 7 |
gemini-1.5-flash,,,
|
| 8 |
+
WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.2333333333333333
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,47.066064833812064,,
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,66.59827656955272,,
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,40.78785391875257,,
|
| 12 |
+
Qwen2.5-Omni-3B,0.9027492819039803,,
|
| 13 |
+
Qwen2.5-Omni-7B,1.661879359868691,,
|
| 14 |
+
SALMONN_7B,31.69881001231022,,
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,28.00574476815757,,
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,40.29544521953221,,
|
| 17 |
+
phi_4_multimodal_instruct,2.6261797291752154,,
|
results_organized/llama3_70b_judge/audio_captioning.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,audiocaps_test,wavcaps_test
|
| 2 |
Qwen-Audio-Chat,47.04090909090909,32.9364161849711
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,38.00454545454545,33.97687861271676
|
| 4 |
hy_whisper_local_cs,,
|
| 5 |
Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
|
| 6 |
whisper_large_v3,,
|
| 7 |
old_models,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,3.0954545454545457,6.3468208092485545
|
| 9 |
gemini-1.5-flash,,
|
| 10 |
WavLLM_fairseq,5.5,6.901734104046243
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,audiocaps_test,wavcaps_test
|
| 2 |
Qwen-Audio-Chat,47.04090909090909,32.9364161849711
|
|
|
|
| 3 |
hy_whisper_local_cs,,
|
| 4 |
Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
|
| 5 |
whisper_large_v3,,
|
| 6 |
old_models,,
|
|
|
|
| 7 |
gemini-1.5-flash,,
|
| 8 |
WavLLM_fairseq,5.5,6.901734104046243
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,39.38636363636363,34.566473988439306
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,35.07727272727273,31.410404624277458
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,36.04090909090909,35.16763005780347
|
| 12 |
+
Qwen2.5-Omni-3B,43.69545454545454,34.70520231213873
|
| 13 |
+
Qwen2.5-Omni-7B,37.7,26.09248554913295
|
| 14 |
+
SALMONN_7B,35.24090909090909,22.520231213872833
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,2.4545454545454546,3.8265895953757223
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,2.5136363636363637,3.3179190751445087
|
| 17 |
+
phi_4_multimodal_instruct,33.595454545454544,28.069364161849713
|
results_organized/llama3_70b_judge/audio_scene_question_answering.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
|
| 2 |
Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,63.15021876519203,49.77635782747604,46.31578947368421
|
| 4 |
hy_whisper_local_cs,,,
|
| 5 |
Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
|
| 6 |
whisper_large_v3,,,
|
| 7 |
old_models,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.647544968400585,18.466453674121407,18.88157894736842
|
| 9 |
gemini-1.5-flash,,,
|
| 10 |
WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
|
| 2 |
Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
|
|
|
|
| 3 |
hy_whisper_local_cs,,,
|
| 4 |
Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
|
| 5 |
whisper_large_v3,,,
|
| 6 |
old_models,,,
|
|
|
|
| 7 |
gemini-1.5-flash,,,
|
| 8 |
WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,62.67379679144385,48.81789137380192,45.131578947368425
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,50.53962080700049,44.79233226837061,43.0921052631579
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,58.20126397666505,50.35143769968051,44.868421052631575
|
| 12 |
+
Qwen2.5-Omni-3B,52.64948954788527,48.56230031948882,43.15789473684211
|
| 13 |
+
Qwen2.5-Omni-7B,46.592124453087024,50.41533546325879,40.0
|
| 14 |
+
SALMONN_7B,58.19154107924162,50.35143769968051,46.90789473684211
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,22.673796791443852,17.44408945686901,14.013157894736842
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,29.820126397666506,17.06070287539936,18.75
|
| 17 |
+
phi_4_multimodal_instruct,48.37141468157511,40.319488817891376,37.96052631578947
|
results_organized/llama3_70b_judge/emotion_recognition.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
|
| 2 |
Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,48.505976095617534,46.206896551724135,36.36015325670498
|
| 4 |
hy_whisper_local_cs,,,
|
| 5 |
Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
|
| 6 |
whisper_large_v3,,,
|
| 7 |
old_models,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,44.322709163346616,56.59003831417625,47.356321839080465
|
| 9 |
gemini-1.5-flash,,,
|
| 10 |
WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
|
| 2 |
Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
|
|
|
|
| 3 |
hy_whisper_local_cs,,,
|
| 4 |
Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
|
| 5 |
whisper_large_v3,,,
|
| 6 |
old_models,,,
|
|
|
|
| 7 |
gemini-1.5-flash,,,
|
| 8 |
WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,49.103585657370516,52.452107279693486,44.17624521072797
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,51.39442231075698,58.582375478927204,52.1455938697318
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,62.54980079681275,68.85057471264368,59.808429118773944
|
| 12 |
+
Qwen2.5-Omni-3B,34.36254980079681,30.421455938697317,34.32950191570881
|
| 13 |
+
Qwen2.5-Omni-7B,36.55378486055777,27.77777777777778,30.07662835249042
|
| 14 |
+
SALMONN_7B,26.195219123505975,42.26053639846744,32.298850574712645
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,41.98207171314741,58.39080459770115,44.272030651341
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,46.91235059760957,56.47509578544061,49.42528735632184
|
| 17 |
+
phi_4_multimodal_instruct,32.07171314741036,49.11877394636016,40.84291187739464
|
results_organized/llama3_70b_judge/gender_recognition.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
-
Model,voxceleb_gender_test,iemocap_gender_test
|
| 2 |
-
Qwen-Audio-Chat,70.5990972507181,50.0996015936255
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,voxceleb_gender_test,iemocap_gender_test
|
| 2 |
+
Qwen-Audio-Chat,70.5990972507181,50.0996015936255
|
| 3 |
+
hy_whisper_local_cs,,
|
| 4 |
+
Qwen2-Audio-7B-Instruct,99.1177677472302,92.80876494023904
|
| 5 |
+
whisper_large_v3,,
|
| 6 |
+
old_models,,
|
| 7 |
+
gemini-1.5-flash,,
|
| 8 |
+
WavLLM_fairseq,69.61427985227739,51.932270916334666
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,99.73327862125564,94.6215139442231
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,99.69224456298728,87.92828685258964
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,97.2507180960197,92.96812749003983
|
| 12 |
+
Qwen2.5-Omni-3B,32.78621255642183,62.948207171314735
|
| 13 |
+
Qwen2.5-Omni-7B,54.08288879770209,43.366533864541836
|
| 14 |
+
SALMONN_7B,88.53098071399262,80.199203187251
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,26.631103816167418,12.211155378486056
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,69.69634796881412,44.38247011952191
|
| 17 |
+
phi_4_multimodal_instruct,94.58350430857611,46.852589641434264
|
results_organized/llama3_70b_judge/music_understanding.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,muchomusic_test
|
| 2 |
Qwen-Audio-Chat,59.0564448188711
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,57.7927548441449
|
| 4 |
hy_whisper_local_cs,
|
| 5 |
Qwen2-Audio-7B-Instruct,71.60909856781802
|
| 6 |
whisper_large_v3,
|
| 7 |
old_models,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.727042965459134
|
| 9 |
gemini-1.5-flash,
|
| 10 |
WavLLM_fairseq,44.3133951137321
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,muchomusic_test
|
| 2 |
Qwen-Audio-Chat,59.0564448188711
|
|
|
|
| 3 |
hy_whisper_local_cs,
|
| 4 |
Qwen2-Audio-7B-Instruct,71.60909856781802
|
| 5 |
whisper_large_v3,
|
| 6 |
old_models,
|
|
|
|
| 7 |
gemini-1.5-flash,
|
| 8 |
WavLLM_fairseq,44.3133951137321
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,51.34793597304128
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,55.602358887952825
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,63.94271272114573
|
| 12 |
+
Qwen2.5-Omni-3B,59.30918281381634
|
| 13 |
+
Qwen2.5-Omni-7B,47.598989048020215
|
| 14 |
+
SALMONN_7B,49.70513900589722
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,50.463352990732936
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,56.697556866048856
|
| 17 |
+
phi_4_multimodal_instruct,55.2653748946925
|
results_organized/llama3_70b_judge/sds_singlish.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
|
| 2 |
Qwen-Audio-Chat,16.4,16.0,28.2,40.4
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,48.4,46.4,57.0,62.599999999999994
|
| 4 |
hy_whisper_local_cs,,,,
|
| 5 |
Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
|
| 6 |
whisper_large_v3,,,,
|
| 7 |
old_models,,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,45.4,44.0,58.0,65.4
|
| 9 |
gemini-1.5-flash,,,,
|
| 10 |
-
WavLLM_fairseq,31.6,31.6,45.
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
|
| 2 |
Qwen-Audio-Chat,16.4,16.0,28.2,40.4
|
|
|
|
| 3 |
hy_whisper_local_cs,,,,
|
| 4 |
Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
|
| 5 |
whisper_large_v3,,,,
|
| 6 |
old_models,,,,
|
|
|
|
| 7 |
gemini-1.5-flash,,,,
|
| 8 |
+
WavLLM_fairseq,31.6,31.6,45.2,49.400000000000006
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,47.800000000000004,46.4,54.6,65.6
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,42.2,40.199999999999996,51.8,60.0
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,49.8,46.6,55.4,60.599999999999994
|
| 12 |
+
Qwen2.5-Omni-3B,42.800000000000004,33.199999999999996,52.199999999999996,58.8
|
| 13 |
+
Qwen2.5-Omni-7B,39.8,31.6,42.800000000000004,58.4
|
| 14 |
+
SALMONN_7B,9.0,7.4,16.0,25.2
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,48.4,45.599999999999994,53.4,56.6
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,38.0,38.199999999999996,46.2,61.0
|
| 17 |
+
phi_4_multimodal_instruct,43.6,42.800000000000004,55.599999999999994,61.0
|
results_organized/llama3_70b_judge/speech_instruction.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,openhermes_audio_test,alpaca_audio_test
|
| 2 |
-
Qwen-Audio-Chat,10.
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,65.6,74.80000000000001
|
| 4 |
hy_whisper_local_cs,,
|
| 5 |
-
Qwen2-Audio-7B-Instruct,44.
|
| 6 |
whisper_large_v3,,
|
| 7 |
old_models,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,72.2,73.8
|
| 9 |
gemini-1.5-flash,,
|
| 10 |
WavLLM_fairseq,19.2,21.6
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,openhermes_audio_test,alpaca_audio_test
|
| 2 |
+
Qwen-Audio-Chat,10.6,9.8
|
|
|
|
| 3 |
hy_whisper_local_cs,,
|
| 4 |
+
Qwen2-Audio-7B-Instruct,44.8,52.599999999999994
|
| 5 |
whisper_large_v3,,
|
| 6 |
old_models,,
|
|
|
|
| 7 |
gemini-1.5-flash,,
|
| 8 |
WavLLM_fairseq,19.2,21.6
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,66.39999999999999,75.19999999999999
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,12.6,25.6
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,66.2,74.2
|
| 12 |
+
Qwen2.5-Omni-3B,66.0,64.0
|
| 13 |
+
Qwen2.5-Omni-7B,57.400000000000006,59.2
|
| 14 |
+
SALMONN_7B,15.4,10.4
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,78.8,67.0
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,62.800000000000004,69.4
|
| 17 |
+
phi_4_multimodal_instruct,39.0,33.4
|
results_organized/llama3_70b_judge/sqa_english.csv
CHANGED
|
@@ -1,12 +1,15 @@
|
|
| 1 |
-
Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test
|
| 2 |
-
Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test,mmau_mini
|
| 2 |
+
Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.23205636283576,59.749085206481965,
|
| 3 |
+
hy_whisper_local_cs,,,,,,
|
| 4 |
+
Qwen2-Audio-7B-Instruct,80.04901960784315,58.31395348837209,64.86264249672958,74.7247908410392,66.49242028227914,
|
| 5 |
+
whisper_large_v3,,,,,,
|
| 6 |
+
old_models,,,,,,
|
| 7 |
+
gemini-1.5-flash,,,,89.25583443416997,,
|
| 8 |
+
WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894319684,66.5446941975954,
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,86.7156862745098,59.59302325581396,74.20669033825453,57.11140466754734,51.54208050182959,53.1
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,83.18627450980392,69.47674418604652,81.4614090824145,66.00616468516073,61.16048092002091,50.99999999999999
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,89.55882352941177,75.02906976744187,89.20949355260699,84.58828709819463,83.32462101411396,56.699999999999996
|
| 12 |
+
SALMONN_7B,80.88235294117646,59.38953488372093,65.64754251541768,50.81461911052399,56.56037637219028,50.6
|
| 13 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.51960784313726,70.93023255813954,57.16314707531303,89.52003522677234,85.15420805018296,52.6
|
| 14 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,86.96078431372548,69.68023255813954,87.43412446271725,84.98458828709819,86.1996863565081,55.900000000000006
|
| 15 |
+
phi_4_multimodal_instruct,83.72549019607844,74.18604651162791,83.19566436180153,75.6494936151475,77.5222164140094,58.8
|
results_organized/llama3_70b_judge/sqa_singlish.csv
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
|
| 2 |
-
Qwen-Audio-Chat,32.2,37.8,47.
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,51.4,53.2,64.80000000000001,67.2
|
| 4 |
hy_whisper_local_cs,,,,
|
| 5 |
Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
|
| 6 |
whisper_large_v3,,,,
|
| 7 |
old_models,,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,56.0,66.0,74.0,71.6
|
| 9 |
gemini-1.5-flash,,,,
|
| 10 |
-
WavLLM_fairseq,45.
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
|
| 2 |
+
Qwen-Audio-Chat,32.2,37.8,47.8,51.4
|
|
|
|
| 3 |
hy_whisper_local_cs,,,,
|
| 4 |
Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
|
| 5 |
whisper_large_v3,,,,
|
| 6 |
old_models,,,,
|
|
|
|
| 7 |
gemini-1.5-flash,,,,
|
| 8 |
+
WavLLM_fairseq,45.2,46.6,50.8,62.2
|
| 9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,55.199999999999996,50.0,63.0,67.4
|
| 10 |
+
MERaLiON-AudioLLM-v2-2b,52.599999999999994,54.6,61.4,70.19999999999999
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b,59.400000000000006,63.0,72.0,71.8
|
| 12 |
+
Qwen2.5-Omni-3B,52.400000000000006,54.400000000000006,66.0,69.2
|
| 13 |
+
Qwen2.5-Omni-7B,54.2,52.0,62.800000000000004,64.6
|
| 14 |
+
SALMONN_7B,42.0,35.4,45.8,49.6
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.4,46.4,54.6,62.599999999999994
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,51.6,55.599999999999994,62.0,68.2
|
| 17 |
+
phi_4_multimodal_instruct,55.0,56.4,64.6,71.8
|
results_organized/llama3_70b_judge/under_development_llama3_70b_judge.csv
CHANGED
|
@@ -7,6 +7,6 @@ whisper_large_v3,,,
|
|
| 7 |
old_models,,,
|
| 8 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
|
| 9 |
gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
|
| 10 |
-
WavLLM_fairseq,60.70935960591133,55.
|
| 11 |
-
SALMONN_7B,55.665024630541865,31.
|
| 12 |
cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
|
|
|
|
| 7 |
old_models,,,
|
| 8 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
|
| 9 |
gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
|
| 10 |
+
WavLLM_fairseq,60.70935960591133,55.62585969738653,40.95238095238095
|
| 11 |
+
SALMONN_7B,55.665024630541865,31.27922971114167,32.124542124542124
|
| 12 |
cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
|
results_organized/meteor/audio_captioning.csv
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
Model,audiocaps_test,wavcaps_test
|
| 2 |
-
Qwen-Audio-Chat,0.
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,0.
|
| 4 |
hy_whisper_local_cs,,
|
| 5 |
-
Qwen2-Audio-7B-Instruct,0.
|
| 6 |
whisper_large_v3,,
|
| 7 |
old_models,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.
|
| 9 |
gemini-1.5-flash,,
|
| 10 |
-
WavLLM_fairseq,0.
|
| 11 |
-
SALMONN_7B,0.
|
| 12 |
-
cascade_whisper_large_v3_llama_3_8b_instruct,0.
|
|
|
|
| 1 |
Model,audiocaps_test,wavcaps_test
|
| 2 |
+
Qwen-Audio-Chat,0.2755301507695097,0.2355106805560457
|
| 3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.2492004703435381,0.3175511907248581
|
| 4 |
hy_whisper_local_cs,,
|
| 5 |
+
Qwen2-Audio-7B-Instruct,0.1989171207631428,0.2134229485619918
|
| 6 |
whisper_large_v3,,
|
| 7 |
old_models,,
|
| 8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.0579681972394305,0.120421856260385
|
| 9 |
gemini-1.5-flash,,
|
| 10 |
+
WavLLM_fairseq,0.0417329650944285,0.0639952252468867
|
| 11 |
+
SALMONN_7B,0.2099405248433995,0.1717511277065815
|
| 12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.0795304845778549,0.1388630786594543
|
results_organized/wer/asr_english.csv
CHANGED
|
@@ -1,12 +1,18 @@
|
|
| 1 |
Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
|
| 2 |
-
Qwen-Audio-Chat,0.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
Qwen2-Audio-7B-Instruct,0.035141660693401744,0.060415760304159495,0.11438872500819404,0.2165498391593041,0.11723812890302816,0.18872219319407232,0.23542555661330924,0.06114048472375004,0.08739585179932637
|
| 6 |
-
whisper_large_v3,0.01878749009695552,0.03660128246354058,0.10001863741235596,0.14602420615337386,0.09459022434812692,0.11863959266711877,0.15887899737116104,0.037649480146197796,0.03208650948413402
|
| 7 |
old_models,,,,,,,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.032349945297468596,0.05307658841999735,0.10600831614192711,0.20140159998943682,0.09948381629977261,0.11416493424197618,0.1448629161356777,0.04900464852205386,0.04396383619925545
|
| 9 |
gemini-1.5-flash,,,,,,,,,
|
| 10 |
-
WavLLM_fairseq,0.
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
|
| 2 |
+
Qwen-Audio-Chat,0.0202587995623797,0.043467569561352,0.1127242112839891,0.3141914474672335,0.1301891002258773,0.2655529121410546,0.3664994875132684,0.0405237571413363,0.2911540507002305
|
| 3 |
+
Qwen2-Audio-7B-Instruct,0.0351416606934017,0.0604157603041594,0.114388725008194,0.2165498391593041,0.1172381289030281,0.1887221931940723,0.2354255566133092,0.06114048472375,0.0873958517993263
|
| 4 |
+
whisper_large_v3,0.0187874900969555,0.0366012824635405,0.1000186374123559,0.1460242061533738,0.0945902243481269,0.1186395926671187,0.158878997371161,0.0376494801461977,0.032086509484134
|
|
|
|
|
|
|
| 5 |
old_models,,,,,,,,,
|
|
|
|
| 6 |
gemini-1.5-flash,,,,,,,,,
|
| 7 |
+
WavLLM_fairseq,0.0210321801788206,0.0479883481188643,0.1453332562130063,0.3792176325635977,0.154917784145464,0.6447482518259942,0.6671766188447099,0.0662148255917107,0.4536784258110264
|
| 8 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.023937073225940318,0.0422569845082944,0.07797507728099434,0.21620323529945748,0.14477210452030514,0.13838923413858656,0.16553574886426656,0.08154430289911642,0.10512320510547775
|
| 9 |
+
MERaLiON-AudioLLM-v2-2b,0.027124910401026145,0.050958064577146425,0.09270505973611995,0.20627055897299626,0.09237908290276242,0.21886082422652334,0.23935918375209228,0.03456229374401192,0.13837971990781775
|
| 10 |
+
MERaLiON-AudioLLM-v2-9b,0.02497453502848304,0.046607524542720415,0.08676036786395974,0.20476530792451958,0.09023061553464748,0.1084090226901313,0.15062142184399924,0.03513005216280473,0.043573834426520124
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.020956728411363035,0.04040327614579984,0.0761563229028091,0.1957668115250735,0.08768103407213536,0.09210848128425476,0.1277414998676963,0.0313686526383024,0.03495834071973054
|
| 12 |
+
Qwen2.5-Omni-3B,0.01765571358509073,0.03898462178674788,0.08397118270448134,0.2217852079375585,0.09894231227233641,0.12490689375326566,0.18720009894897133,0.03211383556296796,0.052153873426697396
|
| 13 |
+
Qwen2.5-Omni-7B,0.02252235258610933,0.04165169198176556,0.08635548614726127,0.31617534194121266,0.12679717916513114,0.23232370957521317,0.2807910240306093,0.0633760334977467,0.09094132246055664
|
| 14 |
+
SALMONN_7B,0.09638963292715132,0.11776722719276675,0.315955552984878,0.24158949229136512,0.11024871580815716,0.27733154717568453,0.37956460424973665,0.039352755402576205,0.14139336996986349
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.03299128532085864,0.05381428868670437,0.10610471655066483,0.20285898669536326,0.09994259054523941,0.14091838890062366,0.17187922953626794,0.04939498243497392,0.08636766530756958
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.018032972422378994,0.035504189759207064,0.09879113887442882,0.14542012514049835,0.09501640807342393,0.10872308256717546,0.1459710229559586,0.038146268762641496,0.04935295160432548
|
| 17 |
+
hy_whisper_local_cs,0.029086656354925113,0.05591389713810127,0.1066766923091754,0.17879147486544342,0.10212866235970408,0.14925070316060968,0.17014458107377883,0.04666264504453355,0.06973940790639957
|
| 18 |
+
phi_4_multimodal_instruct,0.016844607084920964,0.03851173700039722,0.08109202383018103,0.2147161396912585,0.0988294989332872,0.1306461295594268,0.22572024408764688,0.028636315247862035,0.05062932104236838
|
results_organized/wer/asr_mandarin.csv
CHANGED
|
@@ -1,12 +1,18 @@
|
|
| 1 |
-
Model,aishell_asr_zh_test
|
| 2 |
-
Qwen-Audio-Chat,0.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,aishell_asr_zh_test,commonvoice_zh_asr
|
| 2 |
+
Qwen-Audio-Chat,0.9469917443725128,
|
| 3 |
+
Qwen2-Audio-7B-Instruct,0.0926035912969452,
|
| 4 |
+
whisper_large_v3,0.1235968402922135,
|
| 5 |
+
old_models,,
|
| 6 |
+
gemini-1.5-flash,,
|
| 7 |
+
WavLLM_fairseq,0.7054601967888183,
|
| 8 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.12846706657955692,0.3269799259362027
|
| 9 |
+
MERaLiON-AudioLLM-v2-2b,0.05010789728969927,0.13139387212789344
|
| 10 |
+
MERaLiON-AudioLLM-v2-9b,0.05789827958266516,0.14684695260557293
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.043317297222387204,0.1183419954537208
|
| 12 |
+
Qwen2.5-Omni-3B,0.08080418126744669,0.08551487145555639
|
| 13 |
+
Qwen2.5-Omni-7B,0.08943596444338857,0.0775535468448182
|
| 14 |
+
SALMONN_7B,0.9314703727900854,1.0013340021130595
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20889509215814378,0.31938144990021666
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111,0.1962263748225777
|
| 17 |
+
hy_whisper_local_cs,0.15675793391538476,0.287290695068461
|
| 18 |
+
phi_4_multimodal_instruct,0.12232978955079092,0.154221316286565
|
results_organized/wer/asr_private.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,cna_test,idpc_short_test,idpc_test,mediacorp_short_test,mediacorp_test,parliament_test,ukusnews_test,ytb_asr_batch1,ytb_asr_batch2,ytb_asr_batch3_chinese,ytb_asr_batch3_malay,ytb_asr_batch3_tamil
|
| 2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.14503898323187012,0.16498433693003828,0.20359281437125748,0.12828873397796267,0.12250898399215943,0.058780395496262655,0.1128757799205899,0.10724437274333563,0.13268461455292463,0.418102808691044,0.28989513404414025,0.6929759165018962
|
| 3 |
+
MERaLiON-AudioLLM-v2-2b,0.13494606429563175,0.15106160807518274,0.17741659538066723,0.1208680008994828,0.12250898399215943,0.18544800832623712,0.17383248251087163,0.09933164323576861,0.15990917937074278,0.25613142554319024,0.2798911851169321,0.7504943113675407
|
| 4 |
+
MERaLiON-AudioLLM-v2-9b,0.13334401367083198,0.15663069961712495,0.16030795551753635,0.11693276366089499,0.10454099967330938,0.06024694862333239,0.06972962752883342,0.09848659445340709,0.1110174072872743,0.19133015368309486,0.20907375718485366,0.6644679264853651
|
| 5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.12709601623411299,0.14009745910198398,0.16612489307100087,0.11783224645828648,0.10372427311336165,0.05284322073989971,0.055965210814898844,0.09230237381885227,0.09936209319926478,0.1494223635400106,0.19463823439076827,0.5467894071504975
|
| 6 |
+
Qwen2.5-Omni-3B,0.15224821104346897,0.3038635572572224,0.19743370402053037,0.13660894985383404,0.1391702058150931,0.09165957044185827,0.0828512006050293,0.12241683951755397,0.24802681370959023,0.2562374138844727,2.2815585099381335,1.2873650773070564
|
| 7 |
+
Qwen2.5-Omni-7B,0.17280786072839902,0.4491820396797772,0.6198460222412319,0.26714639082527547,0.3391048676902973,0.2558898665909736,0.22628096048402344,0.20300376430821235,0.34827548924208024,0.19881293057763647,1.4799262866921152,1.0804025801432693
|
| 8 |
+
SALMONN_7B,0.1492577165438428,0.2398190045248869,0.5414884516680923,0.19901056892286936,0.3636883371447239,0.20430031223389156,0.191869918699187,0.2207497887378044,0.3495513028435506,0.8858293587705353,1.0858672282918695,0.985267900554277
|
| 9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.19735468151757746,0.17040205303678357,0.1541488644029683,0.15754655341391702,0.09007474690131517,0.12278313480809226,0.12475992932319274,0.12552708400908205,0.3469210386857446,0.3143784827344127,0.9665002755178114
|
| 10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.13815016554523124,0.15344926428434932,0.16184773310521813,0.11434675061839443,0.15125775890231952,0.06537988456807645,0.08943089430894309,0.10816624414227549,0.08387933830684398,0.2698675145733969,0.3119213724715897,0.8976532365239376
|
| 11 |
+
hy_whisper_local_cs,0.14674783723165652,0.18308388444135051,0.17570573139435414,0.12885091072633237,0.1256125449199608,0.07257072570725707,0.16948383437322745,0.1284858262272413,0.14315061087685155,0.27520932697403283,0.2421569917950068,0.8339924151567211
|
| 12 |
+
phi_4_multimodal_instruct,0.19080422941364947,0.5388096066829099,0.26073567151411464,0.1217674836968743,0.19813786344331918,0.2778645094143249,0.07521270561542824,0.16939386955519706,0.23232781922369986,0.44008479067302597,3.762932736606555,2.7500567242552916
|
results_organized/wer/asr_sea.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,commonvoice_17_id_asr,commonvoice_17_ta_asr,commonvoice_17_vi_asr,fleurs_tamil_ta_30_asr,gigaspeech2_id_test,gigaspeech2_th_test,gigaspeech2_vi_test,lotus_thai_th_30_asr
|
| 2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.25954549636581103,0.5284951114826634,0.9221892864704637,0.4624736472241743,0.337184855698226,0.9866395307075302,0.9818897503814326,0.8520208370756243
|
| 3 |
+
MERaLiON-AudioLLM-v2-2b,0.08547244456711749,0.13853008043879414,0.14196485284776625,0.1432185523541813,0.17842684134623737,0.19968394588770502,0.16825573283269715,0.014873360876594216
|
| 4 |
+
MERaLiON-AudioLLM-v2-9b,0.11334989419449812,0.15591770571023683,0.15646834639000634,0.16085734364019677,0.1722759890883186,0.20004788698671136,0.11314793912959634,0.018681516076881625
|
| 5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.07921611923820039,0.12871226564172622,0.1423883125132331,0.1383345045678145,0.16282383194620612,0.18238237758889023,0.09499798648962901,0.010670019759295851
|
| 6 |
+
Qwen2.5-Omni-3B,0.13731714049130556,1.0276387288835422,0.2463476603853483,1.3477160927617708,0.3110002953799107,0.4670274152998923,0.19581530154444754,0.4822705227231902
|
| 7 |
+
Qwen2.5-Omni-7B,0.18235348238108381,1.0684188526512177,0.22041075587550285,1.2090302178496135,0.26146334682814104,0.2936956781994493,0.22408385278119664,0.0984012933357284
|
| 8 |
+
SALMONN_7B,1.1888858220627472,1.4272941368377052,1.496294727927165,1.507519325368939,2.1181172136986777,1.2470441757452413,1.5460526688938172,1.1351535836177475
|
| 9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.09977918851780293,0.23805397249380653,0.1567859411391065,0.2724525650035137,0.2191718937327333,0.276058900993655,0.17136958408249153,0.06815160768816239
|
| 10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.07815806421933941,0.24404355317218387,0.11676900275248782,0.28397751229796203,0.1926224523482703,0.20872022028013887,0.15538061017872032,0.031794503323154304
|
| 11 |
+
hy_whisper_local_cs,0.10267733922163952,0.31793713743921215,0.1681134871903451,0.33113141250878425,0.21382030476256667,0.26486292350053875,0.1781020821398794,0.076019400035926
|
| 12 |
+
phi_4_multimodal_instruct,1.327169012788665,1.1784589191228196,1.1070294304467498,1.7016514406184118,5.803850364012302,1.7344522925894887,2.5042567310800923,1.2856834920064666
|
results_organized/wer/asr_singlish.csv
CHANGED
|
@@ -1,12 +1,18 @@
|
|
| 1 |
Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
|
| 2 |
-
Qwen-Audio-Chat,0.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
Qwen2-Audio-7B-Instruct,0.07197717796796138,0.1905689473257041,0.35076166942732234,0.5613424034000176,0.27856006770658537,0.2245352799625317
|
| 6 |
-
whisper_large_v3,0.06844171360300393,0.3171008846684522,0.27026366524560785,0.4618189591218298,0.2143555471246589,0.1698509342851144
|
| 7 |
old_models,,,,,,
|
| 8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07041669714480775,0.32988393799204613,0.3035544573275043,0.4779640131272869,0.22881615619208825,0.1789273082575623
|
| 9 |
gemini-1.5-flash,,,,,,
|
| 10 |
-
WavLLM_fairseq,0.
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
|
| 2 |
+
Qwen-Audio-Chat,0.1055031331529027,0.4547926304683061,0.6412550574306894,1.173131813552289,0.3016882870525747,0.3139424086306303
|
| 3 |
+
Qwen2-Audio-7B-Instruct,0.0719771779679613,0.1905689473257041,0.3507616694273223,0.5613424034000176,0.2785600677065853,0.2245352799625317
|
| 4 |
+
whisper_large_v3,0.0684417136030039,0.3171008846684522,0.2702636652456078,0.4618189591218298,0.2143555471246589,0.1698509342851144
|
|
|
|
|
|
|
| 5 |
old_models,,,,,,
|
|
|
|
| 6 |
gemini-1.5-flash,,,,,,
|
| 7 |
+
WavLLM_fairseq,0.1007729256577182,0.4463923382842302,0.7540934640345399,1.143645714142011,0.3979658840524726,0.4254106170965293
|
| 8 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.04303513520103382,0.0473581689797906,0.21299589974746788,0.29660878421707804,0.15406166552363165,0.1087388362215152
|
| 9 |
+
MERaLiON-AudioLLM-v2-2b,0.049057615877892376,0.05819332846359873,0.26414044043772233,0.3595795244502006,0.20202536078562985,0.1493725673864242
|
| 10 |
+
MERaLiON-AudioLLM-v2-9b,0.051959134908443665,0.14532099667234802,0.22654574089662477,0.2948987161915779,0.16760298259181977,0.12655243140231592
|
| 11 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.04362031550971643,0.054094635175716256,0.19622831075026476,0.24570911239925058,0.1403598371539887,0.0989680065892537
|
| 12 |
+
Qwen2.5-Omni-3B,0.04657059956599127,0.11265319373427482,0.49541097564287073,1.0728162054093475,0.273861464154908,0.17795830036014793
|
| 13 |
+
Qwen2.5-Omni-7B,0.04854558310779509,0.12052593133674215,0.6256143590300595,1.1316375158747123,0.34107192365498823,0.36374941455772863
|
| 14 |
+
SALMONN_7B,0.09275107892619414,0.45783621459297136,0.681280039101746,0.7865181254636674,0.37533379054734356,0.25522053004731987
|
| 15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07053860970911661,0.3298433568703839,0.2810437993863198,0.4594298934979693,0.21829536997854984,0.17514817745764627
|
| 16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.2770250088250468,0.4581096203900464,0.21391778902978215,0.1722411537654032
|
| 17 |
+
hy_whisper_local_cs,0.06692999780557385,0.2735167600032465,0.25580416542210876,0.3612895924757007,0.186411988735025,0.14417222500363377
|
| 18 |
+
phi_4_multimodal_instruct,0.057615877892375586,0.3451018586153721,0.4381839411301491,1.4697028756805695,0.23859275364433613,0.1439784234241509
|
results_organized/wer/under_development_wer.csv
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
-
Model,
|
| 2 |
-
Qwen-Audio-Chat,0.
|
| 3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,0.
|
| 4 |
-
hy_whisper_local_cs,0.
|
| 5 |
-
Qwen2-Audio-7B-Instruct,0.
|
| 6 |
-
whisper_large_v3,0.
|
| 7 |
-
whisper_large_v2
|
| 8 |
-
old_models
|
| 9 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.
|
| 10 |
-
gemini-1.5-flash
|
| 11 |
-
WavLLM_fairseq,
|
| 12 |
-
SALMONN_7B,
|
| 13 |
-
cascade_whisper_large_v3_llama_3_8b_instruct,0.
|
| 14 |
-
Phi4-Multimodal-Instruct
|
|
|
|
| 1 |
+
Model,seame_dev_man,seame_dev_sge,ytb_asr_batch3_ms_ms_prompt,ytb_asr_batch3_zh_zh_prompt
|
| 2 |
+
Qwen-Audio-Chat,0.8783373786407767,1.05567969634822,2.899079022421131,
|
| 3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.388282092772384,0.3555052190149683,0.3031898556447721,0.2826921038685744
|
| 4 |
+
hy_whisper_local_cs,0.3134101941747573,0.3319966941136857,,
|
| 5 |
+
Qwen2-Audio-7B-Instruct,0.5522518878101402,0.5486546879304539,0.9981132903339036,
|
| 6 |
+
whisper_large_v3,0.7225930420711975,0.5377268970583734,0.237374402,0.2127821939586645
|
| 7 |
+
whisper_large_v2,,,,0.2802967673555909
|
| 8 |
+
old_models,,,,
|
| 9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.7824973031283711,0.5840399155162387,,
|
| 10 |
+
gemini-1.5-flash,0.9690871089536138,1.110043160182436,,
|
| 11 |
+
WavLLM_fairseq,1.2913969795037756,1.2204842511249197,,
|
| 12 |
+
SALMONN_7B,1.2721817691477886,1.0189782362484312,,
|
| 13 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.6848705501618123,0.507882090054792,,
|
| 14 |
+
Phi4-Multimodal-Instruct,,,,0.2153471118177
|