Spaces:

inesc-id
/

CAMOES-ASR-DEMO

Sleeping

File size: 1,627 Bytes

220065d
101cb32
220065d
101cb32
220065d
101cb32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c7cd09
1b12a9c
101cb32
 
3c7cd09
101cb32
 
 
 
 
3c7cd09
101cb32
1b12a9c
101cb32
 
 
 
 
 
220065d
 
3c7cd09
 
 
1b12a9c
37f1adc
2e1ef95
37f1adc
21677f0
37f1adc
220065d
 
101cb32

import gradio as gr
import whisperx

# -----------------------------

# Device and compute settings

# -----------------------------

# Free-tier Spaces only have CPU, and float16 only works on GPU,
# so inference is configured for int8 on the CPU.
device = "cpu"
compute_type = "int8"

# -----------------------------
# Load WhisperX model
# -----------------------------

# Portuguese fine-tuned Whisper model; loaded once at import time and
# reused by every transcription request.
model_name = "inesc-id/WhisperLv3-EP-X"
model = whisperx.load_model(
    model_name,
    device=device,
    compute_type=compute_type,
    language="pt",
    task="transcribe",
)

# -----------------------------

# Transcription function

# -----------------------------

def transcribe(audio_file) -> str:
    """Transcribe a recorded or uploaded audio file to Portuguese text.

    Args:
        audio_file: Path to the audio file supplied by the Gradio audio
            input. A falsy value (``None`` or an empty string) means no
            audio was provided.

    Returns:
        The text of all transcribed segments joined by single spaces, or an
        empty string when no audio was provided or nothing was recognised.
    """
    # Nothing recorded or uploaded: return early instead of letting the
    # audio loader fail on a missing path.
    if not audio_file:
        return ""

    # Load audio and resample to 16 kHz
    audio = whisperx.load_audio(audio_file, sr=16000)

    result = model.transcribe(audio, batch_size=4, language="pt", task="transcribe")

    # Strip each segment before joining so whitespace padding around a
    # segment's text does not become leading or doubled spaces, and skip
    # segments that are empty after stripping.
    pieces = (segment["text"].strip() for segment in result["segments"])
    return " ".join(piece for piece in pieces if piece)


# -----------------------------

# Gradio interface

# -----------------------------

# Single-function UI: audio from the microphone or an upload is passed to
# `transcribe` as a file path and the returned text is displayed.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="CAMÕES European Portuguese Automatic Speech Recognition Demo",
    # The checkpoint named in the description must match `model_name`
    # (the model that is actually loaded).
    description="""
This is a demo for **CAMÕES**, a Whisper Model fine-tuned on around 420h of European Portuguese by the HLT lab at INESC-ID.

The model being used here is "WhisperLv3-EP-X". For more details about CAMÕES check out the [paper here](https://arxiv.org/abs/2508.19721).
""",
)

demo.launch()