Miamoto committed on
Commit 101cb32 · 1 Parent(s): 3c7cd09
Files changed (2)
  1. app.py +52 -21
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,35 +1,66 @@
- from transformers import WhisperProcessor, WhisperForConditionalGeneration
- import torch
  import gradio as gr
- import librosa

- processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
- model = WhisperForConditionalGeneration.from_pretrained("inesc-id/WhisperLv3-FT")

- def transcribe(audio):
-     # Load and resample audio to 16 kHz
-     speech, _ = librosa.load(audio, sr=16000)

-     # Split audio into 30s chunks
-     chunk_size = 30 * 16000
-     texts = []

-     for start in range(0, len(speech), chunk_size):
-         chunk = speech[start:start + chunk_size]
-         inputs = processor(chunk, return_tensors="pt")
-         predicted_ids = model.generate(**inputs)
-         text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-         texts.append(text)

-     # Combine all chunks
-     return " ".join(texts)

  demo = gr.Interface(
      fn=transcribe,
      inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
      outputs="text",
-     title="CAMÕES Whisper Demo",
-     description="Upload or record audio and get transcription. Supports files longer than 30 seconds."
  )

  demo.launch()
  import gradio as gr
+ import whisperx

+ # -----------------------------
+ # Device and compute settings
+ # -----------------------------
+ device = "cpu"          # Free-tier Spaces only have CPU
+ compute_type = "int8"   # float16 only works on GPU
+
+ # -----------------------------
+ # Load WhisperX model
+ # -----------------------------
+ model_name = "inesc-id/WhisperLv3-EP-X"  # Portuguese fine-tuned Whisper model
+ model = whisperx.load_model(
+     model_name,
+     device=device,
+     compute_type=compute_type,
+     language="pt",
+     task="transcribe"
+ )
+
+ # -----------------------------
+ # Transcription function
+ # -----------------------------
+ def transcribe(audio_file):
+     # Load audio and resample to 16 kHz
+     audio = whisperx.load_audio(audio_file, sr=16000)
+
+     # Transcribe
+     outputs = model.transcribe(audio, batch_size=4, language="pt", task="transcribe")
+
+     # Concatenate segments
+     if outputs['segments']:
+         text = " ".join(segment['text'] for segment in outputs['segments'])
+     else:
+         text = ""
+
+     return text
+
+ # -----------------------------
+ # Gradio interface
+ # -----------------------------

  demo = gr.Interface(
      fn=transcribe,
      inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
      outputs="text",
+     title="WhisperX Portuguese ASR Demo",
+     description="Upload or record audio and get transcription. Simple concatenated segments, no alignment."
  )

  demo.launch()
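For reference, the new WhisperX path can be smoke-tested locally without launching Gradio. This is a minimal sketch, assuming a local audio file named `sample.wav` (hypothetical) and the same model, device, and batch settings as in the updated `app.py`:

```python
# Minimal local check of the WhisperX pipeline used in app.py.
# "sample.wav" is a hypothetical local file; swap in any audio path.
import whisperx

device = "cpu"          # same free-tier constraint as the Space
compute_type = "int8"   # int8 keeps CPU inference tractable

model = whisperx.load_model(
    "inesc-id/WhisperLv3-EP-X",
    device=device,
    compute_type=compute_type,
    language="pt",
    task="transcribe",
)

audio = whisperx.load_audio("sample.wav")  # resampled to 16 kHz internally
result = model.transcribe(audio, batch_size=4, language="pt", task="transcribe")
print(" ".join(segment["text"].strip() for segment in result["segments"]))
```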
requirements.txt CHANGED
@@ -3,3 +3,5 @@ torch
  gradio
  librosa
  accelerate
+ whisperx
+ faster-whisper
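A small sanity check for the updated requirements, printing the resolved versions of the two newly added packages (a sketch; the distribution names are taken from `requirements.txt`, and this does not verify that the model itself loads):

```python
# Print installed versions of the dependencies added in this commit.
from importlib.metadata import PackageNotFoundError, version

for dist in ("whisperx", "faster-whisper"):
    try:
        print(f"{dist}=={version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```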