Miamoto committed on
Commit 101cb32 · 1 Parent(s): 3c7cd09
Files changed (2)
  1. app.py +52 -21
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,35 +1,66 @@
- from transformers import WhisperProcessor, WhisperForConditionalGeneration
- import torch
  import gradio as gr
- import librosa

- processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
- model = WhisperForConditionalGeneration.from_pretrained("inesc-id/WhisperLv3-FT")

- def transcribe(audio):
-     # Load and resample audio to 16 kHz
-     speech, _ = librosa.load(audio, sr=16000)

-     # Split audio into 30s chunks
-     chunk_size = 30 * 16000
-     texts = []

-     for start in range(0, len(speech), chunk_size):
-         chunk = speech[start:start + chunk_size]
-         inputs = processor(chunk, return_tensors="pt")
-         predicted_ids = model.generate(**inputs)
-         text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-         texts.append(text)

-     # Combine all chunks
-     return " ".join(texts)

  demo = gr.Interface(
      fn=transcribe,
      inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
      outputs="text",
-     title="CAMÕES Whisper Demo",
-     description="Upload or record audio and get transcription. Supports files longer than 30 seconds."
  )

  demo.launch()
  import gradio as gr
+ import whisperx

+ # -----------------------------
+ # Device and compute settings
+ # -----------------------------
+ device = "cpu"          # Free-tier Spaces only have CPU
+ compute_type = "int8"   # float16 only works on GPU
+
+ # -----------------------------
+ # Load WhisperX model
+ # -----------------------------
+ model_name = "inesc-id/WhisperLv3-EP-X"  # Portuguese fine-tuned Whisper model
+ model = whisperx.load_model(
+     model_name,
+     device=device,
+     compute_type=compute_type,
+     language="pt",
+     task="transcribe"
+ )
+
+ # -----------------------------
+ # Transcription function
+ # -----------------------------
+ def transcribe(audio_file):
+     # Load audio and resample to 16 kHz
+     audio = whisperx.load_audio(audio_file, sr=16000)
+
+     # Transcribe
+     outputs = model.transcribe(audio, batch_size=4, language="pt", task="transcribe")
+
+     # Concatenate segments
+     if outputs['segments']:
+         text = " ".join(segment['text'] for segment in outputs['segments'])
+     else:
+         text = ""
+
+     return text
+
+ # -----------------------------
+ # Gradio interface
+ # -----------------------------

  demo = gr.Interface(
      fn=transcribe,
      inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
      outputs="text",
+     title="WhisperX Portuguese ASR Demo",
+     description="Upload or record audio and get transcription. Simple concatenated segments, no alignment."
  )

  demo.launch()
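For reference, the new WhisperX path can be smoke-tested locally without launching Gradio. This is a minimal sketch, assuming a local audio file named `sample.wav` (hypothetical) and the same model, device, and batch settings as in the updated `app.py`:

```python
# Minimal local check of the WhisperX pipeline used in app.py.
# "sample.wav" is a hypothetical local file; swap in any audio path.
import whisperx

device = "cpu"          # same free-tier constraint as the Space
compute_type = "int8"   # int8 keeps CPU inference tractable

model = whisperx.load_model(
    "inesc-id/WhisperLv3-EP-X",
    device=device,
    compute_type=compute_type,
    language="pt",
    task="transcribe",
)

audio = whisperx.load_audio("sample.wav")  # resampled to 16 kHz internally
result = model.transcribe(audio, batch_size=4, language="pt", task="transcribe")
print(" ".join(segment["text"].strip() for segment in result["segments"]))
```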
requirements.txt CHANGED
@@ -3,3 +3,5 @@ torch
  gradio
  librosa
  accelerate
+ whisperx
+ faster-whisper
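A small sanity check for the updated requirements, printing the resolved versions of the two newly added packages (a sketch; the distribution names are taken from `requirements.txt`, and this does not verify that the model itself loads):

```python
# Print installed versions of the dependencies added in this commit.
from importlib.metadata import PackageNotFoundError, version

for dist in ("whisperx", "faster-whisper"):
    try:
        print(f"{dist}=={version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```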