Switch the automatic speech recognition (ASR) implementation to use the inference client instead
- app.py +1 -1
- automatic_speech_recognition.py +17 -13
- utils.py +20 -4
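At its core, the commit swaps the previous in-Space transcription path (the old module imported a spaces_gpu decorator and resample_audio from utils) for a remote call through huggingface_hub's InferenceClient. A minimal sketch of that call, assuming default client configuration and a placeholder model id rather than the real AUDIO_TRANSCRIPTION_MODEL value:

from huggingface_hub import InferenceClient

# Placeholder model id; the app reads the real one from AUDIO_TRANSCRIPTION_MODEL.
client = InferenceClient()
result = client.automatic_speech_recognition("sample.wav", model="openai/whisper-large-v3")
print(result["text"])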
app.py CHANGED
@@ -80,7 +80,7 @@ class App:
             audio_transcription_generate_button = gr.Button("Transcribe")
             audio_transcription_output = gr.Textbox(label="Text")
             audio_transcription_generate_button.click(
-                fn=automatic_speech_recognition,
+                fn=partial(automatic_speech_recognition, self.client),
                 inputs=audio_transcription_audio_input,
                 outputs=audio_transcription_output
             )
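The handler now takes the client as its first parameter, so functools.partial pre-binds it and Gradio keeps supplying only the audio input. A minimal sketch of that wiring, assuming the shared client is an InferenceClient built roughly like this (the token env var name is a guess, not taken from this commit):

from functools import partial
from os import getenv

from huggingface_hub import InferenceClient

from automatic_speech_recognition import automatic_speech_recognition

# Assumed construction of the shared client; the actual App may configure it differently.
client = InferenceClient(token=getenv("HF_TOKEN"))

# partial() fixes the first positional argument, so the resulting callable
# matches Gradio's expectation of fn(audio) -> str.
transcribe = partial(automatic_speech_recognition, client)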
automatic_speech_recognition.py CHANGED
@@ -1,14 +1,18 @@
-import …
-from os import getenv
-from …
-from utils import spaces_gpu, resample_audio
-…
+from huggingface_hub import InferenceClient
+from os import getenv, path, unlink
+from utils import save_audio_to_temp_file, get_model_sample_rate
+
+def automatic_speech_recognition(client: InferenceClient, audio: tuple[int, bytes]) -> str:
+    temp_file_path = None
+    try:
+        model_id = getenv("AUDIO_TRANSCRIPTION_MODEL")
+        sample_rate = get_model_sample_rate(model_id)
+        temp_file_path = save_audio_to_temp_file(sample_rate, audio)
+        result = client.automatic_speech_recognition(temp_file_path, model=model_id)
+        return result["text"]
+    finally:
+        if temp_file_path and path.exists(temp_file_path):  # Clean up temporary file.
+            try:
+                unlink(temp_file_path)
+            except Exception:
+                pass  # Ignore clean-up errors.
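For a quick local check of the new function, something like the sketch below should work, assuming AUDIO_TRANSCRIPTION_MODEL is set and the client can reach the Inference API; the silent clip is only a stand-in for real Gradio microphone input:

import numpy as np
from huggingface_hub import InferenceClient

from automatic_speech_recognition import automatic_speech_recognition

client = InferenceClient()
# gr.Audio(type="numpy") hands the handler a (sample_rate, samples) tuple like this one.
sample_rate = 16000
samples = np.zeros(sample_rate, dtype=np.float32)  # one second of silence
print(automatic_speech_recognition(client, (sample_rate, samples)))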
utils.py CHANGED
@@ -1,13 +1,14 @@
 import gradio as gr
 from io import BytesIO
-…
+import librosa
+import numpy as np
 from os import getenv
+from PIL.Image import Image, open as open_image
+import soundfile as sf
 import requests
 from tempfile import NamedTemporaryFile
 import torch
-…
-import soundfile as sf
-import librosa
+from transformers import AutoProcessor


 # Try to import spaces decorator (for Hugging Face Spaces), otherwise use no-op decorator.
@@ -45,6 +46,13 @@ def save_image_to_temp_file(image: Image) -> str:
     image.save(temp_path, format=image_format)
     return temp_path

+def get_model_sample_rate(model_id: str) -> int:
+    try:
+        processor = AutoProcessor.from_pretrained(model_id)
+        return processor.feature_extractor.sampling_rate
+    except Exception:
+        return 16000  # Fallback value as most ASR models use 16kHz
+
 def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> np.ndarray:
     sample_rate, audio_data = audio

@@ -61,3 +69,11 @@ def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> np.ndarray:
     audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_sample_rate)

     return audio_array
+
+def save_audio_to_temp_file(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> str:
+    audio_array = resample_audio(target_sample_rate, audio)
+    temp_file = NamedTemporaryFile(delete=False, suffix='.wav')
+    temp_path = temp_file.name
+    temp_file.close()
+    sf.write(temp_path, audio_array, target_sample_rate, format='WAV')
+    return temp_path
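The two new helpers are meant to be used together: get_model_sample_rate asks the model's processor which rate it expects (falling back to 16 kHz), and save_audio_to_temp_file resamples the Gradio audio tuple and writes it to a throwaway WAV that the caller deletes afterwards. A rough usage sketch, with "openai/whisper-large-v3" standing in for whatever AUDIO_TRANSCRIPTION_MODEL points to:

import numpy as np
from os import unlink

from utils import get_model_sample_rate, save_audio_to_temp_file

model_id = "openai/whisper-large-v3"  # placeholder for AUDIO_TRANSCRIPTION_MODEL
target_rate = get_model_sample_rate(model_id)  # 16000 for Whisper-style processors

# Fake 44.1 kHz microphone capture in the (sample_rate, samples) shape Gradio produces.
audio = (44100, np.random.uniform(-1.0, 1.0, 44100).astype(np.float32))
wav_path = save_audio_to_temp_file(target_rate, audio)

print(target_rate, wav_path)
unlink(wav_path)  # NamedTemporaryFile(delete=False) leaves clean-up to the caller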