LiKenun committed on
Commit
0fea237
·
1 Parent(s): 02c9b64

Switch the automatic speech recognition (ASR) implementation to use the inference client instead

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. automatic_speech_recognition.py +17 -13
  3. utils.py +20 -4
app.py CHANGED
@@ -80,7 +80,7 @@ class App:
80
  audio_transcription_generate_button = gr.Button("Transcribe")
81
  audio_transcription_output = gr.Textbox(label="Text")
82
  audio_transcription_generate_button.click(
83
- fn=automatic_speech_recognition,
84
  inputs=audio_transcription_audio_input,
85
  outputs=audio_transcription_output
86
  )
 
80
  audio_transcription_generate_button = gr.Button("Transcribe")
81
  audio_transcription_output = gr.Textbox(label="Text")
82
  audio_transcription_generate_button.click(
83
+ fn=partial(automatic_speech_recognition, self.client),
84
  inputs=audio_transcription_audio_input,
85
  outputs=audio_transcription_output
86
  )
automatic_speech_recognition.py CHANGED
@@ -1,14 +1,18 @@
1
- import gc
2
- from os import getenv
3
- from transformers import pipeline
4
- from utils import spaces_gpu, resample_audio
5
 
6
-
7
- @spaces_gpu
8
- def automatic_speech_recognition(audio: tuple[int, bytes]) -> str:
9
- asr = pipeline(task="automatic-speech-recognition", model=getenv("AUDIO_TRANSCRIPTION_MODEL"))
10
- audio_array = resample_audio(asr.feature_extractor.sampling_rate, audio)
11
- result = asr(audio_array)
12
- del asr
13
- gc.collect()
14
- return result["text"]
 
 
 
 
 
 
1
+ from huggingface_hub import InferenceClient
2
+ from os import getenv, path, unlink
3
+ from utils import save_audio_to_temp_file, get_model_sample_rate
 
4
 
5
def automatic_speech_recognition(client: InferenceClient, audio: tuple[int, bytes]) -> str:
    """Transcribe audio to text using the Hugging Face inference client.

    Args:
        client: Inference client used to call the hosted ASR endpoint.
        audio: ``(sample_rate, data)`` pair as produced by the Gradio audio input.

    Returns:
        The transcribed text from the model's response.
    """
    temp_file_path = None
    try:
        model_id = getenv("AUDIO_TRANSCRIPTION_MODEL")
        # Resample to the model's expected rate and write a WAV file the client can upload.
        sample_rate = get_model_sample_rate(model_id)
        temp_file_path = save_audio_to_temp_file(sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model_id)
        return result["text"]
    finally:
        if temp_file_path and path.exists(temp_file_path):  # Clean up temporary file.
            try:
                unlink(temp_file_path)
            except OSError:
                # Best-effort cleanup: ignore filesystem errors only, so that
                # programming errors (e.g. NameError) are not silently swallowed.
                pass
utils.py CHANGED
@@ -1,13 +1,14 @@
1
  import gradio as gr
2
  from io import BytesIO
3
- from PIL.Image import Image, open as open_image
 
4
  from os import getenv
 
 
5
  import requests
6
  from tempfile import NamedTemporaryFile
7
  import torch
8
- import numpy as np
9
- import soundfile as sf
10
- import librosa
11
 
12
 
13
  # Try to import spaces decorator (for Hugging Face Spaces), otherwise use no-op decorator.
@@ -45,6 +46,13 @@ def save_image_to_temp_file(image: Image) -> str:
45
  image.save(temp_path, format=image_format)
46
  return temp_path
47
 
 
 
 
 
 
 
 
48
  def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> np.ndarray:
49
  sample_rate, audio_data = audio
50
 
@@ -61,3 +69,11 @@ def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray
61
  audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_sample_rate)
62
 
63
  return audio_array
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from io import BytesIO
3
+ import librosa
4
+ import numpy as np
5
  from os import getenv
6
+ from PIL.Image import Image, open as open_image
7
+ import soundfile as sf
8
  import requests
9
  from tempfile import NamedTemporaryFile
10
  import torch
11
+ from transformers import AutoProcessor
 
 
12
 
13
 
14
  # Try to import spaces decorator (for Hugging Face Spaces), otherwise use no-op decorator.
 
46
  image.save(temp_path, format=image_format)
47
  return temp_path
48
 
49
def get_model_sample_rate(model_id: str) -> int:
    """Return the sampling rate the model's feature extractor expects.

    Falls back to 16 kHz when the processor cannot be loaded or inspected,
    since that is the de-facto standard rate for ASR models.
    """
    try:
        return AutoProcessor.from_pretrained(model_id).feature_extractor.sampling_rate
    except Exception:
        return 16000  # Most ASR models operate at 16 kHz.
55
+
56
  def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> np.ndarray:
57
  sample_rate, audio_data = audio
58
 
 
69
  audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_sample_rate)
70
 
71
  return audio_array
72
+
73
+ def save_audio_to_temp_file(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> str:
74
+ audio_array = resample_audio(target_sample_rate, audio)
75
+ temp_file = NamedTemporaryFile(delete=False, suffix='.wav')
76
+ temp_path = temp_file.name
77
+ temp_file.close()
78
+ sf.write(temp_path, audio_array, target_sample_rate, format='WAV')
79
+ return temp_path