File size: 3,433 Bytes
39d9406
b71a3ad
39d9406
5c395b2
b71a3ad
 
02c9b64
b71a3ad
 
5bebd85
 
b71a3ad
 
5bebd85
 
b71a3ad
1c1b97a
5bebd85
 
5c395b2
5bebd85
 
 
 
 
 
b71a3ad
 
 
 
 
5bebd85
b71a3ad
 
 
 
 
0fea237
b71a3ad
 
 
 
 
 
39d9406
 
b71a3ad
5bebd85
 
 
 
 
 
 
 
 
b71a3ad
55d79e2
5bebd85
39d9406
 
 
 
 
 
 
 
 
 
 
 
b71a3ad
39d9406
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from functools import partial
from os import path, unlink
import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio

def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
    """Transcribe audio to text using Hugging Face Inference API.
    
    This function converts speech audio into text transcription. The audio is
    resampled to match the model's expected sample rate, saved to a temporary
    file, and then sent to the Inference API for transcription.
    
    Args:
        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for automatic speech recognition.
        audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
            - bytes | np.ndarray: Raw audio data as bytes or numpy array
    
    Returns:
        String containing the transcribed text from the audio.
    
    Note:
        - Audio is automatically resampled to match the model's expected sample rate.
        - Audio is saved as a WAV file for InferenceClient compatibility.
        - Automatically cleans up temporary files after transcription.
        - Uses Inference API to offload model loading and inference to Hugging Face's
          infrastructure, which is more suitable for environments with limited GPU memory
          or time constraints (like Hugging Face Spaces with Zero GPU).
    """
    temp_file_path = None
    try:
        target_sample_rate = get_model_sample_rate(model)
        temp_file_path = save_audio_to_temp_file(target_sample_rate, audio)
        result = client.automatic_speech_recognition(temp_file_path, model=model)
        return result["text"]
    finally:
        if temp_file_path and path.exists(temp_file_path):
            try:
                unlink(temp_file_path)
            # Only file-removal failures (OSError) are best-effort; a bare
            # `except Exception` here would mask unrelated bugs in cleanup.
            except OSError:
                pass  # Ignore clean-up errors: a leftover temp file is not fatal.


def create_asr_tab(client: InferenceClient, model: str):
    """Build the automatic speech recognition tab of the Gradio interface.
    
    Renders, in on-screen order:
    - a textbox for an audio URL and a button that fetches the file,
    - an audio component for uploading, recording, or receiving fetched audio,
    - a transcribe button and a textbox showing the transcription.
    
    Args:
        client: Hugging Face InferenceClient instance forwarded to the
            automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
    """
    gr.Markdown("Transcribe audio to text.")

    # Components are instantiated in the order they should appear on screen;
    # event wiring below does not affect layout.
    url_box = gr.Textbox(label="Audio URL")
    fetch_button = gr.Button("Get Audio")
    audio_widget = gr.Audio(label="Audio")
    fetch_button.click(
        fn=request_audio,
        inputs=url_box,
        outputs=audio_widget
    )

    transcribe_button = gr.Button("Transcribe")
    transcript_box = gr.Textbox(label="Text")
    transcribe_button.click(
        fn=partial(automatic_speech_recognition, client, model),
        inputs=audio_widget,
        outputs=transcript_box
    )