nithinraok committed
Commit d378233 · verified · 1 Parent(s): 06aadb5

Update to support zero GPU (#5)

- update model space (a7d8cee5fdad58415a5ac619a8666e61ac73d5ee)
- remove yt support (e6218d687975e14299f0c7a0560f668283b5cabc)

Files changed (3):
  1. app.py +14 -78
  2. nemo_align.py +7 -8
  3. requirements.txt +1 -1
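
This commit ports the Space to ZeroGPU: hardware is no longer held permanently, so CUDA work has to happen inside functions decorated with `@spaces.GPU`, which attach a pooled GPU only for the duration of the call. A minimal sketch of the pattern (the function and workload below are illustrative, not code from this commit):

```python
import spaces
import torch

@spaces.GPU  # a pooled GPU is attached only while this call runs
def gpu_work(x: torch.Tensor) -> torch.Tensor:
    # On a ZeroGPU Space, CUDA is available inside the decorated function.
    return (x.to("cuda") * 2).cpu()
```

Longer jobs can reserve extra time via the decorator's argument, e.g. `@spaces.GPU(duration=120)`.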
app.py CHANGED
@@ -2,7 +2,7 @@
 import subprocess
 import torch
 import gradio as gr
-import yt_dlp
+import spaces
 import pandas as pd
 from nemo.collections.asr.models import ASRModel
 from nemo_align import align_tdt_to_ctc_timestamps
@@ -12,6 +12,7 @@ import os
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 def process_audio(input_file, output_file):
+    gr.Info("Processing audio to single channel and sampling rate to 16000")
     command = [
         'sox', input_file,
         output_file,
@@ -20,6 +21,7 @@ def process_audio(input_file, output_file):
     ]
     try:
         subprocess.run(command, check=True)
+        gr.Info("Audio processed successfully")
         return output_file
     except:
         raise gr.Error("Failed to convert audio to single channel and sampling rate to 16000")
@@ -38,55 +40,8 @@ def get_dataframe_segments(segments):
     return df
 
 
-def get_video_info(url):
-    ydl_opts = {
-        'quiet': True,
-        'skip-download': True,
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        try:
-            info = ydl.extract_info(url, download=False)
-        except:
-            raise gr.Error("Failed to extract video info from Youtube")
-    return info
-
-def download_audio(url):
-    ydl_opts = {
-        'format': 'bestaudio/best,channels:1',
-        'quiet': True,
-        'outtmpl': 'audio_file',
-        'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
-            'preferredcodec': 'flac',
-            'preferredquality': '192',
-        }],
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        try:
-            ydl.download([url])
-        except yt_dlp.utils.DownloadError as err:
-            raise gr.Error(str(err))
-
-    return process_audio('audio_file.flac', 'processed_file.flac')
-
-
-def get_audio_from_youtube(url):
-    info = get_video_info(url)
-    duration = info.get('duration', 0)  # Duration in seconds
-    video_id = info.get('id', None)
-
-    html = f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-
-    if duration > 2*60*60:  # 2 hrs change later based on GPU
-        return gr.Error(str("For GPU {}, single pass maximum audio can be 2hrs"))
-    else:
-        return download_audio(url), html
-
-
 def get_transcripts(audio_path, model):
-    with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16):
+    with torch.amp.autocast(device, dtype=torch.bfloat16, enabled=True):
         with torch.inference_mode():
             text = model.transcribe(audio=[audio_path], )
     return text
@@ -101,31 +56,20 @@ def pick_asr_model():
 
 asr_model = pick_asr_model()
 
-def run_nemo_models(url, microphone, audio_path):
-    html = None
-    if url is None or len(url)<2:
-        path1 = microphone if microphone else audio_path
-    else:
-        gr.Info("Downloading and processing audio from Youtube")
-        path1, html = get_audio_from_youtube(url)
+@spaces.GPU
+def run_nemo_models(microphone, audio_path):
+    path1 = microphone if microphone else audio_path
+
+    new_path = process_audio(path1, "processed_audio.flac")
 
     gr.Info("Running NeMo Model")
-    text = get_transcripts(path1, asr_model)
+    text = get_transcripts(new_path, asr_model)
 
-    segments = align_tdt_to_ctc_timestamps(text, asr_model, path1)
+    segments = align_tdt_to_ctc_timestamps(text, asr_model, new_path)
 
     df = get_dataframe_segments(segments)
 
-    return df, html
-
-def clear_youtube_link():
-    # Remove .flac files in current directory
-    file_list = os.listdir()
-    for file in file_list:
-        if file.endswith(".flac"):
-            os.remove(file)
-
-    return None
+    return df
 
 
 # def run_speaker_diarization()
@@ -143,17 +87,12 @@ with gr.Blocks(
 ) as demo:
     gr.HTML("<h1 style='text-align: center'>Transcription with timestamps using Parakeet TDT-CTC</h1>")
     gr.Markdown('''
-    Choose between different sources of audio (Microphone, Audio File, Youtube Video) to transcribe along with timestamps.
+    Choose between different sources of audio (Microphone, Audio File) to transcribe along with timestamps.
 
    Parakeet models with limited attention are quite fast due to their limited attention mechanism. The current model with 1.1B parameters can transcribe very long audios upto 11 hrs on A6000 GPU in a single pass.
 
    Model used: [nvidia/parakeet-tdt_ctc-1.1b](https://huggingface.co/nvidia/parakeet-tdt_ctc-1.1b).
    ''')
-    # This block is for reading audio from MIC
-    with gr.Tab('Audio from Youtube'):
-        with gr.Row():
-            yt_link = gr.Textbox(value=None, label='Enter Youtube Link', type='text')
-            yt_render = gr.HTML()
 
     with gr.Tab('Audio From File'):
         file_input = gr.Audio(sources='upload', label='Upload Audio', type='filepath')
@@ -173,10 +112,7 @@ with gr.Blocks(
     time_stamp = gr.DataFrame(wrap=True, label='Speech Recognition with TimeStamps',
                               row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'text'])
 
-    # b1.click(run_nemo_models, inputs=[file_input, mic_input, yt_link], outputs=[text_output, yt_render])
-
-    b2.click(run_nemo_models, inputs=[yt_link, file_input, mic_input], outputs=[time_stamp, yt_render]).then(
-        clear_youtube_link, None, yt_link, queue=False) #here clean up passing None to audio.
+    b2.click(run_nemo_models, inputs=[file_input, mic_input], outputs=[time_stamp])
 
 demo.queue(True)
 demo.launch(share=True, debug=True)
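
The hunks above cut off the middle of the `sox` argument list, so the exact flags are not visible here. A minimal sketch of a mono, 16 kHz conversion consistent with the function's error message (the effect chain is an assumption, not the commit's literal command):

```python
import subprocess

def to_mono_16k(input_file: str, output_file: str) -> str:
    # Assumed sox effects: downmix to one channel, resample to 16000 Hz.
    command = ['sox', input_file, output_file, 'channels', '1', 'rate', '16000']
    subprocess.run(command, check=True)  # raises CalledProcessError on failure
    return output_file
```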
nemo_align.py CHANGED
@@ -6,7 +6,7 @@ from nemo.utils import logging
 from pathlib import Path
 from viterbi_decoding import viterbi_decoding
 from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
-
+import spaces
 BLANK_TOKEN = "<b>"
 
 SPACE_TOKEN = "<space>"
@@ -435,9 +435,9 @@ def get_start_end_for_segments(word_timestamps):
 
     return segment_timestamps
 
-
+@spaces.GPU
 def align_tdt_to_ctc_timestamps(tdt_txt, model, audio_filepath):
-    tdt_txt = tdt_txt[0][0] if tdt_txt is not None else tdt_txt
+    tdt_txt = tdt_txt[0].text if tdt_txt is not None else tdt_txt
     if isinstance(model, EncDecHybridRNNTCTCModel):
         ctc_cfg = CTCDecodingConfig()
         ctc_cfg.decoding = "greedy_batch"
@@ -445,12 +445,11 @@ def align_tdt_to_ctc_timestamps(tdt_txt, model, audio_filepath):
     else:
         raise ValueError("Currently supporting hybrid models")
 
-    if torch.cuda.is_available():
-        viterbi_device = torch.device('cuda')
-    else:
-        viterbi_device = torch.device('cpu')
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    viterbi_device = torch.device(device)
 
-    with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16):
+    with torch.amp.autocast(device_type=device, dtype=torch.bfloat16, enabled=True):
         with torch.inference_mode():
             hypotheses = model.transcribe([audio_filepath], return_hypotheses=True, batch_size=1)
 
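Two API shifts drive these hunks: `torch.cuda.amp.autocast` is deprecated in favor of the device-agnostic `torch.amp.autocast(device_type=...)`, and newer NeMo `transcribe()` calls return `Hypothesis` objects, hence `tdt_txt[0][0]` becoming `tdt_txt[0].text`. Note the autocast flag also flips from `enabled=False` to `enabled=True`, so bfloat16 mixed precision is actually active after this change. A minimal sketch of the new spelling (the matmul is a stand-in workload, not the commit's model call):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Device-agnostic replacement for the deprecated torch.cuda.amp.autocast
with torch.amp.autocast(device_type=device, dtype=torch.bfloat16, enabled=True):
    with torch.inference_mode():
        y = torch.ones(8, 8, device=device) @ torch.ones(8, 8, device=device)

print(y.dtype)  # torch.bfloat16: matmuls run in reduced precision under autocast
```
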
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 Cython
 packaging
-git+https://github.com/NVIDIA/NeMo.git@r2.0.0#egg=nemo_toolkit[asr]
+git+https://github.com/NVIDIA/NeMo.git@r2.5.3#egg=nemo_toolkit[asr]
 yt_dlp
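
A quick sanity check that the bumped pin resolved in the Space's environment (assuming the requirement installed cleanly; `__version__` is NeMo's standard version attribute):

```python
import nemo
print(nemo.__version__)  # expected to report a 2.5.x build for the r2.5.3 pin
```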