import subprocess

import gradio as gr
import pandas as pd
import spaces
import torch
from nemo.collections.asr.models import ASRModel

from nemo_align import align_tdt_to_ctc_timestamps  # local helper module in this Space

device = "cuda" if torch.cuda.is_available() else "cpu"

def process_audio(input_file, output_file):
    """Convert the input audio to mono at a 16 kHz sampling rate with sox."""
    gr.Info("Converting audio to mono with a 16 kHz sampling rate")
    command = [
        'sox', input_file,
        output_file,
        'channels', '1',
        'rate', '16000',
    ]
    try:
        subprocess.run(command, check=True)
        gr.Info("Audio processed successfully")
        return output_file
    except (subprocess.CalledProcessError, FileNotFoundError):
        raise gr.Error("Failed to convert audio to mono with a 16 kHz sampling rate")
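
# process_audio shells out to the sox CLI, so sox must be installed on the host
# (for a Hugging Face Space, e.g. by listing `sox` in packages.txt).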

def get_dataframe_segments(segments):
    """Collect (text, start_time, end_time) segments into a DataFrame."""
    df = pd.DataFrame(columns=['start_time', 'end_time', 'text'])
    if len(segments) == 0:
        df.loc[0] = 0, 0, ''
        return df
    for segment in segments:
        text, start_time, end_time = segment
        if len(text) > 0:
            df.loc[len(df)] = round(start_time, 2), round(end_time, 2), text
    return df
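
# Example with hypothetical values, assuming segments are (text, start, end) tuples
# as produced by align_tdt_to_ctc_timestamps:
#   get_dataframe_segments([("hello world", 0.0, 1.2345)])
#   -> one row: start_time=0.0, end_time=1.23, text="hello world"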

def get_transcripts(audio_path, model):
    """Transcribe one audio file under bfloat16 autocast, without autograd."""
    with torch.amp.autocast(device, dtype=torch.bfloat16, enabled=True):
        with torch.inference_mode():
            text = model.transcribe(audio=[audio_path])
    return text

def pick_asr_model():
    """Load the hybrid Parakeet TDT-CTC 1.1B model and switch to greedy batched decoding."""
    model = 'nvidia/parakeet-tdt_ctc-1.1b'
    asr_model = ASRModel.from_pretrained(model).to(device)
    asr_model.cfg.decoding.strategy = "greedy_batch"
    asr_model.change_decoding_strategy(asr_model.cfg.decoding)
    asr_model.eval()
    return asr_model
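
# "greedy_batch" is NeMo's batched greedy decoder; it produces equivalent
# transcripts to plain "greedy" decoding while running substantially faster.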

asr_model = pick_asr_model()

# On a ZeroGPU Space, a GPU is attached only while functions decorated
# with @spaces.GPU are executing.
@spaces.GPU
def run_nemo_models(file_path, mic_path):
    """Transcribe the uploaded file (preferred) or the microphone recording."""
    path = file_path if file_path else mic_path
    new_path = process_audio(path, "processed_audio.flac")
    gr.Info("Running NeMo model")
    text = get_transcripts(new_path, asr_model)
    segments = align_tdt_to_ctc_timestamps(text, asr_model, new_path)
    df = get_dataframe_segments(segments)
    return df
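
# End-to-end flow: sox normalizes the audio, the TDT branch produces the
# transcript, and the nemo_align helper aligns it against the model's CTC
# output to recover segment-level timestamps.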

# def run_speaker_diarization()

with gr.Blocks(
    title="NeMo Parakeet Model",
    css="""
    textarea { font-size: 18px; }
    #model_output_text_box span {
        font-size: 18px;
        font-weight: bold;
    }
    """,
    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg),  # make text slightly bigger (default is text_md)
) as demo:
    gr.HTML("<h1 style='text-align: center'>Transcription with timestamps using Parakeet TDT-CTC</h1>")
    gr.Markdown('''
    Choose a source of audio (microphone or uploaded file) to transcribe along with timestamps.
    Parakeet models are fast thanks to their limited-context attention mechanism: the current 1.1B-parameter model can transcribe very long audio, up to 11 hours on an A6000 GPU, in a single pass.
    Model used: [nvidia/parakeet-tdt_ctc-1.1b](https://huggingface.co/nvidia/parakeet-tdt_ctc-1.1b).
    ''')
    with gr.Tab('Audio From File'):
        file_input = gr.Audio(sources='upload', label='Upload Audio', type='filepath')
    with gr.Tab('Audio From Microphone'):
        mic_input = gr.Audio(sources='microphone', label='Record Audio', type='filepath')

    # b1 = gr.Button("Get Transcription with Punctuation and Capitalization")
    gr.Markdown('''Speech Recognition''')
    # text_output = gr.Textbox(label='Transcription', type='text')
    b2 = gr.Button("Get timestamps with text")
    time_stamp = gr.DataFrame(wrap=True, label='Speech Recognition with TimeStamps',
                              row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'text'])

    b2.click(run_nemo_models, inputs=[file_input, mic_input], outputs=[time_stamp])

demo.queue()
demo.launch(share=True, debug=True)