updated the app.py to also load the facebook/mms/tgl model (#2)
Browse files · updated the app.py to also load the facebook/mms/tgl model (71bd426e5372e763be1b5a6ee4b203bba0cb226f)
app.py
CHANGED

@@ -4,9 +4,14 @@ import io
 import logging
 from fastapi import FastAPI, HTTPException, status
 from pydantic import BaseModel
+from fastapi.responses import StreamingResponse
+
+# --- Library Imports ---
+# For NeMo models
 from nemo.collections.tts.models import FastPitchModel, HifiGanModel
-
-# …
+from nemo.collections.tts.torch.tts_tokenizers import BaseCharsTokenizer
+# For Transformers MMS-TTS model
+from transformers import AutoTokenizer, AutoModelForTextToWaveform
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
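
For reference, the new transformers imports follow the published MMS-TTS usage pattern. A minimal standalone sanity check of the Tagalog checkpoint, per the facebook/mms-tts-tgl model card (the sample sentence is an assumption):

import torch
from transformers import VitsModel, AutoTokenizer

# MMS-TTS checkpoints are VITS models; the tokenizer handles text normalization.
model = VitsModel.from_pretrained("facebook/mms-tts-tgl")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-tgl")

inputs = tokenizer("Magandang umaga.", return_tensors="pt")  # hypothetical sample text
with torch.no_grad():
    waveform = model(**inputs).waveform  # (batch, samples) float tensor

print(model.config.sampling_rate)  # 16000 Hz for MMS-TTS checkpoints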

@@ -14,8 +19,8 @@ logger = logging.getLogger(__name__)
 
 # --- 1. Initialize FastAPI App ---
 app = FastAPI(
-    title="…",
-    description="A backend service to convert text to speech in English and Bikol.",
+    title="Multilingual TTS API",
+    description="A backend service to convert text to speech in English, Bikol, and Tagalog.",
 )
 
 # --- 2. Load Models on Startup ---

@@ -23,44 +28,50 @@ models = {}
 
 @app.on_event("startup")
 def load_models():
-    """Load all …"""
+    """Load all models into memory when the application starts."""
     logger.info("Loading models...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     try:
-        # …
+        # --- NeMo Models ---
         logger.info("Loading HiFi-GAN vocoder...")
         models['hifigan'] = HifiGanModel.restore_from("models/hifigan_en.nemo").to(device)
         models['hifigan'].eval()
-        logger.info("HiFi-GAN loaded successfully")
 
-        # Load the English Spectrogram Generator
         logger.info("Loading English FastPitch model...")
         models['en'] = FastPitchModel.restore_from("models/fastpitch_en.nemo").to(device)
         models['en'].eval()
-        logger.info("English model loaded successfully")
 
-        # Load the CORRECTED Bikol Spectrogram Generator
         logger.info("Loading Bikol FastPitch model...")
-        # This is the only line needed now. Replace the filename with your new .nemo file.
         models['bikol'] = FastPitchModel.restore_from("models/fastpitch_bikol_corrected.nemo").to(device)
+
+        logger.info("Overriding Bikol model tokenizer...")
+        BIKOL_CHARS = [
+            ' ', '!', ',', '-', '.', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
+            'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
+            'y', 'z', 'à', 'á', 'â', 'é', 'ì', 'í', 'î', 'ñ', 'ò', 'ó', 'ô', 'ú', '’'
+        ]
+        models['bikol'].tokenizer = BaseCharsTokenizer(chars=BIKOL_CHARS)
         models['bikol'].eval()
-
+
+        # --- Transformers MMS-TTS Model ---
+        logger.info("Loading Tagalog (tgl) MMS-TTS model from Hub...")
+        models['tgl_tokenizer'] = AutoTokenizer.from_pretrained("facebook/mms-tts-tgl")
+        models['tgl_model'] = AutoModelForTextToWaveform.from_pretrained("facebook/mms-tts-tgl").to(device)
 
     except Exception as e:
         logger.error(f"FATAL: Could not load models. Error: {e}")
         import traceback
         traceback.print_exc()
-
-        # raise e
+        raise e
 
-    logger.info("Model loading complete. …")
+    logger.info("Model loading complete.")
 
 
 # --- 3. Define API Request and Response Models ---
 class TTSRequest(BaseModel):
     text: str
-    language: str  # Should be 'en' or 'bikol'
+    language: str  # Should be 'en', 'bikol', or 'tgl'
 
 # --- 4. Define the TTS API Endpoint ---
 @app.post("/synthesize/")
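
The BIKOL_CHARS override pins the Bikol FastPitch tokenizer to an explicit character inventory, so any input character outside that list is a likely failure point. A quick pure-Python pre-flight check, assuming BIKOL_CHARS from the hunk above is in scope (the sample sentence is hypothetical):

# Flag characters the overridden Bikol tokenizer has no entry for.
sample = "marhay na aga saindo!"
unknown = sorted(set(sample) - set(BIKOL_CHARS))
print("characters outside BIKOL_CHARS:", unknown)  # expect []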

@@ -68,69 +79,50 @@ def synthesize_speech(request: TTSRequest):
     """
     Generates speech from text using the selected language model.
     """
-    if not models:
-        raise HTTPException(
-            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-            detail="Models are not loaded yet. Please try again in a moment."
-        )
+    if not models:
+        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Models are not loaded.")
 
-
-    if request.language not in ['en', 'bikol']:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid language specified. Use 'en' or 'bikol'.")
-    if request.language not in models:
-        available = [k for k in models.keys() if k != 'hifigan']
-        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"The '{request.language}' model is not available. Available languages: {', '.join(available)}")
+    lang = request.language.lower()
 
+    # Validate the requested language
+    valid_langs = ['en', 'bikol', 'tgl']
+    if lang not in valid_langs:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid language. Use one of {valid_langs}")
+
     try:
-
-        vocoder = models['hifigan']
+        logger.info(f"--- STARTING SYNTHESIS for '{lang}' ---")
 
-        …
-        # --- DEBUG STEP 2: Check the generated spectrogram ---
-        spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
-        if spectrogram is not None:
-            logger.info(f"2. Spectrogram generated with shape: {spectrogram.shape}")
-            logger.info(f"   Spectrogram stats: min={spectrogram.min()}, max={spectrogram.max()}, mean={spectrogram.mean()}")
-        else:
-            logger.error("2. FAILED: Spectrogram is None!")
-
-        # --- DEBUG STEP 3: Check the generated audio waveform ---
-        if spectrogram is not None:
+        # --- Logic for NeMo Models (English, Bikol) ---
+        if lang in ['en', 'bikol']:
+            sample_rate = 22050
+            spectrogram_generator = models[lang]
+            vocoder = models['hifigan']
+
+            with torch.no_grad():
+                parsed = spectrogram_generator.parse(request.text)
+                spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
                 audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
-        …
+
+            audio_numpy = audio.to('cpu').detach().numpy().squeeze()
+
+        # --- Logic for Transformers Model (Tagalog) ---
+        elif lang == 'tgl':
+            sample_rate = 16000  # MMS-TTS default sample rate is 16kHz
+            tokenizer = models['tgl_tokenizer']
+            model = models['tgl_model']
+
+            with torch.no_grad():
+                inputs = tokenizer(request.text, return_tensors="pt").to(device)
+                output = model.generate(**inputs)
+
+            audio_numpy = output.cpu().numpy().squeeze()
 
         # --- Prepare and return audio file ---
-        audio_numpy = audio.to('cpu').detach().numpy()
-
-        logger.info(f"4. Successfully converted to NumPy array.")
-
-        if len(audio_numpy.shape) > 1:
-            audio_numpy = audio_numpy.squeeze()
-
         buffer = io.BytesIO()
-        sf.write(buffer, audio_numpy, samplerate=22050)
+        sf.write(buffer, audio_numpy, samplerate=sample_rate, format='WAV')
         buffer.seek(0)
 
         logger.info(f"--- SYNTHESIS COMPLETE ---")
-
-        from fastapi.responses import StreamingResponse
         return StreamingResponse(buffer, media_type="audio/wav")
 
     except Exception as e:
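
Note that the 'tgl' branch references device at request time, while load_models() only assigns it as a local; the code assumes device is also resolvable at module scope. With the app running (e.g., uvicorn app:app), the endpoint can be exercised with a plain HTTP client. A sketch, where the base URL and sample text are assumptions and the route and JSON fields come from the diff above:

import requests

resp = requests.post(
    "http://127.0.0.1:8000/synthesize/",
    json={"text": "Magandang umaga.", "language": "tgl"},
)
resp.raise_for_status()

with open("out.wav", "wb") as f:
    f.write(resp.content)  # WAV payload: 16 kHz for 'tgl', 22.05 kHz for 'en'/'bikol'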

@@ -142,11 +134,12 @@ def synthesize_speech(request: TTSRequest):
 # --- 5. Add a Root Endpoint for Health Check ---
 @app.get("/")
 def read_root():
-    …
+    # Filter out tokenizer and non-spectrogram models for a cleaner list
+    available_languages = [k for k in models.keys() if '_model' not in k and k != 'hifigan']
     return {
-        "status": "…",
-        "available_languages": …,
-        "device": …,
+        "status": "Multilingual TTS Backend is running",
+        "available_languages": available_languages,
+        "device": device
     }
 
 # --- 6. Add Model Status Endpoint ---

@@ -155,7 +148,8 @@ def get_status():
     """Get the status of all loaded models."""
     return {
         "models_loaded": list(models.keys()),
-        "device": …,
+        "device": device,
         "english_available": 'en' in models,
-        "bikol_available": 'bikol' in models
+        "bikol_available": 'bikol' in models,
+        "tagalog_available": 'tgl_model' in models,
     }
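
The two read-only endpoints can be checked the same way. A sketch, assuming the same local base URL; the decorator for get_status() sits in context not shown in this diff, so the "/status" path is a guess:

import requests

base = "http://127.0.0.1:8000"
print(requests.get(f"{base}/").json())        # health check: status, languages, device
print(requests.get(f"{base}/status").json())  # assumed route for get_status()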