Spaces:
Sleeping
Sleeping
| """ | |
| Text-to-Speech Service with Multiple Fallback Options | |
| Provides speech synthesis with voice cloning capabilities and fallback voices. | |
| """ | |
| import os | |
| import time | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Dict, Any, Optional, Union | |
| import logging | |
| import numpy as np | |
| import soundfile as sf | |
| class TextToSpeechService: | |
| """TTS service with multiple backend options""" | |
| def __init__(self): | |
| self.logger = logging.getLogger(__name__) | |
| self.temp_dir = Path(tempfile.gettempdir()) / "speech_translation_tts" | |
| self.temp_dir.mkdir(exist_ok=True) | |
| # Available TTS engines in order of preference | |
| self.engines = [] | |
| self._initialize_engines() | |
| def _initialize_engines(self): | |
| """Initialize available TTS engines""" | |
| # Try to initialize TTS engines in order of preference | |
| # 1. Try gTTS (Google Text-to-Speech) - requires internet | |
| try: | |
| import gtts | |
| self.engines.append('gtts') | |
| self.logger.info("✅ gTTS (Google TTS) available") | |
| except ImportError: | |
| self.logger.warning("⚠️ gTTS not available") | |
| # 2. Try pyttsx3 (offline TTS) | |
| try: | |
| import pyttsx3 | |
| self.engines.append('pyttsx3') | |
| self.logger.info("✅ pyttsx3 (offline TTS) available") | |
| except ImportError: | |
| self.logger.warning("⚠️ pyttsx3 not available") | |
| # 3. Always have mock TTS as final fallback | |
| self.engines.append('mock') | |
| self.logger.info("✅ Mock TTS available as fallback") | |
| self.logger.info(f"Available TTS engines: {self.engines}") | |
| def synthesize_speech( | |
| self, | |
| text: str, | |
| language: str = "en", | |
| voice_sample: Optional[str] = None, | |
| output_path: Optional[str] = None | |
| ) -> Dict[str, Any]: | |
| """ | |
| Convert text to speech | |
| Args: | |
| text: Text to synthesize | |
| language: Target language code | |
| voice_sample: Path to voice sample for cloning (if supported) | |
| output_path: Output file path (if None, generates temp file) | |
| Returns: | |
| Result dictionary with audio file path and metadata | |
| """ | |
| if not output_path: | |
| output_path = self.temp_dir / f"tts_output_{int(time.time())}.wav" | |
| # Try each TTS engine until one works | |
| for engine in self.engines: | |
| try: | |
| if engine == 'gtts': | |
| return self._synthesize_with_gtts(text, language, output_path) | |
| elif engine == 'pyttsx3': | |
| return self._synthesize_with_pyttsx3(text, language, output_path) | |
| elif engine == 'mock': | |
| return self._synthesize_with_mock(text, language, output_path) | |
| except Exception as e: | |
| self.logger.warning(f"TTS engine {engine} failed: {str(e)}") | |
| continue | |
| # If all engines fail | |
| return { | |
| 'success': False, | |
| 'error': 'All TTS engines failed', | |
| 'audio_path': None, | |
| 'engine': 'none' | |
| } | |
| def _synthesize_with_gtts(self, text: str, language: str, output_path: str) -> Dict[str, Any]: | |
| """Use Google Text-to-Speech""" | |
| try: | |
| from gtts import gTTS | |
| import pygame | |
| import time | |
| # Map common language codes for gTTS | |
| gtts_lang_map = { | |
| 'hi': 'hi', | |
| 'en': 'en', | |
| 'es': 'es', | |
| 'fr': 'fr', | |
| 'de': 'de', | |
| 'it': 'it', | |
| 'pt': 'pt', | |
| 'ru': 'ru', | |
| 'ja': 'ja', | |
| 'ko': 'ko', | |
| 'zh': 'zh', | |
| 'ar': 'ar' | |
| } | |
| gtts_lang = gtts_lang_map.get(language, 'en') | |
| # Create TTS object | |
| tts = gTTS(text=text, lang=gtts_lang, slow=False) | |
| # Save to temporary MP3 file first | |
| temp_mp3 = str(output_path).replace('.wav', '.mp3') | |
| tts.save(temp_mp3) | |
| # Convert MP3 to WAV using pydub | |
| from pydub import AudioSegment | |
| audio = AudioSegment.from_mp3(temp_mp3) | |
| audio.export(output_path, format="wav") | |
| # Clean up temp MP3 | |
| os.remove(temp_mp3) | |
| return { | |
| 'success': True, | |
| 'audio_path': str(output_path), | |
| 'engine': 'gTTS (Google)', | |
| 'language': language, | |
| 'duration': len(audio) / 1000.0, # Duration in seconds | |
| 'sample_rate': audio.frame_rate | |
| } | |
| except Exception as e: | |
| raise Exception(f"gTTS synthesis failed: {str(e)}") | |
| def _synthesize_with_pyttsx3(self, text: str, language: str, output_path: str) -> Dict[str, Any]: | |
| """Use pyttsx3 offline TTS""" | |
| try: | |
| import pyttsx3 | |
| # Initialize TTS engine | |
| engine = pyttsx3.init() | |
| # Configure voice properties | |
| voices = engine.getProperty('voices') | |
| # Try to find appropriate voice for language | |
| selected_voice = None | |
| for voice in voices: | |
| voice_lang = getattr(voice, 'languages', []) | |
| if language in str(voice_lang).lower() or language == 'en': | |
| selected_voice = voice.id | |
| break | |
| if selected_voice: | |
| engine.setProperty('voice', selected_voice) | |
| # Set speech rate and volume | |
| engine.setProperty('rate', 150) # Speed of speech | |
| engine.setProperty('volume', 0.8) # Volume level (0.0 to 1.0) | |
| # Save to file | |
| engine.save_to_file(text, str(output_path)) | |
| engine.runAndWait() | |
| # Get audio duration (approximate) | |
| duration = len(text.split()) * 0.6 # Rough estimate: 0.6 seconds per word | |
| return { | |
| 'success': True, | |
| 'audio_path': str(output_path), | |
| 'engine': 'pyttsx3 (offline)', | |
| 'language': language, | |
| 'duration': duration, | |
| 'sample_rate': 22050 # Default for pyttsx3 | |
| } | |
| except Exception as e: | |
| raise Exception(f"pyttsx3 synthesis failed: {str(e)}") | |
| def _synthesize_with_mock(self, text: str, language: str, output_path: str) -> Dict[str, Any]: | |
| """Generate mock audio for demonstration""" | |
| try: | |
| import time | |
| # Generate a simple tone sequence based on text | |
| sample_rate = 22050 | |
| duration = max(2.0, len(text) * 0.1) # Minimum 2 seconds | |
| t = np.linspace(0, duration, int(duration * sample_rate), False) | |
| # Create a pleasant tone sequence | |
| # Base frequency varies by language | |
| base_freq = { | |
| 'hi': 220, # A3 | |
| 'en': 261, # C4 | |
| 'es': 293, # D4 | |
| 'fr': 329, # E4 | |
| 'de': 349, # F4 | |
| }.get(language, 261) | |
| # Generate harmonics for richer sound | |
| audio = ( | |
| 0.3 * np.sin(2 * np.pi * base_freq * t) + | |
| 0.2 * np.sin(2 * np.pi * base_freq * 1.5 * t) + | |
| 0.1 * np.sin(2 * np.pi * base_freq * 2 * t) | |
| ) | |
| # Add simple envelope (fade in/out) | |
| fade_samples = int(0.1 * sample_rate) # 100ms fade | |
| audio[:fade_samples] *= np.linspace(0, 1, fade_samples) | |
| audio[-fade_samples:] *= np.linspace(1, 0, fade_samples) | |
| # Add some variation based on text length | |
| if len(text) > 50: | |
| # Longer text gets some frequency modulation | |
| mod_freq = 2.0 # 2 Hz modulation | |
| modulation = 1 + 0.1 * np.sin(2 * np.pi * mod_freq * t) | |
| audio *= modulation | |
| # Normalize | |
| audio = audio / np.max(np.abs(audio)) * 0.7 | |
| # Save as WAV | |
| sf.write(str(output_path), audio.astype(np.float32), sample_rate) | |
| return { | |
| 'success': True, | |
| 'audio_path': str(output_path), | |
| 'engine': 'Mock TTS (Demo)', | |
| 'language': language, | |
| 'duration': duration, | |
| 'sample_rate': sample_rate, | |
| 'note': 'This is a demo tone. Install gTTS or pyttsx3 for real speech.' | |
| } | |
| except Exception as e: | |
| raise Exception(f"Mock TTS failed: {str(e)}") | |
| def clone_voice( | |
| self, | |
| text: str, | |
| voice_sample_path: str, | |
| output_path: Optional[str] = None | |
| ) -> Dict[str, Any]: | |
| """ | |
| Attempt voice cloning (placeholder for future implementation) | |
| Currently falls back to regular TTS with a note about voice cloning. | |
| """ | |
| # For now, use regular TTS but indicate it's attempted cloning | |
| result = self.synthesize_speech(text, "en", None, output_path) | |
| if result['success']: | |
| result['note'] = f"Voice cloning attempted using {voice_sample_path}. Currently using fallback TTS." | |
| result['voice_cloning'] = 'attempted (fallback to TTS)' | |
| return result | |
| def get_available_voices(self) -> Dict[str, Any]: | |
| """Get information about available voices""" | |
| voices_info = { | |
| 'engines': self.engines, | |
| 'languages_supported': ['en', 'hi', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh', 'ar'], | |
| 'voice_cloning': 'planned (currently uses fallback)', | |
| 'recommendations': { | |
| 'best_quality': 'gTTS (requires internet)', | |
| 'offline': 'pyttsx3', | |
| 'demo': 'mock (always available)' | |
| } | |
| } | |
| # Try to get system voices if pyttsx3 is available | |
| if 'pyttsx3' in self.engines: | |
| try: | |
| import pyttsx3 | |
| engine = pyttsx3.init() | |
| system_voices = engine.getProperty('voices') | |
| voices_info['system_voices'] = [ | |
| { | |
| 'id': voice.id, | |
| 'name': voice.name, | |
| 'languages': getattr(voice, 'languages', []) | |
| } | |
| for voice in system_voices[:5] # Limit to first 5 | |
| ] | |
| engine.stop() | |
| except: | |
| pass | |
| return voices_info | |
| def create_tts_service() -> TextToSpeechService: | |
| """Factory function to create TTS service""" | |
| return TextToSpeechService() | |
| def test_tts_service(): | |
| """Test the TTS service""" | |
| import time | |
| print("🎵 Testing Text-to-Speech Service") | |
| print("=" * 50) | |
| tts = create_tts_service() | |
| # Test cases | |
| test_cases = [ | |
| ("Hello, this is a test.", "en"), | |
| ("नमस्ते, यह एक परीक्षण है।", "hi"), | |
| ("Hola, esta es una prueba.", "es"), | |
| ] | |
| for text, lang in test_cases: | |
| print(f"\n🌍 Testing {lang}: {text}") | |
| result = tts.synthesize_speech(text, lang) | |
| if result['success']: | |
| print(f"✅ Success!") | |
| print(f"🔧 Engine: {result['engine']}") | |
| print(f"📁 Audio: {result['audio_path']}") | |
| print(f"⏱️ Duration: {result.get('duration', 'Unknown')} seconds") | |
| else: | |
| print(f"❌ Failed: {result['error']}") | |
| # Show available voices | |
| print(f"\n📋 Available Voice Information:") | |
| voices = tts.get_available_voices() | |
| for key, value in voices.items(): | |
| if key != 'system_voices': | |
| print(f" {key}: {value}") | |
| if __name__ == "__main__": | |
| test_tts_service() |