Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| AI Speech Translation System - Deployment Version | |
| Optimized for Hugging Face Spaces deployment | |
| Features: | |
| - Real-time speech recognition with Whisper | |
| - Auto language detection for 12+ languages | |
| - Enhanced Hindi-English translation | |
| - Text-to-speech output | |
| - Beautiful Apple-style dark mode UI | |
| """ | |
| import gradio as gr | |
| import sys | |
| import os | |
| import time | |
| import tempfile | |
| import threading | |
| from pathlib import Path | |
| from typing import Optional, Tuple, Dict, Any | |
| import numpy as np | |
| import soundfile as sf | |
# Put the sibling ``src`` directory (when present) at the front of the module
# search path so the project-local imports below can be resolved.
current_dir = Path(__file__).parent
src_path = current_dir / "src"
if src_path.exists():
    sys.path.insert(0, str(src_path))

# Optional dependencies: a failed import is reported and recorded in a flag
# instead of aborting start-up, so the app can still come up in a degraded
# state on a deployment that lacks them.
try:
    import whisper
    import librosa
except ImportError as e:
    print(f"โ ๏ธ Whisper not available: {e}")
    WHISPER_AVAILABLE = False
else:
    WHISPER_AVAILABLE = True

try:
    from translation.improved_translator import create_improved_translator
    from tts.tts_service import create_tts_service
except ImportError as e:
    print(f"โ ๏ธ Services not available: {e}")
    SERVICES_AVAILABLE = False
else:
    SERVICES_AVAILABLE = True
class DeploymentSpeechApp:
    """Production-ready speech translation app.

    Chains three services behind a Gradio UI: a Whisper model for
    transcription, a translator, and a text-to-speech service. The services
    are loaded on a daemon thread started from ``__init__``;
    ``system_ready`` becomes ``True`` once all three have been created, and
    ``initialization_status`` always holds a human-readable progress message.
    """

    def __init__(self):
        """Initialise state, create the output directory and start loading."""
        self.whisper_model = None
        self.translator = None
        self.tts_service = None
        self.initialization_status = "๐ Initializing system..."
        self.system_ready = False

        # Language options: code -> label shown in the target-language dropdown
        self.languages = {
            "auto": "๐ Auto-detect",
            "hi": "๐ฎ๐ณ Hindi",
            "en": "๐บ๐ธ English",
            "es": "๐ช๐ธ Spanish",
            "fr": "๐ซ๐ท French",
            "de": "๐ฉ๐ช German",
            "it": "๐ฎ๐น Italian",
            "pt": "๐ต๐น Portuguese",
            "ru": "๐ท๐บ Russian",
            "ja": "๐ฏ๐ต Japanese",
            "ko": "๐ฐ๐ท Korean",
            "zh": "๐จ๐ณ Chinese",
            "ar": "๐ธ๐ฆ Arabic"
        }

        # Synthesised speech files are written below the system temp directory.
        self.temp_dir = Path(tempfile.gettempdir()) / "speech_translation_deploy"
        self.temp_dir.mkdir(exist_ok=True)

        # Start initialization
        self._start_initialization()

    def _start_initialization(self):
        """Load the three services on a daemon thread.

        Progress and failures are reported through
        ``initialization_status``; ``system_ready`` is only set once every
        service has been created.
        """
        def init_worker():
            try:
                if not WHISPER_AVAILABLE or not SERVICES_AVAILABLE:
                    self.initialization_status = "โ Missing dependencies for full functionality"
                    return

                self.initialization_status = "๐๏ธ Loading speech recognition..."
                self.whisper_model = whisper.load_model("small")

                self.initialization_status = "๐ Setting up translation..."
                self.translator = create_improved_translator()

                self.initialization_status = "๐ต Preparing text-to-speech..."
                self.tts_service = create_tts_service()

                self.initialization_status = "โ System ready!"
                self.system_ready = True
            except Exception as e:
                # Boundary of the worker thread: surface the failure in the UI
                # status instead of letting the thread die silently.
                self.initialization_status = f"โ Initialization failed: {e}"
                self.system_ready = False

        threading.Thread(target=init_worker, daemon=True).start()

    def get_system_status(self) -> str:
        """Return the current initialization status message."""
        return self.initialization_status

    def _status_markdown(self) -> str:
        """Return the status message formatted for the status banner."""
        return f"**{self.get_system_status()}**"

    def process_audio(
        self,
        audio_file: str,
        target_lang: str = "en"
    ) -> Tuple[str, str, str, Optional[str], str]:
        """Transcribe, translate and synthesise speech for one audio file.

        Args:
            audio_file: Path of the audio to process; ``None`` or an empty
                string means nothing was provided.
            target_lang: Language code to translate into. ``"auto"`` selects
                English, unless the detected language is English, in which
                case Hindi is selected.

        Returns:
            ``(transcription, translation, detected_language, audio_path,
            status)``. When a step fails, the values produced so far are
            returned, ``audio_path`` is ``None`` and ``status`` describes the
            problem. Exceptions are reported through ``status`` rather than
            raised.
        """
        if not self.system_ready:
            status = f"โณ System not ready. Status: {self.initialization_status}"
            return "", "", "", None, status

        if not audio_file:
            return "", "", "", None, "โ Please upload an audio file"

        try:
            start_time = time.time()

            # Step 1: Transcribe
            result = self.whisper_model.transcribe(
                audio_file,
                task="transcribe",
                verbose=False
            )
            transcription = result['text'].strip()
            detected_lang = result.get('language', 'unknown')
            if not transcription:
                return "", "", detected_lang, None, "โ No speech detected"

            # Step 2: Translate
            if target_lang == "auto":
                target_lang = "en" if detected_lang != "en" else "hi"
            translation_result = self.translator.translate_text(
                text=transcription,
                source_lang=detected_lang,
                target_lang=target_lang
            )
            if not translation_result['success']:
                return transcription, "", detected_lang, None, "โ Translation failed"
            translation = translation_result['translated_text']

            # Step 3: Generate speech
            # A second-resolution timestamp alone collides when two requests
            # arrive within the same second, so a random suffix keeps every
            # output file distinct.
            timestamp = int(time.time())
            audio_filename = f"output_{timestamp}_{os.urandom(4).hex()}.wav"
            audio_output_path = self.temp_dir / audio_filename
            tts_result = self.tts_service.synthesize_speech(
                text=translation,
                language=target_lang,
                output_path=str(audio_output_path)
            )
            if not tts_result['success']:
                return transcription, translation, detected_lang, None, "โ TTS failed"
            audio_output = tts_result['audio_path']

            # Final status
            total_time = time.time() - start_time
            # 'engine' is read with a default so a missing key cannot turn a
            # successful run into an error.
            status = f"""
โ **Translation Complete!**
**๐ Summary:**
- โฑ๏ธ **Time:** {total_time:.1f}s
- ๐ **From:** {detected_lang.upper()} โ {target_lang.upper()}
- ๐ต **Engine:** {tts_result.get('engine', 'Unknown')}
- ๐ **Service:** {translation_result.get('service', 'Unknown')}
"""
            return transcription, translation, detected_lang, audio_output, status
        except Exception as e:
            # UI boundary: report any failure as a status message.
            return "", "", "", None, f"โ Error: {e}"

    def create_interface(self):
        """Create the Gradio interface.

        Returns:
            The ``gr.Blocks`` application, with the translate button wired to
            ``process_audio`` and the status banner refreshed on page load.
        """
        # Enhanced CSS for production
        css = """
/* Production-ready Apple Dark Mode */
.gradio-container {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
background: #000000;
color: #ffffff;
}
body {
background: #000000 !important;
color: #ffffff !important;
}
.header-gradient {
background: linear-gradient(135deg, #1d1d1f 0%, #2c2c2e 100%);
color: #ffffff;
padding: 32px;
border-radius: 16px;
margin-bottom: 24px;
text-align: center;
border: 1px solid #48484a;
}
.status-box {
background: linear-gradient(135deg, #007aff 0%, #5856d6 100%);
color: #ffffff;
padding: 16px;
border-radius: 12px;
text-align: center;
margin: 16px 0;
font-weight: 500;
}
/* Force dark mode for all components */
.gradio-container * {
background-color: #1c1c1e !important;
color: #ffffff !important;
}
.gradio-container .gr-button {
background: #007aff !important;
color: #ffffff !important;
border: none !important;
border-radius: 8px !important;
font-weight: 500 !important;
}
.gradio-container .gr-button:hover {
background: #0a84ff !important;
}
.gradio-container .gr-textbox,
.gradio-container .gr-textbox input,
.gradio-container .gr-textbox textarea {
background: #2c2c2e !important;
border: 1px solid #48484a !important;
color: #ffffff !important;
border-radius: 8px !important;
}
.gradio-container .gr-dropdown,
.gradio-container .gr-dropdown select {
background: #2c2c2e !important;
border: 1px solid #48484a !important;
color: #ffffff !important;
border-radius: 8px !important;
}
"""
        with gr.Blocks(css=css, title="AI Speech Translation System") as interface:
            # Header
            gr.HTML("""
<div class="header-gradient">
<h1 style="font-size: 2.5em; margin: 0; font-weight: 700;">๐๏ธ AI Speech Translator</h1>
<p style="font-size: 1.2em; margin: 16px 0 0 0; opacity: 0.8;">
Real-time Speech Translation โข Auto Language Detection โข 12+ Languages
</p>
<p style="font-size: 1em; margin: 8px 0 0 0; opacity: 0.6;">
Upload audio โ Automatic transcription โ Smart translation โ Natural speech output
</p>
</div>
""")

            # Status display
            with gr.Row():
                status_display = gr.Markdown(
                    value=self._status_markdown(),
                    elem_classes=["status-box"]
                )

            # Main interface
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ๐ค Upload & Configure")
                    audio_input = gr.Audio(
                        label="๐ค Upload Audio or Record",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    # (label, value) pairs: the user sees the readable label
                    # while process_audio still receives the language code.
                    target_lang = gr.Dropdown(
                        choices=[(label, code) for code, label in self.languages.items()],
                        value="en",
                        label="๐ฏ Target Language"
                    )
                    process_btn = gr.Button("๐ Translate Audio", variant="primary", size="lg")

                with gr.Column(scale=1):
                    gr.Markdown("### ๐ Results")
                    detected_lang_display = gr.Textbox(
                        label="๐ Detected Language",
                        interactive=False
                    )
                    transcription_output = gr.Textbox(
                        label="๐ Original Text",
                        lines=3
                    )
                    translation_output = gr.Textbox(
                        label="๐ Translated Text",
                        lines=3
                    )
                    audio_output = gr.Audio(label="๐ต Translated Speech")

            # Detailed status
            detailed_status = gr.Markdown(
                value="Upload an audio file and click 'Translate Audio' to start..."
            )

            # Event handlers
            process_btn.click(
                self.process_audio,
                inputs=[audio_input, target_lang],
                outputs=[
                    transcription_output,
                    translation_output,
                    detected_lang_display,
                    audio_output,
                    detailed_status
                ]
            )
            # The banner value above is captured while the background loader
            # is usually still running; re-read it on every page load so
            # visitors see the current status.
            interface.load(self._status_markdown, inputs=None, outputs=status_display)

            # Tips section
            with gr.Accordion("๐ก How to Use", open=False):
                gr.Markdown("""
### ๐ฏ Quick Start
1. **Upload** an audio file (WAV, MP3, M4A) or record directly
2. **Select** your target language (or keep "Auto-detect")
3. **Click** "Translate Audio"
4. **Listen** to the results!
### โจ Features
- ๐ **Auto Language Detection** - Automatically detects 12+ languages
- ๐ฏ **Enhanced Hindi Support** - Optimized for Hindi-English translation
- ๐ต **Natural Speech Output** - High-quality text-to-speech synthesis
- ๐ **Beautiful UI** - Apple-inspired dark mode design
### ๐ Supported Languages
Hindi, English, Spanish, French, German, Italian, Portuguese, Russian, Japanese, Korean, Chinese, Arabic
### ๐๏ธ Tech Stack
- **Speech Recognition**: OpenAI Whisper
- **Translation**: Enhanced algorithms + API fallbacks
- **Speech Synthesis**: Google TTS + offline engines
- **Interface**: Gradio with custom styling
""")

            # Footer
            gr.HTML("""
<div style="text-align: center; margin-top: 32px; padding: 24px; background: #1c1c1e; border-radius: 12px;">
<p style="color: #98989d; margin: 0; font-size: 14px;">
๐ AI Speech Translation System โข Built with Whisper, Gradio & Modern ML
</p>
</div>
""")

        return interface
def main():
    """Build the speech translation app and serve its Gradio interface."""
    for banner_line in (
        "๐ Starting AI Speech Translation System...",
        "๐ Deployment-ready version for cloud hosting",
    ):
        print(banner_line)

    interface = DeploymentSpeechApp().create_interface()

    # Launch configuration for deployment
    launch_options = {
        "server_name": "0.0.0.0",  # Listen on all interfaces for cloud deployment
        "server_port": 7860,       # Standard port for Hugging Face Spaces
        "share": False,
        "debug": False,
        "show_api": False,
        "inbrowser": False,        # Don't auto-open browser in cloud
    }
    interface.launch(**launch_options)


if __name__ == "__main__":
    main()