# Hugging Face Spaces page residue (not code) — kept as comments for provenance:
# Uploaded by SidML — "Initial Upload" — commit 5a65ad6 (verified)
#!/usr/bin/env python3
"""
AI Speech Translation System - Deployment Version
Optimized for Hugging Face Spaces deployment
Features:
- Real-time speech recognition with Whisper
- Auto language detection for 12+ languages
- Enhanced Hindi-English translation
- Text-to-speech output
- Beautiful Apple-style dark mode UI
"""
import gradio as gr
import sys
import os
import time
import tempfile
import threading
from pathlib import Path
from typing import Optional, Tuple, Dict, Any
import numpy as np
import soundfile as sf
# Add src to Python path for local imports
current_dir = Path(__file__).parent
src_path = current_dir / "src"
if src_path.exists():
    sys.path.insert(0, str(src_path))

# Import with error handling for deployment: the app degrades gracefully
# (with a visible status message) when optional dependencies are missing.
try:
    import whisper
    import librosa
    WHISPER_AVAILABLE = True
except ImportError as e:
    print(f"โš ๏ธ Whisper not available: {e}")
    WHISPER_AVAILABLE = False

try:
    from translation.improved_translator import create_improved_translator
    from tts.tts_service import create_tts_service
    SERVICES_AVAILABLE = True
except ImportError as e:
    print(f"โš ๏ธ Services not available: {e}")
    SERVICES_AVAILABLE = False
class DeploymentSpeechApp:
    """Production-ready speech translation app.

    Pipeline: Whisper speech recognition -> text translation -> TTS
    synthesis, wrapped in a Gradio Blocks UI. Heavy models are loaded on
    a background daemon thread so the interface can start serving
    immediately while ``initialization_status`` reports progress.
    """

    def __init__(self):
        # Populated asynchronously by the background init worker.
        self.whisper_model = None
        self.translator = None
        self.tts_service = None
        self.initialization_status = "๐Ÿ”„ Initializing system..."
        self.system_ready = False
        # Language options: ISO code -> display label for the dropdown.
        self.languages = {
            "auto": "๐Ÿ” Auto-detect",
            "hi": "๐Ÿ‡ฎ๐Ÿ‡ณ Hindi",
            "en": "๐Ÿ‡บ๐Ÿ‡ธ English",
            "es": "๐Ÿ‡ช๐Ÿ‡ธ Spanish",
            "fr": "๐Ÿ‡ซ๐Ÿ‡ท French",
            "de": "๐Ÿ‡ฉ๐Ÿ‡ช German",
            "it": "๐Ÿ‡ฎ๐Ÿ‡น Italian",
            "pt": "๐Ÿ‡ต๐Ÿ‡น Portuguese",
            "ru": "๐Ÿ‡ท๐Ÿ‡บ Russian",
            "ja": "๐Ÿ‡ฏ๐Ÿ‡ต Japanese",
            "ko": "๐Ÿ‡ฐ๐Ÿ‡ท Korean",
            "zh": "๐Ÿ‡จ๐Ÿ‡ณ Chinese",
            "ar": "๐Ÿ‡ธ๐Ÿ‡ฆ Arabic"
        }
        # Scratch directory for synthesized audio output files.
        self.temp_dir = Path(tempfile.gettempdir()) / "speech_translation_deploy"
        # parents=True: robust even if the parent tmp dir is unusual.
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        # Start initialization in the background.
        self._start_initialization()

    def _start_initialization(self):
        """Initialize system components on a daemon thread."""
        def init_worker():
            try:
                if not WHISPER_AVAILABLE or not SERVICES_AVAILABLE:
                    self.initialization_status = "โŒ Missing dependencies for full functionality"
                    return
                self.initialization_status = "๐ŸŽ™๏ธ Loading speech recognition..."
                self.whisper_model = whisper.load_model("small")
                self.initialization_status = "๐ŸŒ Setting up translation..."
                self.translator = create_improved_translator()
                self.initialization_status = "๐ŸŽต Preparing text-to-speech..."
                self.tts_service = create_tts_service()
                self.initialization_status = "โœ… System ready!"
                self.system_ready = True
            except Exception as e:
                # Surface the failure in the UI status instead of crashing.
                self.initialization_status = f"โŒ Initialization failed: {str(e)}"
                self.system_ready = False

        threading.Thread(target=init_worker, daemon=True).start()

    def get_system_status(self) -> str:
        """Return the current (possibly in-progress) initialization status."""
        return self.initialization_status

    def process_audio(
        self,
        audio_file: str,
        target_lang: str = "en"
    ) -> Tuple[str, str, str, Optional[str], str]:
        """Process audio file and return results.

        Args:
            audio_file: Path to the uploaded/recorded audio file.
            target_lang: Target language code, or "auto" to pick English
                (or Hindi when the input is already English).

        Returns:
            Tuple of (transcription, translation, detected_language,
            output_audio_path_or_None, markdown_status). On failure the
            already-computed partial results are returned alongside an
            error status string.
        """
        if not self.system_ready:
            status = f"โณ System not ready. Status: {self.initialization_status}"
            return "", "", "", None, status
        if audio_file is None:
            return "", "", "", None, "โŒ Please upload an audio file"
        try:
            start_time = time.time()
            # Step 1: Transcribe with Whisper (also detects the language).
            result = self.whisper_model.transcribe(
                audio_file,
                task="transcribe",
                verbose=False
            )
            transcription = result['text'].strip()
            detected_lang = result.get('language', 'unknown')
            if not transcription:
                return "", "", detected_lang, None, "โŒ No speech detected"
            # Step 2: Translate. "auto" target defaults to English, or
            # Hindi when the speech was already English.
            if target_lang == "auto":
                target_lang = "en" if detected_lang != "en" else "hi"
            translation_result = self.translator.translate_text(
                text=transcription,
                source_lang=detected_lang,
                target_lang=target_lang
            )
            if not translation_result['success']:
                return transcription, "", detected_lang, None, "โŒ Translation failed"
            translation = translation_result['translated_text']
            # Step 3: Generate speech into a timestamped temp file.
            timestamp = int(time.time())
            audio_filename = f"output_{timestamp}.wav"
            audio_output_path = self.temp_dir / audio_filename
            tts_result = self.tts_service.synthesize_speech(
                text=translation,
                language=target_lang,
                output_path=str(audio_output_path)
            )
            if not tts_result['success']:
                return transcription, translation, detected_lang, None, "โŒ TTS failed"
            audio_output = tts_result['audio_path']
            # Final status summary rendered as Markdown in the UI.
            total_time = time.time() - start_time
            status = f"""
โœ… **Translation Complete!**
**๐Ÿ“Š Summary:**
- โฑ๏ธ **Time:** {total_time:.1f}s
- ๐ŸŒ **From:** {detected_lang.upper()} โ†’ {target_lang.upper()}
- ๐ŸŽต **Engine:** {tts_result['engine']}
- ๐Ÿ“ˆ **Service:** {translation_result.get('service', 'Unknown')}
"""
            return transcription, translation, detected_lang, audio_output, status
        except Exception as e:
            # Broad catch is deliberate: this is the Gradio event boundary.
            return "", "", "", None, f"โŒ Error: {str(e)}"

    def create_interface(self):
        """Create and return the Gradio Blocks interface."""
        # Enhanced CSS for production (Apple-style dark mode). Content is
        # kept flush-left inside the string; CSS ignores the whitespace.
        css = """
/* Production-ready Apple Dark Mode */
.gradio-container {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
background: #000000;
color: #ffffff;
}
body {
background: #000000 !important;
color: #ffffff !important;
}
.header-gradient {
background: linear-gradient(135deg, #1d1d1f 0%, #2c2c2e 100%);
color: #ffffff;
padding: 32px;
border-radius: 16px;
margin-bottom: 24px;
text-align: center;
border: 1px solid #48484a;
}
.status-box {
background: linear-gradient(135deg, #007aff 0%, #5856d6 100%);
color: #ffffff;
padding: 16px;
border-radius: 12px;
text-align: center;
margin: 16px 0;
font-weight: 500;
}
/* Force dark mode for all components */
.gradio-container * {
background-color: #1c1c1e !important;
color: #ffffff !important;
}
.gradio-container .gr-button {
background: #007aff !important;
color: #ffffff !important;
border: none !important;
border-radius: 8px !important;
font-weight: 500 !important;
}
.gradio-container .gr-button:hover {
background: #0a84ff !important;
}
.gradio-container .gr-textbox,
.gradio-container .gr-textbox input,
.gradio-container .gr-textbox textarea {
background: #2c2c2e !important;
border: 1px solid #48484a !important;
color: #ffffff !important;
border-radius: 8px !important;
}
.gradio-container .gr-dropdown,
.gradio-container .gr-dropdown select {
background: #2c2c2e !important;
border: 1px solid #48484a !important;
color: #ffffff !important;
border-radius: 8px !important;
}
"""
        with gr.Blocks(css=css, title="AI Speech Translation System") as interface:
            # Header banner.
            gr.HTML("""
<div class="header-gradient">
<h1 style="font-size: 2.5em; margin: 0; font-weight: 700;">๐ŸŽ™๏ธ AI Speech Translator</h1>
<p style="font-size: 1.2em; margin: 16px 0 0 0; opacity: 0.8;">
Real-time Speech Translation โ€ข Auto Language Detection โ€ข 12+ Languages
</p>
<p style="font-size: 1em; margin: 8px 0 0 0; opacity: 0.6;">
Upload audio โ†’ Automatic transcription โ†’ Smart translation โ†’ Natural speech output
</p>
</div>
""")
            # Status display (snapshot of init status at build time).
            with gr.Row():
                status_display = gr.Markdown(
                    value=f"**{self.get_system_status()}**",
                    elem_classes=["status-box"]
                )
            # Main interface: input column and results column.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ๐Ÿ“ค Upload & Configure")
                    audio_input = gr.Audio(
                        label="๐ŸŽค Upload Audio or Record",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    target_lang = gr.Dropdown(
                        choices=list(self.languages.keys()),
                        value="en",
                        label="๐ŸŽฏ Target Language"
                    )
                    process_btn = gr.Button("๐Ÿš€ Translate Audio", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### ๐Ÿ“‹ Results")
                    detected_lang_display = gr.Textbox(
                        label="๐Ÿ” Detected Language",
                        interactive=False
                    )
                    transcription_output = gr.Textbox(
                        label="๐Ÿ“ Original Text",
                        lines=3
                    )
                    translation_output = gr.Textbox(
                        label="๐ŸŒ Translated Text",
                        lines=3
                    )
                    audio_output = gr.Audio(label="๐ŸŽต Translated Speech")
            # Detailed status area updated after each run.
            detailed_status = gr.Markdown(
                value="Upload an audio file and click 'Translate Audio' to start..."
            )
            # Event handlers
            process_btn.click(
                self.process_audio,
                inputs=[audio_input, target_lang],
                outputs=[
                    transcription_output,
                    translation_output,
                    detected_lang_display,
                    audio_output,
                    detailed_status
                ]
            )
            # Tips section
            with gr.Accordion("๐Ÿ’ก How to Use", open=False):
                gr.Markdown("""
### ๐ŸŽฏ Quick Start
1. **Upload** an audio file (WAV, MP3, M4A) or record directly
2. **Select** your target language (or keep "Auto-detect")
3. **Click** "Translate Audio"
4. **Listen** to the results!
### โœจ Features
- ๐Ÿ” **Auto Language Detection** - Automatically detects 12+ languages
- ๐ŸŽฏ **Enhanced Hindi Support** - Optimized for Hindi-English translation
- ๐ŸŽต **Natural Speech Output** - High-quality text-to-speech synthesis
- ๐ŸŒ™ **Beautiful UI** - Apple-inspired dark mode design
### ๐ŸŒ Supported Languages
Hindi, English, Spanish, French, German, Italian, Portuguese, Russian, Japanese, Korean, Chinese, Arabic
### ๐Ÿ—๏ธ Tech Stack
- **Speech Recognition**: OpenAI Whisper
- **Translation**: Enhanced algorithms + API fallbacks
- **Speech Synthesis**: Google TTS + offline engines
- **Interface**: Gradio with custom styling
""")
            # Footer
            gr.HTML("""
<div style="text-align: center; margin-top: 32px; padding: 24px; background: #1c1c1e; border-radius: 12px;">
<p style="color: #98989d; margin: 0; font-size: 14px;">
๐ŸŽ‰ AI Speech Translation System โ€ข Built with Whisper, Gradio & Modern ML
</p>
</div>
""")
        return interface
def main():
    """Launch the application with cloud-deployment launch settings."""
    print("๐Ÿš€ Starting AI Speech Translation System...")
    print("๐ŸŒŸ Deployment-ready version for cloud hosting")
    app = DeploymentSpeechApp()
    interface = app.create_interface()
    # Launch configuration for deployment
    interface.launch(
        server_name="0.0.0.0",  # Listen on all interfaces for cloud deployment
        server_port=7860,       # Standard port for Hugging Face Spaces
        share=False,
        debug=False,
        show_api=False,
        inbrowser=False         # Don't auto-open browser in cloud
    )


if __name__ == "__main__":
    main()