CVNSS committed
Commit 491db3e · verified · 1 Parent(s): 688b492

Update app.py

Files changed (1)
  1. app.py +495 -433
app.py CHANGED
@@ -1,433 +1,495 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
-
- """
- Vietnamese TTS - Hugging Face Spaces Demo
- Giao diện web chuyển văn bản tiếng Việt thành giọng nói.
- """
-
- import os
- import sys
- import json
- import tempfile
- import glob
- import re
- from pathlib import Path
-
- import torch
- import numpy as np
- import soundfile as sf
- import gradio as gr
-
- # Add src to path for imports
- sys.path.insert(0, str(Path(__file__).parent))
-
- from src.vietnamese.text_processor import process_vietnamese_text
- from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
- from src.models.synthesizer import SynthesizerTrn
- from src.text.symbols import symbols
- from src.utils import helpers as utils
-
-
- # ============== Viphoneme Check ==============
-
- def check_viphoneme():
-     """Check if viphoneme is working properly."""
-     from src.vietnamese.phonemizer import VIPHONEME_AVAILABLE, text_to_phonemes
-
-     print(f"Viphoneme available: {VIPHONEME_AVAILABLE}")
-
-     if VIPHONEME_AVAILABLE:
-         try:
-             # Test with a simple Vietnamese text
-             test_text = "Xin chào"
-             phones, tones, word2ph = text_to_phonemes(test_text, use_viphoneme=True)
-             print(f"✅ Viphoneme test passed: '{test_text}' -> {phones[:5]}...")
-             return True
-         except Exception as e:
-             print(f"❌ Viphoneme test failed: {e}")
-             return False
-     else:
-         print("⚠️ Viphoneme not available, using fallback phonemizer")
-         return False
-
-
- # ============== Model Loading ==============
-
- def find_latest_checkpoint(model_dir, prefix="G"):
-     """Find the latest checkpoint in model directory."""
-     pattern = os.path.join(model_dir, f"{prefix}*.pth")
-     checkpoints = glob.glob(pattern)
-     if not checkpoints:
-         return None
-
-     def get_step(path):
-         match = re.search(rf'{prefix}(\d+)\.pth', path)
-         return int(match.group(1)) if match else 0
-
-     checkpoints.sort(key=get_step, reverse=True)
-     return checkpoints[0]
-
-
- def download_model():
-     """Download model from Hugging Face Hub."""
-     from huggingface_hub import snapshot_download
-
-     hf_repo = "valtecAI-team/valtec-tts-pretrained"
-
-     # Get cache directory
-     if os.name == 'nt':  # Windows
-         cache_base = Path(os.environ.get('LOCALAPPDATA', Path.home() / 'AppData' / 'Local'))
-     else:  # Linux/Mac
-         cache_base = Path(os.environ.get('XDG_CACHE_HOME', Path.home() / '.cache'))
-
-     model_dir = cache_base / 'valtec_tts' / 'models' / 'vits-vietnamese'
-
-     # Check if already downloaded
-     config_path = model_dir / "config.json"
-     if config_path.exists():
-         checkpoints = list(model_dir.glob("G_*.pth"))
-         if checkpoints:
-             print(f"Using cached model from: {model_dir}")
-             return str(model_dir)
-
-     print(f"Downloading model from {hf_repo}...")
-     model_dir.mkdir(parents=True, exist_ok=True)
-     snapshot_download(repo_id=hf_repo, local_dir=str(model_dir))
-     print("Download complete!")
-
-     return str(model_dir)
-
-
- class VietnameseTTS:
-     """Vietnamese TTS synthesizer using trained VITS-based model."""
-
-     def __init__(self, checkpoint_path, config_path, device="cpu"):
-         self.device = device
-
-         # Load config
-         with open(config_path, 'r', encoding='utf-8') as f:
-             self.config = json.load(f)
-
-         self.sampling_rate = self.config['data']['sampling_rate']
-         self.spk2id = self.config['data']['spk2id']
-         self.speakers = list(self.spk2id.keys())
-         self.add_blank = self.config['data'].get('add_blank', True)
-
-         print(f"Available speakers: {self.speakers}")
-
-         # Load model
-         self._load_model(checkpoint_path)
-
-     def _load_model(self, checkpoint_path):
-         """Load the trained model."""
-         self.model = SynthesizerTrn(
-             len(symbols),
-             self.config['data']['filter_length'] // 2 + 1,
-             self.config['train']['segment_size'] // self.config['data']['hop_length'],
-             n_speakers=self.config['data']['n_speakers'],
-             **self.config['model'],
-         ).to(self.device)
-
-         # Load checkpoint
-         checkpoint = torch.load(checkpoint_path, map_location=self.device)
-
-         # Handle DDP checkpoint
-         state_dict = checkpoint['model']
-         new_state_dict = {}
-         for k, v in state_dict.items():
-             if k.startswith('module.'):
-                 new_state_dict[k[7:]] = v
-             else:
-                 new_state_dict[k] = v
-
-         self.model.load_state_dict(new_state_dict, strict=False)
-         self.model.eval()
-
-         print(f"Model loaded from {checkpoint_path}")
-
-     def text_to_sequence(self, text, speaker):
-         """Convert text to model input tensors."""
-         from src.text import cleaned_text_to_sequence
-         from src.nn import commons
-
-         # Normalize text
-         normalized_text = process_vietnamese_text(text)
-
-         # Convert to phonemes
-         phones, tones, word2ph = text_to_phonemes(normalized_text, use_viphoneme=VIPHONEME_AVAILABLE)
-
-         # Convert to sequence
-         phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
-
-         # Add blanks if needed
-         if self.add_blank:
-             phone_ids = commons.intersperse(phone_ids, 0)
-             tone_ids = commons.intersperse(tone_ids, 0)
-             lang_ids = commons.intersperse(lang_ids, 0)
-
-         # Get speaker ID
-         if speaker not in self.spk2id:
-             print(f"Warning: Speaker '{speaker}' not found, using first speaker: {self.speakers[0]}")
-             speaker = self.speakers[0]
-         speaker_id = self.spk2id[speaker]
-
-         # Create tensors
-         x = torch.LongTensor(phone_ids).unsqueeze(0).to(self.device)
-         x_lengths = torch.LongTensor([len(phone_ids)]).to(self.device)
-         tone = torch.LongTensor(tone_ids).unsqueeze(0).to(self.device)
-         language = torch.LongTensor(lang_ids).unsqueeze(0).to(self.device)
-         sid = torch.LongTensor([speaker_id]).to(self.device)
-
-         # Create dummy BERT features
-         bert = torch.zeros(1024, len(phone_ids)).unsqueeze(0).to(self.device)
-         ja_bert = torch.zeros(768, len(phone_ids)).unsqueeze(0).to(self.device)
-
-         return x, x_lengths, tone, language, sid, bert, ja_bert
-
-     @torch.no_grad()
-     def synthesize(self, text, speaker, sdp_ratio=0.0, noise_scale=0.667,
-                    noise_scale_w=0.8, length_scale=1.0):
-         """Synthesize speech from text."""
-         x, x_lengths, tone, language, sid, bert, ja_bert = self.text_to_sequence(text, speaker)
-
-         audio, attn, *_ = self.model.infer(
-             x, x_lengths, sid, tone, language, bert, ja_bert,
-             sdp_ratio=sdp_ratio,
-             noise_scale=noise_scale,
-             noise_scale_w=noise_scale_w,
-             length_scale=length_scale,
-         )
-
-         audio = audio[0, 0].cpu().numpy()
-         return audio, self.sampling_rate
-
-
- # ============== Gradio Interface ==============
-
- class TTSInterface:
-     """Wrapper for TTS model with Gradio interface."""
-
-     def __init__(self):
-         print("Initializing TTS...")
-
-         # Detect device
-         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-         print(f"Using device: {self.device}")
-
-         # Download/load model
-         model_dir = download_model()
-         checkpoint_path = find_latest_checkpoint(model_dir, "G")
-         config_path = os.path.join(model_dir, "config.json")
-
-         self.tts = VietnameseTTS(checkpoint_path, config_path, self.device)
-         self.temp_dir = Path(tempfile.gettempdir()) / "valtec_tts_demo"
-         self.temp_dir.mkdir(exist_ok=True)
-
-         print("TTS initialized successfully!")
-
-     def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
-         """Synthesize speech from text."""
-         try:
-             if not text or not text.strip():
-                 return None, "⚠️ Vui lòng nhập văn bản"
-
-             audio, sr = self.tts.synthesize(
-                 text=text.strip(),
-                 speaker=speaker,
-                 length_scale=speed,
-                 noise_scale=noise_scale,
-                 noise_scale_w=noise_scale_w,
-                 sdp_ratio=sdp_ratio,
-             )
-
-             # Save to temp file
-             output_path = self.temp_dir / f"output_{hash(text)}.wav"
-             sf.write(str(output_path), audio, sr)
-
-             return str(output_path), f"✅ Tạo giọng nói thành công! ({len(audio)/sr:.2f}s)"
-
-         except Exception as e:
-             return None, f"❌ Lỗi: {str(e)}"
-
-
- def create_demo(tts_interface):
-     """Create Gradio interface."""
-
-     examples = [
-         ["Xin chào, chúc bạn một ngày tốt lành", "male", 1.0, 0.667, 0.8, 0.0],
-         ["Buổi sáng hôm nay trời trong xanh và gió thổi rất nhẹ", "male", 1.0, 0.667, 0.8, 0.0],
-         ["Tôi pha một tách cà phê nóng và ngồi nhìn ánh nắng chiếu qua cửa sổ", "female", 1.0, 0.667, 0.8, 0.0],
-         ["Việt Nam là một đất nước xinh đẹp với văn hóa phong phú", "male", 0.9, 0.667, 0.8, 0.0],
-         ["Con đường làng quê rợp bóng tre xanh mát rượi", "female", 1.1, 0.667, 0.8, 0.0],
-     ]
-
-     with gr.Blocks(
-         theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"),
-         title="Chuyển Văn Bản Thành Giọng Nói Tiếng Việt",
-         css="""
-         /* Base styles - Mobile first */
-         .gradio-container {
-             max-width: 100% !important;
-             margin: 0 auto !important;
-             padding: 10px !important;
-         }
-         .main {
-             margin: 0 auto !important;
-             padding: 0 10px !important;
-         }
-         #title {
-             text-align: center;
-             background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-             -webkit-background-clip: text;
-             -webkit-text-fill-color: transparent;
-             font-weight: bold;
-             font-size: 1.5rem;
-         }
-         .contain {
-             max-width: 100% !important;
-             margin: 0 auto !important;
-         }
-
-         /* Tablet (768px+) */
-         @media (min-width: 768px) {
-             .gradio-container {
-                 max-width: 750px !important;
-                 padding: 20px !important;
-             }
-             #title {
-                 font-size: 2rem;
-             }
-         }
-
-         /* Desktop (1024px+) */
-         @media (min-width: 1024px) {
-             .gradio-container {
-                 max-width: 900px !important;
-             }
-             #title {
-                 font-size: 2.5rem;
-             }
-         }
-
-         /* Large screens (1200px+) */
-         @media (min-width: 1200px) {
-             .gradio-container {
-                 max-width: 1000px !important;
-             }
-         }
-         """
-     ) as demo:
-
-         gr.Markdown("""
-         # <span id="title">🎙️ Chuyển Văn Bản Thành Giọng Nói</span>
-
-         ### Hệ thống tổng hợp giọng nói tiếng Việt
-
-         Nhập văn bản tiếng Việt và chọn giọng đọc để tạo audio.
-         """)
-
-         with gr.Row():
-             with gr.Column(scale=2):
-                 text_input = gr.Textbox(
-                     label="📝 Văn bản đầu vào",
-                     placeholder="Nhập văn bản tiếng Việt ở đây...",
-                     lines=5,
-                     max_lines=10,
-                 )
-
-                 speaker_dropdown = gr.Dropdown(
-                     choices=tts_interface.tts.speakers,
-                     value="female",
-                     label="🎤 Chọn giọng đọc",
-                     info="Chọn người đọc từ danh sách"
-                 )
-
-                 synthesize_btn = gr.Button(
-                     "🔊 Tạo giọng nói",
-                     variant="primary",
-                     size="lg"
-                 )
-
-             with gr.Column(scale=1):
-                 with gr.Accordion("⚙️ Cài đặt nâng cao", open=False):
-                     speed_slider = gr.Slider(
-                         minimum=0.5, maximum=2.0, value=1.0, step=0.1,
-                         label="Tốc độ",
-                         info="< 1.0: Nhanh hơn | > 1.0: Chậm hơn"
-                     )
-
-                     noise_scale_slider = gr.Slider(
-                         minimum=0.1, maximum=1.5, value=0.667, step=0.01,
-                         label="Noise Scale",
-                         info="Điều khiển độ biến thiên giọng nói"
-                     )
-
-                     noise_scale_w_slider = gr.Slider(
-                         minimum=0.1, maximum=1.5, value=0.8, step=0.01,
-                         label="Duration Noise",
-                         info="Điều khiển độ biến thiên thời lượng"
-                     )
-
-                     sdp_ratio_slider = gr.Slider(
-                         minimum=0.0, maximum=1.0, value=0.0, step=0.1,
-                         label="SDP Ratio",
-                         info="0: Xác định | 1: Ngẫu nhiên"
-                     )
-
-         with gr.Row():
-             with gr.Column():
-                 audio_output = gr.Audio(
-                     label="🔊 Audio đầu ra",
-                     type="filepath",
-                     interactive=False
-                 )
-                 status_output = gr.Textbox(
-                     label="📊 Trạng thái",
-                     interactive=False,
-                     show_label=False
-                 )
-
-         gr.Markdown("### 📚 Ví dụ")
-         gr.Examples(
-             examples=examples,
-             inputs=[text_input, speaker_dropdown, speed_slider,
-                     noise_scale_slider, noise_scale_w_slider, sdp_ratio_slider],
-             outputs=[audio_output, status_output],
-             fn=tts_interface.synthesize,
-             cache_examples=False,
-         )
-
-         synthesize_btn.click(
-             fn=tts_interface.synthesize,
-             inputs=[text_input, speaker_dropdown, speed_slider,
-                     noise_scale_slider, noise_scale_w_slider, sdp_ratio_slider],
-             outputs=[audio_output, status_output],
-         )
-
-         gr.Markdown("""
-         ---
-         <div style="text-align: center; color: #666; font-size: 0.9em;">
-         Hệ thống tổng hợp giọng nói tiếng Việt | Powered by <b>Valtec AI Team</b>
-         </div>
-         """)
-
-     return demo
-
-
- # ============== Main ==============
-
- if __name__ == "__main__":
-     print("Đang khởi động hệ thống tổng hợp giọng nói tiếng Việt...")
-
-     # Check viphoneme
-     check_viphoneme()
-
-     tts_interface = TTSInterface()
-     demo = create_demo(tts_interface)
-
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         show_error=True,
-     )
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ """
+ CVNSS4.0 Vietnamese TTS Studio
+ - Architecture: Modular CSS & Component Separation
+ - UX: High Contrast Input Fields
+ - Core: Optimized Logic Flow
+ """
+
+ import os
+ import sys
+ import json
+ import time
+ import glob
+ import re
+ import hashlib
+ import tempfile
+ from pathlib import Path
+
+ import torch
+ import numpy as np
+ import soundfile as sf
+ import gradio as gr
+
+ # Add src to path for imports
+ sys.path.insert(0, str(Path(__file__).parent))
+
+ # Import core modules
+ try:
+     from src.vietnamese.text_processor import process_vietnamese_text
+     from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
+     from src.models.synthesizer import SynthesizerTrn
+     from src.text.symbols import symbols
+ except ImportError:
+     # Fallback for environment setup if src is missing during init
+     print("⚠️ Core modules not found. Ensure 'src' directory exists.")
+     VIPHONEME_AVAILABLE = False
+     symbols = []
+
+
+ # =========================================================
+ # 1) SYSTEM CONFIGURATION & CSS (The Expert Layer)
+ # =========================================================
+
+ # Expert CSS: Definitive Z-Index Management & Neon Theme
+ NEON_CSS = r"""
+ :root {
+     --bg-dark: #0f172a;
+     --bg-panel: rgba(30, 41, 59, 0.7);
+     --line: rgba(148, 163, 184, 0.1);
+     --text-primary: #e2e8f0;
+     --neon-cyan: #06b6d4;
+     --neon-accent: #38bdf8;
+     --radius-lg: 16px;
+     --radius-sm: 8px;
+
+     /* UX Color Palette for Inputs */
+     --input-bg: #f1f5f9;        /* Light Blue-Grey for readability */
+     --input-text: #0f4c81;      /* Classic Blue (Dark Blue) for high contrast */
+     --input-placeholder: #64748b;
+ }
+
+ body, .gradio-container, .app {
+     background: radial-gradient(circle at 50% 0%, #1e293b 0%, #0f172a 100%) !important;
+     color: var(--text-primary) !important;
+     font-family: 'Inter', 'Segoe UI', sans-serif;
+ }
+
+ /* --- ISOLATION FULL: CVNSS4.0 Vietnamese TTS Studio --- */
+ .panelNeon {
+     border: 1px solid rgba(255,255,255,0.08);
+     border-radius: var(--radius-lg);
+     background: var(--bg-panel);
+     backdrop-filter: blur(12px);
+     box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
+     padding: 20px;
+     position: relative;
+     isolation: isolate;
+     z-index: 1;
+     margin-bottom: 20px;
+ }
+
+ /* UX IMPROVEMENT: High Contrast Input Styling */
+ .panelNeon textarea, .panelNeon input[type="text"] {
+     background: var(--input-bg) !important;
+     color: var(--input-text) !important; /* DARK BLUE TEXT requested */
+     border: 2px solid transparent !important;
+     border-radius: var(--radius-sm) !important;
+     font-weight: 500 !important;
+     font-size: 1rem !important;
+     line-height: 1.5 !important;
+     padding: 12px !important;
+     transition: all 0.2s ease;
+     z-index: 10 !important;
+     position: relative !important;
+ }
+
+ .panelNeon textarea::placeholder {
+     color: var(--input-placeholder) !important;
+ }
+
+ .panelNeon textarea:focus, .panelNeon input:focus {
+     background: #ffffff !important;
+     border-color: var(--neon-cyan) !important;
+     box-shadow: 0 0 0 4px rgba(6, 182, 212, 0.15) !important;
+     color: #000000 !important; /* Even darker on focus */
+ }
+
+ /* Label Styling */
+ .panelNeon label span {
+     color: var(--neon-accent) !important;
+     font-weight: 600;
+     font-size: 0.85rem;
+     text-transform: uppercase;
+     letter-spacing: 0.05em;
+     margin-bottom: 8px;
+     display: block;
+ }
+
+ /* Dropdown & Slider fixes */
+ .panelNeon .wrap, .panelNeon .range-compact {
+     z-index: 10 !important;
+ }
+
+ /* Button Upgrades */
+ button.primary, .gr-button-primary {
+     background: linear-gradient(135deg, #06b6d4 0%, #3b82f6 100%) !important;
+     border: none !important;
+     color: white !important;
+     font-weight: 700 !important;
+     transition: transform 0.1s ease, box-shadow 0.2s ease;
+ }
+ button.primary:hover, .gr-button-primary:hover {
+     box-shadow: 0 10px 15px -3px rgba(6, 182, 212, 0.3) !important;
+     transform: translateY(-1px);
+ }
+ button.primary:active {
+     transform: translateY(0px);
+ }
+
+ /* Status Panel */
+ .statusCard {
+     background: rgba(15, 23, 42, 0.6);
+     border-radius: var(--radius-sm);
+     padding: 16px;
+     border: 1px solid rgba(255,255,255,0.05);
+ }
+ .pill {
+     display: inline-flex;
+     align-items: center;
+     padding: 4px 12px;
+     border-radius: 99px;
+     background: rgba(56, 189, 248, 0.1);
+     color: #38bdf8;
+     border: 1px solid rgba(56, 189, 248, 0.2);
+     font-size: 0.8rem;
+     font-weight: 600;
+     margin-right: 6px;
+     margin-bottom: 6px;
+ }
+ .alert { padding: 12px; border-radius: 8px; margin-top: 12px; font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 8px; }
+ .alertOk { background: rgba(34, 197, 94, 0.1); color: #4ade80; border: 1px solid rgba(34, 197, 94, 0.2); }
+ .alertWarn { background: rgba(234, 179, 8, 0.1); color: #facc15; border: 1px solid rgba(234, 179, 8, 0.2); }
+ """
+
+ # =========================================================
+ # 2) UTILITIES & HELPERS
+ # =========================================================
+ def check_viphoneme():
+     if not VIPHONEME_AVAILABLE:
+         print("⚠️ Viphoneme not available.")
+         return False
+     try:
+         phones, _, _ = text_to_phonemes("Test", use_viphoneme=True)
+         print("✅ Viphoneme active.")
+         return True
+     except Exception as e:
+         print(f"❌ Viphoneme error: {e}")
+         return False
+
+ def md5_key(*parts: str) -> str:
+     return hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
+
+ def split_sentences_vi(text: str, max_chars: int):
+     # Improved splitting logic
+     if not text: return []
+     text = re.sub(r'\s+', ' ', text).strip()
+     # Split by delimiters keeping delimiters
+     parts = re.split(r'([.?!;:])', text)
+
+     chunks = []
+     current_chunk = ""
+
+     for i in range(0, len(parts) - 1, 2):
+         sentence = parts[i] + parts[i+1]
+         if len(current_chunk) + len(sentence) <= max_chars:
+             current_chunk += sentence
+         else:
+             if current_chunk: chunks.append(current_chunk.strip())
+             current_chunk = sentence
+
+     if len(parts) % 2 != 0 and parts[-1]:
+         sentence = parts[-1]
+         if len(current_chunk) + len(sentence) <= max_chars:
+             current_chunk += sentence
+         else:
+             if current_chunk: chunks.append(current_chunk.strip())
+             current_chunk = sentence
+
+     if current_chunk: chunks.append(current_chunk.strip())
+     return chunks
+
+ # =========================================================
+ # 3) CORE ENGINE WRAPPER
+ # =========================================================
+ class TTSManager:
+     """Singleton-like manager for TTS operations."""
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"🔧 Initializing TTS on {self.device}...")
+
+         self.model_dir = self._get_model_dir()
+         self.ckpt_path = find_latest_checkpoint(self.model_dir, "G")
+         self.cfg_path = os.path.join(self.model_dir, "config.json")
+
+         if not self.ckpt_path:
+             raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")
+
+         self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
+         self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
+         self.temp_dir.mkdir(parents=True, exist_ok=True)
+
+     def _get_model_dir(self):
+         return download_model()
+
+     def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
+         try:
+             if not text or not text.strip():
+                 return None, "⚠️ Empty input"
+
+             key = md5_key(speaker, f"{speed:.2f}", text[:20], str(len(text)))
+             out_path = self.temp_dir / f"{key}.wav"
+
+             if out_path.exists():
+                 return str(out_path), "✅ Cached (From history)"
+
+             audio, sr = self.tts.synthesize(
+                 text=text, speaker=speaker, length_scale=speed,
+                 noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
+             )
+             sf.write(str(out_path), audio, sr)
+             return str(out_path), "✅ Generated successfully"
+         except Exception as e:
+             # Capture full traceback if needed, but return clean msg
+             return None, f"❌ Error: {str(e)}"
+
+ # =========================================================
+ # 4) MODEL LOGIC (PRESERVED & FIXED)
+ # =========================================================
+ def find_latest_checkpoint(model_dir, prefix="G"):
+     pattern = os.path.join(model_dir, f"{prefix}*.pth")
+     checkpoints = glob.glob(pattern)
+     if not checkpoints: return None
+     checkpoints.sort(key=lambda x: int(re.search(rf"{prefix}(\d+)\.pth", x).group(1)) if re.search(rf"{prefix}(\d+)\.pth", x) else 0, reverse=True)
+     return checkpoints[0]
+
+ def download_model():
+     from huggingface_hub import snapshot_download
+     hf_repo = "valtecAI-team/valtec-tts-pretrained"
+     cache_base = Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache"))
+     if os.name == "nt": cache_base = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local"))
+
+     model_dir = cache_base / "valtec_tts" / "models" / "vits-vietnamese"
+     if (model_dir / "config.json").exists() and list(model_dir.glob("G_*.pth")):
+         return str(model_dir)
+
+     print(f"⬇️ Downloading {hf_repo}...")
+     snapshot_download(repo_id=hf_repo, local_dir=str(model_dir))
+     return str(model_dir)
+
+ class VietnameseTTS:
+     def __init__(self, ckpt, cfg, device="cpu"):
+         self.device = device
+         with open(cfg, "r", encoding="utf-8") as f: self.config = json.load(f)
+         self.spk2id = self.config["data"]["spk2id"]
+         self.speakers = list(self.spk2id.keys())
+         self._load(ckpt)
+
+     def _load(self, ckpt):
+         self.model = SynthesizerTrn(
+             len(symbols),
+             self.config["data"]["filter_length"] // 2 + 1,
+             self.config["train"]["segment_size"] // self.config["data"]["hop_length"],
+             n_speakers=self.config["data"]["n_speakers"],
+             **self.config["model"]
+         ).to(self.device)
+         state = torch.load(ckpt, map_location=self.device)["model"]
+         self.model.load_state_dict({k.replace("module.", ""): v for k,v in state.items()}, strict=False)
+         self.model.eval()
+
+     def synthesize(self, text, speaker, **kwargs):
+         from src.text import cleaned_text_to_sequence
+         from src.nn import commons
+
+         # 1. Text Processing
+         norm_text = process_vietnamese_text(text)
+         phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
+         phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
+
+         phone_ids = commons.intersperse(phone_ids, 0)
+         tone_ids = commons.intersperse(tone_ids, 0)
+         lang_ids = commons.intersperse(lang_ids, 0)
+
+         # 2. Prepare Tensors
+         x = torch.LongTensor(phone_ids).unsqueeze(0).to(self.device)
+         x_len = torch.LongTensor([len(phone_ids)]).to(self.device)
+         tone = torch.LongTensor(tone_ids).unsqueeze(0).to(self.device)
+         lang = torch.LongTensor(lang_ids).unsqueeze(0).to(self.device)
+         sid = torch.LongTensor([self.spk2id.get(speaker, 0)]).to(self.device)
+
+         # 3. Inference with Gradient Safety (FIX IS HERE)
+         with torch.no_grad():
+             bert = torch.zeros(1024, len(phone_ids)).unsqueeze(0).to(self.device)
+             ja_bert = torch.zeros(768, len(phone_ids)).unsqueeze(0).to(self.device)
+
+             # Run inference
+             # The error "Can't call numpy() on Tensor that requires grad" means output has grad_fn.
+             # We use .detach() before .cpu() to ensure the graph is cut.
+             outputs = self.model.infer(
+                 x, x_len, sid, tone, lang,
+                 bert, ja_bert,
+                 **kwargs
+             )
+
+             audio = outputs[0][0,0].detach().cpu().numpy()
+
+         return audio, self.config["data"]["sampling_rate"]
+
+ # =========================================================
+ # 5) UI CONSTRUCTION (REFACTORED)
+ # =========================================================
+ def create_ui(manager: TTSManager):
+
+     def ui_header():
+         return gr.HTML("""
+         <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
+             <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem; letter-spacing: -0.02em;">
+                 🎛️ CVNSS4.0 Vietnamese TTS Studio
+             </h1>
+             <div style="color: #94a3b8; font-size: 1rem; margin-top: 5px; font-weight: 400;">
+                 Công nghệ tích hợp giọng nói AI tiên tiến • Phiên bản 1.0.0 Demo • Dự án mã nguồn mở
+             </div>
+         </div>
+         """)
+
+     def ui_status_render(text, speaker, speed, chunks, dur, msg):
+         return f"""
+         <div class="statusCard">
+             <div style="margin-bottom:12px; font-weight:700; color:#38bdf8; font-size: 0.9rem; text-transform: uppercase;">
+                 📟 Trạng thái hoạt động
+             </div>
+             <div style="display:flex; flex-wrap:wrap; gap:8px;">
+                 <span class="pill">🎤 {speaker}</span>
+                 <span class="pill">⚡ {speed}x</span>
+                 <span class="pill">📄 {len(text)} ký tự</span>
+                 <span class="pill">🧩 {chunks} đoạn</span>
+             </div>
+             <div class="alert {'alertOk' if '✅' in msg else 'alertWarn'}">
+                 {msg}
+             </div>
+         </div>
+         """
+
+     with gr.Blocks(theme=gr.themes.Base(), css=NEON_CSS, title="Neon TTS Expert") as app:
+         ui_header()
+
+         with gr.Tabs():
+             # --- TAB BASIC ---
+             with gr.Tab("⚡ Chế độ Nhanh"):
+                 with gr.Row():
+                     # INPUT COLUMN
+                     with gr.Column(scale=2):
+                         # REFACTOR: Using a specific ID for the container to target with CSS isolation
+                         with gr.Group(elem_classes=["panelNeon"], elem_id="input-panel-basic"):
+                             gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
+
+                             # THE FIX: Pure Textbox with updated styling (Dark Blue text)
+                             txt_basic = gr.Textbox(
+                                 label="",
+                                 show_label=False,
+                                 placeholder="Nhập nội dung tiếng Việt vào... (Ví dụ: Xin chào, bạn đã học qua CVNSS4.0 chưa?)",
+                                 lines=6,
+                                 elem_id="main-input-basic"
+                             )
+
+                             with gr.Row():
+                                 spk_basic = gr.Dropdown(choices=manager.tts.speakers, value=manager.tts.speakers[0], label="Giọng đọc")
+                                 spd_basic = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Tốc độ đọc")
+
+                             btn_basic = gr.Button("🔊 Đọc ngay", variant="primary")
+
+                     # OUTPUT COLUMN
+                     with gr.Column(scale=1):
+                         with gr.Group(elem_classes=["panelNeon"]):
+                             gr.HTML('<div class="panelTitle">🔊 Kết quả</div>')
+                             out_audio_basic = gr.Audio(label="Audio Output", type="filepath", interactive=False)
+                             out_status_basic = gr.HTML()
+
+             # --- TAB ADVANCED ---
+             with gr.Tab("🧠 Chế độ Chuyên sâu"):
+                 with gr.Row():
+                     with gr.Column(scale=2):
+                         with gr.Group(elem_classes=["panelNeon"], elem_id="input-panel-adv"):
+                             gr.HTML('<div class="panelTitle">📝 Xử lý văn bản dài</div>')
+                             txt_adv = gr.Textbox(
+                                 label="",
+                                 show_label=False,
+                                 lines=8,
+                                 placeholder="Nhập văn bản dài cần ngắt câu tự động...",
+                                 elem_id="main-input-adv"
+                             )
+
+                             gr.HTML('<div style="height:15px"></div>')
+                             gr.HTML('<div class="panelTitle">🎚️ Tham số âm thanh</div>')
+
+                             with gr.Row():
+                                 ns = gr.Slider(0.1, 1.5, 0.667, step=0.01, label="Noise Scale (Độ biến thiên)")
+                                 nsw = gr.Slider(0.1, 1.5, 0.8, step=0.01, label="Duration Noise (Độ động)")
+
+                             with gr.Row():
+                                 sdp = gr.Slider(0.0, 1.0, 0.2, step=0.1, label="SDP Ratio (Ngẫu nhiên)")
+                                 max_chars = gr.Slider(50, 500, 300, step=10, label="Ngắt đoạn (ký tự)")
+                                 pause = gr.Slider(0, 1000, 250, step=50, label="Nghỉ câu (ms)")
+
+                             btn_adv = gr.Button("🧠 Xử lý & Ghép nối", variant="primary")
+
+                     with gr.Column(scale=1):
+                         with gr.Group(elem_classes=["panelNeon"]):
+                             out_audio_adv = gr.Audio(label="Merged Audio", type="filepath", interactive=False)
+                             out_status_adv = gr.HTML()
+
+         # --- LOGIC BINDING ---
+         def run_basic(text, spk, spd):
+             path, msg = manager.synthesize(text, spk, spd, 0.667, 0.8, 0.2)
+             html = ui_status_render(text, spk, spd, 1, 0, msg)
+             return path, html
+
+         def run_adv(text, spk, spd, ns, nsw, sdp, mc, p, progress=gr.Progress()):
+             chunks = split_sentences_vi(text, int(mc))
+             audios = []
+             sr = 44100
+
+             for i, chunk in enumerate(chunks):
+                 progress((i)/len(chunks), desc=f"Đang xử lý đoạn {i+1}/{len(chunks)}")
+                 path, _ = manager.synthesize(chunk, spk, spd, ns, nsw, sdp)
+                 if path:
+                     data, rate = sf.read(path)
+                     audios.append(data)
+                     sr = rate
+                     if p > 0:
+                         audios.append(np.zeros(int(rate * p/1000)))
+
+             if not audios: return None, "❌ Không tạo được âm thanh"
+
+             full_audio = np.concatenate(audios)
+             out_path = manager.temp_dir / f"merged_{int(time.time())}.wav"
+             sf.write(str(out_path), full_audio, sr)
+
+             html = ui_status_render(text, spk, spd, len(chunks), len(full_audio)/sr, "✅ Đã ghép nối thành công")
+             return str(out_path), html
+
+         btn_basic.click(run_basic, [txt_basic, spk_basic, spd_basic], [out_audio_basic, out_status_basic])
+         btn_adv.click(run_adv, [txt_adv, spk_basic, spd_basic, ns, nsw, sdp, max_chars, pause], [out_audio_adv, out_status_adv])
+
+     return app
+
+ # =========================================================
+ # 6) ENTRY POINT
+ # =========================================================
+ if __name__ == "__main__":
+     print("🚀 Starting Expert Neon TTS...")
+
+     # Check dependencies
+     check_viphoneme()
+
+     # Init Manager & UI
+     try:
+         tts_manager = TTSManager()
+         app = create_ui(tts_manager)
+
+         port = int(os.environ.get("PORT", "7860"))
+         app.queue(max_size=10).launch(server_name="0.0.0.0", server_port=port)
+     except Exception as e:
+         print(f"❌ Fatal Error: {e}")