Spaces: Running on Zero
Update app.py
app.py CHANGED
```diff
@@ -176,38 +176,38 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     audio, sr = ref_audio
 
-    # Convert …
+    # Convert PyTorch tensor to NumPy array before scaling and processing
+    audio_np = audio.cpu().numpy()  # Convert to NumPy (if it's a tensor)
+
+    # Convert NumPy audio array to PyDub AudioSegment
     audio_segment = AudioSegment(
-        (audio * 32768).astype(np.int16).tobytes(),
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
         frame_rate=sr,
         sample_width=2,  # 16-bit audio
-        channels=1 if len(audio.shape) == 1 else audio.shape[0]
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
     )
 
-    # Remove silence using …
-    …
-
-    # Convert back to NumPy array for further processing
-    audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
-
-    # Continue processing with trimmed audio
-    audio = torch.from_numpy(audio_trimmed).unsqueeze(0)  # Add batch/channel dimension
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
 
+    # Convert trimmed audio back to a PyTorch tensor
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float32) / 32768
+    ).unsqueeze(0)  # Add batch/channel dimension
 
+    # Normalize and resample
     if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
 
-    rms = torch.sqrt(torch.mean(torch.square(audio)))
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
     if rms < target_rms:
-        audio = audio * target_rms / rms
+        audio *= target_rms / rms  # Adjust RMS
+
     if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
 
-    audio = audio.to(device)
+    audio = audio.to(device)  # Move to target device
 
     generated_waves = []
     spectrograms = []
```
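The substance of the change: PyDub's `AudioSegment` expects raw PCM bytes, and the incoming `audio` may be a PyTorch tensor (possibly on the GPU) with no `.astype()` method, so it has to be moved to the CPU and converted to a NumPy array before the 16-bit scaling. Below is a minimal self-contained sketch of the same round-trip, assuming mono float32 input in [-1, 1]. `trim_silence_edges` here is a stand-in built on PyDub's `detect_leading_silence`, not the Space's own `remove_silence_edges`, and the `target_rms`, `target_sample_rate`, and `device` defaults are illustrative placeholders for values defined elsewhere in app.py.

```python
import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_leading_silence


def trim_silence_edges(seg: AudioSegment, silence_threshold: float = -42.0) -> AudioSegment:
    """Stand-in for the Space's remove_silence_edges: drop leading/trailing silence."""
    start = detect_leading_silence(seg, silence_threshold=silence_threshold)
    end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
    return seg[start:len(seg) - end]


def preprocess_ref_audio(
    audio: torch.Tensor,
    sr: int,
    target_rms: float = 0.1,          # placeholder for the app's target_rms
    target_sample_rate: int = 24000,  # placeholder for the app's target_sample_rate
    device: str = "cpu",              # placeholder for the app's device
) -> torch.Tensor:
    # Tensor -> NumPy first; PyDub only understands raw PCM bytes.
    audio_np = audio.cpu().numpy().squeeze()

    seg = AudioSegment(
        (audio_np * 32768).astype(np.int16).tobytes(),  # float [-1, 1] -> 16-bit PCM
        frame_rate=sr,
        sample_width=2,  # 16-bit audio
        channels=1,      # sketch assumes mono input
    )
    seg = trim_silence_edges(seg)

    # Back to a (1, T) float tensor in [-1, 1].
    audio = torch.tensor(
        np.array(seg.get_array_of_samples(), dtype=np.float32) / 32768
    ).unsqueeze(0)

    # Match the reference loudness and sample rate expected by the model.
    rms = torch.sqrt((audio**2).mean())
    if rms < target_rms:
        audio = audio * target_rms / rms
    if sr != target_sample_rate:
        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
    return audio.to(device)
```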
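A quick hypothetical check of the sketch, using a synthetic one-second sine as the reference clip:

```python
import math

# Hypothetical usage: a one-second 440 Hz sine at 16 kHz as the reference clip.
sr = 16000
t = torch.linspace(0, 1, sr)
ref = 0.5 * torch.sin(2 * math.pi * 440 * t).unsqueeze(0)  # shape (1, 16000)

processed = preprocess_ref_audio(ref, sr)
print(processed.shape)  # roughly torch.Size([1, 24000]) after resampling to 24 kHz
```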