# Hugging Face Spaces banner (scrape residue): "Spaces: Running on Zero"
import json
import os
from pathlib import Path

import soundfile as sf
# Corpora to survey: each entry names a corpus plus the subset/split to walk.
# Keys other than "name" are corpus-specific and consumed by the matching
# iter_* function via DISPATCH.
DATA_CONFIG = [
    {"name": "gigaspeech", "subset": "xl", "split": "train"},
    {"name": "librispeech", "subset": "train-clean-360"},
    {"name": "commonvoice", "subset": "validated"},
]

# On-disk roots of the locally stored corpora.
ROOTS = {
    "gigaspeech": "/home/work/AIDAS/data/audio/GigaSpeech",
    "librispeech": "/home/work/AIDAS/data/audio/LibriSpeech",
    "commonvoice": "/home/work/AIDAS/data/audio/commonvoice/cv-corpus-22.0-2025-06-20/en",
}
def iter_gigaspeech(cfg):
    """Yield audio file paths for a GigaSpeech subset/split via HF datasets."""
    # Local import: heavy third-party dependency, only needed for this corpus.
    import datasets

    dataset = datasets.load_dataset(
        "speechcolab/gigaspeech", cfg["subset"], split=cfg["split"]
    )
    for example in dataset:
        yield example["audio"]["path"]
def iter_librispeech(cfg):
    """Yield .flac paths for a LibriSpeech subset by parsing transcript files.

    LibriSpeech lays audio out as ``<speaker>/<chapter>/<spk-chap-utt>.flac``
    with per-chapter ``*.trans.txt`` transcripts whose lines start with the
    utterance id.  The id's first two hyphen-separated fields give the
    speaker and chapter directories.
    """
    subset_root = Path(ROOTS["librispeech"]) / cfg["subset"]
    for txt in subset_root.glob("*/**/*.txt"):
        # Be explicit about the encoding so the read never depends on the
        # locale default.
        with txt.open(encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue  # blank line
                audio_id = parts[0]
                try:
                    # maxsplit=2 keeps any extra hyphens inside the third
                    # field; the original unbounded split raised ValueError
                    # on such tokens and aborted the whole scan.
                    speaker, chapter, _ = audio_id.split("-", 2)
                except ValueError:
                    # First token is not a "spk-chap-utt" id — not a
                    # transcript line we can map to a file; skip it.
                    continue
                yield subset_root / speaker / chapter / f"{audio_id}.flac"
def iter_commonvoice(cfg):
    """Yield clip paths for a Common Voice subset from its TSV manifest."""
    # Local import: pandas is only needed for this corpus.
    import pandas as pd

    root = Path(ROOTS["commonvoice"])
    # Only the "path" column is needed; skip the rest of the manifest.
    manifest = pd.read_csv(root / f"{cfg['subset']}.tsv", sep="\t", usecols=["path"])
    clips_root = root / "clips"
    for rel_path in manifest["path"]:
        yield clips_root / rel_path
# Corpus name -> path-yielding generator; main() resolves iterators here.
DISPATCH = {
    "gigaspeech": iter_gigaspeech,
    "librispeech": iter_librispeech,
    "commonvoice": iter_commonvoice,
}
def main():
    """Survey every configured corpus and print a JSON duration summary.

    For each entry in DATA_CONFIG, walks the corpus's audio paths (via
    DISPATCH), reads each file's header with soundfile, and accumulates
    file counts and durations.  Prints a JSON object with per-dataset and
    overall totals/averages to stdout.
    """
    total_sec = 0.0
    total_files = 0
    per_dataset = []
    for cfg in DATA_CONFIG:
        name = cfg["name"]
        iterator = DISPATCH[name](cfg)
        subset_total = 0.0
        subset_files = 0
        for audio_path in iterator:
            if not os.path.isfile(audio_path):
                continue  # manifests can reference files missing on disk
            try:
                # sf.info reads only the header — cheap even for long files.
                info = sf.info(str(audio_path))
            except RuntimeError:
                # soundfile raises RuntimeError (LibsndfileError) on
                # corrupt/unreadable audio; skip the file instead of
                # aborting the entire multi-corpus scan.
                continue
            duration = info.frames / info.samplerate
            subset_total += duration
            subset_files += 1
            total_sec += duration
            total_files += 1
        per_dataset.append({
            "name": name,
            "subset": cfg.get("subset"),
            "split": cfg.get("split"),
            "num_files": subset_files,
            # New backward-compatible key: the total was already computed
            # but never reported.
            "total_seconds": subset_total,
            "avg_seconds": subset_total / subset_files if subset_files else 0.0,
        })
    summary = {
        "total_files": total_files,
        "total_seconds": total_sec,
        "overall_avg_seconds": total_sec / total_files if total_files else 0.0,
        "per_dataset": per_dataset,
    }
    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()