"""Audit local audio datasets: file counts and clip durations per dataset.

For each configured dataset (GigaSpeech, LibriSpeech, Common Voice) this
script enumerates the expected audio files, probes each file's header with
soundfile (no full decode), and prints a JSON summary with per-dataset and
overall counts, total seconds, and average clip length.
"""

import json
import os
from pathlib import Path

import soundfile as sf

# Which dataset subsets/splits to audit. "split" is only meaningful for
# datasets loaded through the HF hub (GigaSpeech).
DATA_CONFIG = [
    {"name": "gigaspeech", "subset": "xl", "split": "train"},
    {"name": "librispeech", "subset": "train-clean-360"},
    {"name": "commonvoice", "subset": "validated"},
]

# Local filesystem roots for each corpus.
ROOTS = {
    "gigaspeech": "/home/work/AIDAS/data/audio/GigaSpeech",
    "librispeech": "/home/work/AIDAS/data/audio/LibriSpeech",
    "commonvoice": "/home/work/AIDAS/data/audio/commonvoice/cv-corpus-22.0-2025-06-20/en",
}


def iter_gigaspeech(cfg):
    """Yield audio file paths for a GigaSpeech subset/split via HF `datasets`.

    Requires the `datasets` package (imported lazily so the other iterators
    work without it) and hub access to speechcolab/gigaspeech.
    """
    import datasets

    ds = datasets.load_dataset(
        "speechcolab/gigaspeech", cfg["subset"], split=cfg["split"]
    )
    # Disable on-access decoding: we only need the path. Without this, every
    # row access decodes the clip, which for the "xl" split means reading the
    # entire multi-terabyte corpus just to enumerate filenames.
    ds = ds.cast_column("audio", datasets.Audio(decode=False))
    for row in ds:
        yield row["audio"]["path"]


def iter_librispeech(cfg):
    """Yield .flac paths for a LibriSpeech subset by walking its transcripts.

    Transcript lines look like "<speaker>-<chapter>-<utt> <text...>"; the
    corresponding audio lives at <root>/<speaker>/<chapter>/<id>.flac.
    """
    subset_root = Path(ROOTS["librispeech"]) / cfg["subset"]
    for txt in subset_root.glob("*/**/*.txt"):
        with txt.open() as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue  # blank line in transcript
                audio_id = parts[0]
                speaker, chapter, _ = audio_id.split("-")
                yield subset_root / speaker / chapter / f"{audio_id}.flac"


def iter_commonvoice(cfg):
    """Yield clip paths for a Common Voice subset from its TSV manifest.

    Only the "path" column is read; entries are relative to <root>/clips.
    """
    import pandas as pd

    tsv = Path(ROOTS["commonvoice"]) / f"{cfg['subset']}.tsv"
    df = pd.read_csv(tsv, sep="\t", usecols=["path"])
    clips_root = Path(ROOTS["commonvoice"]) / "clips"
    for rel in df["path"]:
        yield clips_root / rel


# Dataset name -> path-iterator factory.
DISPATCH = {
    "gigaspeech": iter_gigaspeech,
    "librispeech": iter_librispeech,
    "commonvoice": iter_commonvoice,
}


def main():
    """Probe every configured dataset and print a JSON duration summary.

    Files that are missing on disk, or whose headers soundfile cannot read,
    are skipped (unreadable ones are counted in "num_unreadable") so one bad
    file cannot abort a multi-hour audit.
    """
    total_sec = 0.0
    total_files = 0
    per_dataset = []

    for cfg in DATA_CONFIG:
        name = cfg["name"]
        iterator = DISPATCH[name](cfg)
        subset_total = 0.0
        subset_files = 0
        subset_bad = 0

        for audio_path in iterator:
            if not os.path.isfile(audio_path):
                continue  # manifest entry with no file on disk
            try:
                # sf.info reads only the header — cheap even for long clips.
                info = sf.info(str(audio_path))
            except RuntimeError:
                # LibsndfileError subclasses RuntimeError: corrupt/truncated
                # file. Count it and keep going rather than abort the audit.
                subset_bad += 1
                continue
            duration = info.duration
            subset_total += duration
            subset_files += 1
            total_sec += duration
            total_files += 1

        per_dataset.append({
            "name": name,
            "subset": cfg.get("subset"),
            "split": cfg.get("split"),
            "num_files": subset_files,
            "num_unreadable": subset_bad,
            "total_seconds": subset_total,
            "avg_seconds": subset_total / subset_files if subset_files else 0.0,
        })

    summary = {
        "total_files": total_files,
        "total_seconds": total_sec,
        "overall_avg_seconds": total_sec / total_files if total_files else 0.0,
        "per_dataset": per_dataset,
    }
    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()