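"""Compute audio duration statistics (file counts and average clip length in
seconds) for the GigaSpeech, LibriSpeech, and Common Voice corpora configured
below, and print the summary as JSON."""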
import json
import os
import soundfile as sf
from pathlib import Path
DATA_CONFIG = [
    {"name": "gigaspeech", "subset": "xl", "split": "train"},
    {"name": "librispeech", "subset": "train-clean-360"},
    {"name": "commonvoice", "subset": "validated"},
]
ROOTS = {
    "gigaspeech": "/home/work/AIDAS/data/audio/GigaSpeech",
    "librispeech": "/home/work/AIDAS/data/audio/LibriSpeech",
    "commonvoice": "/home/work/AIDAS/data/audio/commonvoice/cv-corpus-22.0-2025-06-20/en",
}
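# Each iterator below yields audio file paths for one dataset; main() checks
# that a path exists on disk before reading it and skips it otherwise.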
def iter_gigaspeech(cfg):
    import datasets

    # `path` is whatever the `datasets` library resolves for the decoded audio
    # (typically a file in the local Hugging Face cache); main() skips it if it
    # does not exist on disk.
    ds = datasets.load_dataset("speechcolab/gigaspeech", cfg["subset"], split=cfg["split"])
    for row in ds:
        yield row["audio"]["path"]
def iter_librispeech(cfg):
    subset_root = Path(ROOTS["librispeech"]) / cfg["subset"]
    # Each chapter directory contains a "<speaker>-<chapter>.trans.txt" transcript
    # whose lines start with an utterance ID such as "103-1240-0000".
    for txt in subset_root.glob("*/**/*.txt"):
        with txt.open() as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue
                audio_id = parts[0]
                speaker, chapter, _ = audio_id.split("-")
                audio_path = subset_root / speaker / chapter / f"{audio_id}.flac"
                yield audio_path
def iter_commonvoice(cfg):
    import pandas as pd

    # The subset TSV (e.g. validated.tsv) lists clip filenames relative to the clips/ directory.
    tsv = Path(ROOTS["commonvoice"]) / f"{cfg['subset']}.tsv"
    df = pd.read_csv(tsv, sep="\t", usecols=["path"])
    clips_root = Path(ROOTS["commonvoice"]) / "clips"
    for rel in df["path"]:
        yield clips_root / rel
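# Dispatch table: dataset name -> path iterator, so main() stays dataset-agnostic.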
DISPATCH = {
    "gigaspeech": iter_gigaspeech,
    "librispeech": iter_librispeech,
    "commonvoice": iter_commonvoice,
}
def main():
    total_sec = 0.0
    total_files = 0
    per_dataset = []
    for cfg in DATA_CONFIG:
        name = cfg["name"]
        iterator = DISPATCH[name](cfg)
        subset_total = 0.0
        subset_files = 0
        for audio_path in iterator:
            if not os.path.isfile(audio_path):
                continue
            # sf.info reads only the header, so this stays fast even for large corpora.
            # Note: Common Voice clips are MP3, which needs libsndfile >= 1.1.
            info = sf.info(str(audio_path))
            duration = info.frames / info.samplerate
            subset_total += duration
            subset_files += 1
            total_sec += duration
            total_files += 1
        per_dataset.append({
            "name": name,
            "subset": cfg.get("subset"),
            "split": cfg.get("split"),
            "num_files": subset_files,
            "avg_seconds": subset_total / subset_files if subset_files else 0.0,
        })
    summary = {
        "total_files": total_files,
        "overall_avg_seconds": total_sec / total_files if total_files else 0.0,
        "per_dataset": per_dataset,
    }
    print(json.dumps(summary, indent=2))
if __name__ == "__main__":
    main()