|
|
import sys |
|
|
import os |
|
|
import os.path as osp |
|
|
from glob import glob |
|
|
from tqdm import tqdm |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import json |
|
|
import yaml |
|
|
import argparse |
|
|
|
|
|
|
|
|
def preprocess_audiocaps(data_root):
    """Collect (audio path -> caption) pairs for the AudioCaps training split.

    Args:
        data_root: Root directory containing the `AudioCaps` dataset folder.

    Returns:
        dict mapping audio file paths to caption strings; entries whose
        audio file is missing on disk are skipped.
    """
    data_dir = f'{data_root}/AudioCaps'
    src_file = f'{data_dir}/dataset/metadata/audiocaps/datafiles/audiocaps_train_label.json'
    with open(src_file, 'r') as f:
        data = json.load(f)['data']
    audios, captions = [], []
    for item in tqdm(data, desc='AudioCaps'):
        audio_path = f'{data_dir}/dataset/audioset/' + item['wav']
        if not osp.exists(audio_path):
            # Bug fix: `break` aborted the whole scan at the first missing
            # file; skip only that entry, consistent with the other
            # preprocess_* functions in this file.
            continue
        audios.append(audio_path)
        captions.append(item['caption'])

    return {path: cap for path, cap in zip(audios, captions)}
|
|
|
|
|
|
|
|
def preprocess_audiocaps_test(data_root):
    """Collect (audio path -> caption) pairs for the AudioCaps test subset.

    Every listed audio file must exist on disk; a missing file raises an
    AssertionError naming the offending path.
    """
    data_dir = f'{data_root}/AudioCaps'
    meta_path = (f'{data_dir}/dataset/metadata/audiocaps/datafiles/'
                 'audiocaps_test_nonrepeat_subset_2.json')
    with open(meta_path, 'r') as f:
        entries = json.load(f)['data']
    paths, caps = [], []
    for entry in tqdm(entries, desc='AudioCaps_test'):
        # Keep only the last two path components (subfolder/filename).
        rel_name = osp.sep.join(entry['wav'].split('/')[-2:])
        full_path = (f'{data_dir}/dataset/audioset/zip_audios/'
                     'unbalanced_train_segments/' + rel_name)
        assert osp.exists(full_path), full_path
        paths.append(full_path)
        caps.append(entry['caption'])

    return dict(zip(paths, caps))
|
|
|
|
|
|
|
|
def preprocess_vggsound(data_root):
    """Collect (audio path -> caption) pairs for the VGGSound training split.

    Only rows marked `train` are kept; the class label is used as caption.
    """
    data_dir = f'{data_root}/VGGSound'
    meta = pd.read_csv(f'{data_dir}/vggsound.csv')
    paths, caps = [], []
    rows = tqdm(meta.itertuples(index=False), total=len(meta), desc='VGGSound')
    for _id, start, label, split in rows:
        if split != 'train':
            continue
        # Audio files are named `<youtube_id>_<start_seconds, zero-padded>.wav`.
        wav_path = f'{data_dir}/audio/{_id}_{int(start):06d}.wav'
        if not osp.exists(wav_path):
            continue
        paths.append(wav_path)
        caps.append(label)

    return dict(zip(paths, caps))
|
|
|
|
|
|
|
|
def preprocess_audioset(data_root):
    """Collect (audio path -> caption) pairs for the AudioSet balanced train split.

    The caption for a clip is the comma-joined list of human-readable
    (lower-cased) names of all its labels.

    Raises:
        ValueError: if `class_labels_indices.csv` has not been downloaded.
    """
    data_dir = f'{data_root}/AudioSet'
    if not osp.exists(f'{data_dir}/class_labels_indices.csv'):
        # Fixed message: the implicit string concatenation was missing the
        # space before the URL, and `\{data_root\}` printed literal
        # backslashes (invalid escape sequences in a plain string).
        raise ValueError('Download the `class_labels_indices.csv` file from '
                         'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv '
                         'and put it into `{data_root}/AudioSet/` first.')
    map_data = pd.read_csv(f'{data_dir}/class_labels_indices.csv')
    # Map machine label ids (e.g. `/m/09x0r`) to lower-cased display names.
    label_map = {}
    for i in range(len(map_data)):
        _, label, caption = map_data.iloc[i]
        label_map[label] = caption.lower()
    src_file = f'{data_dir}/balanced_train_segments.csv'
    # Multi-character separators require the python engine; request it
    # explicitly so pandas does not emit a ParserWarning on fallback.
    data = pd.read_csv(src_file, sep=', ', engine='python')
    audios, captions = [], []
    for i in tqdm(range(len(data)), desc='AudioSet'):
        _id, start, end, label = data.iloc[i]
        audio_path = f'{data_dir}/audio/bal_train/{_id}.flac'
        if not osp.exists(audio_path):
            continue
        audios.append(audio_path)
        # Labels come quoted like `"/m/x,/m/y"`; drop the quotes and split.
        labels = label.strip()[1:-1].split(',')
        caption = ', '.join(label_map[label] for label in labels)
        captions.append(caption)

    return {path: cap for path, cap in zip(audios, captions)}
|
|
|
|
|
|
|
|
def preprocess_wavcaps(data_root):
    """Collect (audio path -> caption) pairs for the four WavCaps subsets.

    Iterates the AudioSet_SL, BBC_Sound_Effects, SoundBible and FreeSound
    subsets; entries whose .flac file is missing on disk are skipped.
    """
    data_dir = f'{data_root}/WavCaps'
    subset_dict = {
        'as': 'AudioSet_SL', 'bbc': 'BBC_Sound_Effects', 'sb': 'SoundBible',
        'fsd': 'FreeSound'
    }
    result = {}
    for json_tag, audio_tag in subset_dict.items():
        meta_path = f'{data_dir}/json_files/{json_tag}_final.json'
        with open(meta_path, 'r') as f:
            entries = json.load(f)['data']
        audio_dir = f'{data_dir}/audio_files/{audio_tag}_flac'
        for entry in entries:
            # Metadata ids carry the original extension; swap it for .flac.
            stem = osp.splitext(entry["id"])[0]
            flac_path = f'{audio_dir}/{stem}.flac'
            if osp.exists(flac_path):
                result[flac_path] = entry['caption']

    return result
|
|
|
|
|
|
|
|
def preprocess_clotho(data_root):
    """Collect (audio path -> caption) pairs for Clotho (development split).

    Only `caption_1` of the five captions provided per clip is used.
    """
    data_dir = f'{data_root}/Clotho'
    audios, captions = [], []
    subset_list = ['development', ]
    for subset in subset_list:
        src_file = f'{data_dir}/clotho_captions_{subset}.csv'
        data = pd.read_csv(src_file)
        for i, row in data.iterrows():
            filename, caption = row['file_name'], row['caption_1']
            # Bug fix: the path previously ended with the literal
            # `(unknown)` placeholder instead of the actual filename,
            # so every entry failed the existence check and was skipped.
            audio_path = f'{data_dir}/{subset}/{filename}'
            if not osp.exists(audio_path):
                continue
            audios.append(audio_path)
            captions.append(caption)

    return {path: cap for path, cap in zip(audios, captions)}
|
|
|
|
|
|
|
|
def preprocess_esc50(data_root):
    """Collect (audio path -> caption) pairs for ESC-50.

    Captions are the category names with underscores replaced by spaces.
    """
    data_dir = f'{data_root}/ESC50'
    audios, captions = [], []
    src_file = f'{data_dir}/meta/esc50.csv'
    data = pd.read_csv(src_file)
    for i, row in data.iterrows():
        filename, caption = row['filename'], row['category'].replace('_', ' ')
        # Bug fix: the path previously ended with the literal `(unknown)`
        # placeholder instead of the actual filename, so every entry
        # failed the existence check and was skipped.
        audio_path = f'{data_dir}/audio/{filename}'
        if not osp.exists(audio_path):
            continue
        audios.append(audio_path)
        captions.append(caption)

    return {path: cap for path, cap in zip(audios, captions)}
|
|
|
|
|
|
|
|
def preprocess_macs(data_root):
    """Collect (audio path -> caption) pairs for MACS.

    Each file carries several annotator captions; the caption written by
    the annotator with the highest competence score is kept.

    Raises:
        ValueError: if `MACS.yaml` has not been downloaded.
    """
    data_dir = f'{data_root}/MACS'
    audios, captions = [], []
    src_file = f'{data_dir}/MACS.yaml'
    if not osp.exists(src_file):
        # Fixed message: the implicit string concatenation was missing the
        # space before the URL, and `\{data_root\}` printed literal
        # backslashes (invalid escape sequences in a plain string).
        raise ValueError('Download the `MACS.yaml` file from '
                         'https://zenodo.org/records/5114771 '
                         'and put it into `{data_root}/MACS/` first.')
    with open(src_file, 'r') as f:
        data = yaml.safe_load(f)['files']
    # Per-annotator competence scores, keyed by annotator id.
    ann = pd.read_csv(f'{data_dir}/MACS_competence.csv', delimiter=' ')
    ann2score = {row['annotator_id']: row['competence'] for _, row in ann.iterrows()}
    for item in data:
        filename = item['filename']
        # Bug fix: the path previously ended with the literal `(unknown)`
        # placeholder instead of the actual filename, so every entry
        # failed the existence check and was skipped.
        audio_path = f'{data_dir}/audio/{filename}'
        if not osp.exists(audio_path):
            continue
        ann_ids = [a['annotator_id'] for a in item['annotations']]
        ann_caps = [a['sentence'] for a in item['annotations']]
        ann_scores = [ann2score[annid] for annid in ann_ids]
        # Keep the caption from the most competent annotator.
        caption = ann_caps[np.argmax(ann_scores)]
        audios.append(audio_path)
        captions.append(caption)

    return {path: cap for path, cap in zip(audios, captions)}
|
|
|
|
|
|
|
|
def preprocess_urbansound(data_root):
    """Collect (audio path -> caption) pairs for UrbanSound8K.

    Captions are the class names with underscores replaced by spaces;
    audio files not listed in the metadata are skipped.
    """
    data_dir = f'{data_root}/UrbanSound8K'
    meta = pd.read_csv(f'{data_dir}/metadata/UrbanSound8K.csv')
    name2cap = {}
    for _, row in meta.iterrows():
        name2cap[row['slice_file_name']] = row['class'].replace('_', ' ')
    audios, captions = [], []
    # Audio is organised in fold1..fold10 subdirectories.
    for audio_path in sorted(glob(f'{data_dir}/audio/fold*/*.*')):
        basename = osp.basename(audio_path)
        if basename in name2cap:
            audios.append(audio_path)
            captions.append(name2cap[basename])

    return dict(zip(audios, captions))
|
|
|
|
|
|
|
|
def preprocess_musicinstrument(data_root):
    """Collect (audio path -> caption) pairs for the Music Instrument dataset.

    Captions are the class names with underscores replaced by spaces.
    When a filename is listed more than once, the first occurrence wins.
    """
    data_dir = f'{data_root}/MusicInstrument/data'
    meta = pd.read_csv(f'{data_dir}/Metadata_Train.csv')
    name2cap = {}
    for _, row in meta.iterrows():
        # setdefault keeps the first class seen for a duplicated filename.
        name2cap.setdefault(row['FileName'], row['Class'].replace('_', ' '))
    audios, captions = [], []
    for audio_path in sorted(glob(f'{data_dir}/Train_submission/Train_submission/*.*')):
        basename = osp.basename(audio_path)
        if basename in name2cap:
            audios.append(audio_path)
            captions.append(name2cap[basename])

    return dict(zip(audios, captions))
|
|
|
|
|
|
|
|
def preprocess_gtzan(data_root):
    """Collect (audio path -> caption) pairs for GTZAN genre clips.

    The caption is `music <genre>`, where the genre is the name of the
    clip's parent directory.
    """
    data_dir = f'{data_root}/GTZAN/data/genres_original'
    result = {}
    for audio_path in sorted(glob(f'{data_dir}/*/*.*')):
        genre = osp.basename(osp.dirname(audio_path))
        result[audio_path] = f'music {genre}'

    return result
|
|
|
|
|
|
|
|
def get_all_training_meta(data_root, meta_file, save_file):
    """Merge captions from every supported dataset into the training metadata.

    Reads `meta_file` (a CSV with an `audio_path` column), attaches the
    caption found for each path as an `audio_text` column, drops rows for
    which no caption was found, and writes the result.

    Args:
        data_root: Root directory holding all dataset folders.
        meta_file: Input CSV listing audio paths under `audio_path`.
        save_file: Output CSV path; when None, derived from `meta_file`
            by replacing `.csv` with `_training.csv`.
    """
    print('reading ...')
    path2cap = {}
    # Bug fix: the previous `eval(f'preprocess_{ds}')` built names such as
    # `preprocess_AudioCaps` / `preprocess_ESC50` / `preprocess_UrbanSound8K`,
    # which do not exist (the functions are lowercase and spelled
    # differently) and raised NameError — reference the functions directly.
    preprocess_fns = [
        preprocess_audiocaps, preprocess_vggsound, preprocess_audioset,
        preprocess_wavcaps, preprocess_clotho, preprocess_esc50,
        preprocess_macs, preprocess_urbansound, preprocess_musicinstrument,
        preprocess_gtzan,
    ]
    for fn in preprocess_fns:
        path2cap.update(fn(data_root))

    print('processing ...')
    df = pd.read_csv(meta_file)
    audio_caps = []
    for path in df['audio_path'].tolist():
        if path not in path2cap:
            print(f'Warning: cannot find the corresponding caption for {path}')
            audio_caps.append('')
        else:
            audio_caps.append(path2cap[path])

    print('saving ...')
    df['audio_text'] = audio_caps
    total_num = len(df)
    # Keep only the rows that actually received a caption.
    df = df[df['audio_text'].str.len() > 0]
    valid_num = len(df)
    if save_file is None:
        save_file = meta_file.replace('.csv', '_training.csv')
    df.to_csv(save_file, index=False)
    print(f'{valid_num}/{total_num} samples saved to {save_file}')
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser('Dataset processor')
    parser.add_argument('dataset', type=str, default='all', help='Dataset to process')
    parser.add_argument('--data_root', type=str, default='/path/to/audios', help='', required=True)
    parser.add_argument('--meta_file', type=str, default='/path/to/meta.csv', help='', required=True)
    parser.add_argument('--save_file', type=str, default=None, help='')
    args = parser.parse_args()

    # Security/robustness fix: look the entry point up via globals()
    # instead of `eval` on a user-supplied string (arbitrary-code-execution
    # risk), and fail with a clear message for unknown dataset names.
    entry_name = f'get_{args.dataset}_training_meta'
    entry = globals().get(entry_name)
    if entry is None:
        parser.error(f'Unknown dataset {args.dataset!r}: no function {entry_name}() defined')
    entry(args.data_root, args.meta_file, args.save_file)