| import argparse | |
| import os | |
| from pathlib import Path | |
| from tqdm import tqdm | |
| from glob import glob | |
| import pandas as pd | |
| import numpy as np | |
def delete_invalid_video(meta_file: str, data_root: str = None, ref_file: str = None):
    """Delete every candidate file that is not listed in ``meta_file``.

    The files to keep are the values of the ``path`` column (and of the
    ``audio_path`` column, when present) of the CSV at ``meta_file``.
    The candidate files come from one of two sources:

    * ``ref_file`` -- a CSV with the same columns; takes precedence when given.
    * ``data_root`` -- otherwise, every regular file matching
      ``{data_root}/**/*.*`` (recursive glob).

    Paths are compared as exact strings, so the entries in ``meta_file`` must
    be spelled exactly like the candidate paths (same prefix, same
    separators); any candidate that does not match is deleted.

    Args:
        meta_file: CSV listing the files to keep.
        data_root: Directory to scan for candidates when ``ref_file`` is not
            given.
        ref_file: Optional CSV listing the candidate files.

    Raises:
        ValueError: If neither ``ref_file`` nor ``data_root`` is provided.
    """
    # Fail fast before touching anything: with no source of candidates the
    # glob pattern would degenerate to "None/**/*.*" or "/**/*.*".
    if not ref_file and not data_root:
        raise ValueError("Either data_root or ref_file must be provided.")

    def _get_path_from_meta(data: pd.DataFrame):
        # Empty CSV cells are read as NaN; drop them so only real paths remain.
        paths = set(data['path'].dropna())
        if 'audio_path' in data:
            paths |= set(data['audio_path'].dropna())
        return paths

    valid = _get_path_from_meta(pd.read_csv(meta_file))
    if ref_file:
        total = _get_path_from_meta(pd.read_csv(ref_file))
    else:
        # "*.*" also matches directories whose name contains a dot; keep only
        # regular files, since os.remove cannot delete directories.
        total = {
            path
            for path in glob(f'{data_root}/**/*.*', recursive=True)
            if os.path.isfile(path)
        }

    invalid = total - valid
    deleted = 0
    for file in tqdm(invalid):
        try:
            os.remove(file)
        except FileNotFoundError:
            # Already gone (e.g. listed in ref_file but absent on disk).
            continue
        deleted += 1
    # Report files actually removed, not merely the ones scheduled for removal.
    print(f"{deleted}/{len(total)} files have been deleted.")
if __name__ == '__main__':
    # Command-line entry point: parse the three paths and run the clean-up.
    parser = argparse.ArgumentParser(
        description="Delete files that are not listed in a metadata CSV.",
    )
    parser.add_argument(
        '--meta_file', type=str, default='/path/to/meta.csv',
        help="CSV whose 'path' (and optional 'audio_path') column lists the files to keep",
    )
    parser.add_argument(
        '--data_root', type=str, default='/path/to/videos',
        help="directory scanned recursively for candidate files; used only when --ref_file is not given",
    )
    parser.add_argument(
        '--ref_file', type=str, default=None,
        help="optional CSV listing the candidate files; takes precedence over --data_root",
    )
    args = parser.parse_args()
    delete_invalid_video(args.meta_file, args.data_root, args.ref_file)