# JAV-Gen/tools/datasets/del_invalid.py
# kaiw7's picture
# Upload folder using huggingface_hub
# e490e7e verified
import argparse
import os
from pathlib import Path
from tqdm import tqdm
from glob import glob
import pandas as pd
import numpy as np
def delete_invalid_video(meta_file: str, data_root: str = None, ref_file: str = None):
    """Delete every candidate file that is not listed in ``meta_file``.

    The files to keep are the entries of the ``path`` column of ``meta_file``
    plus, when that column exists, the entries of its ``audio_path`` column.
    The candidates for deletion come from the same columns of ``ref_file``
    when it is given, otherwise from a recursive scan of ``data_root`` for
    names matching ``*.*``.  A summary line is printed when done.

    Args:
        meta_file: CSV file listing the files to keep.
        data_root: Directory scanned for candidates; only used when
            ``ref_file`` is not given.
        ref_file: Optional CSV file listing the candidates.

    Raises:
        ValueError: If neither ``ref_file`` nor ``data_root`` is provided.
    """
    # Fail before touching anything: without a candidate source the glob
    # below would scan a literal "None" directory, or the filesystem root
    # when data_root is an empty string.
    if not ref_file and not data_root:
        raise ValueError("either ref_file or data_root must be provided")

    def _get_path_from_meta(data: pd.DataFrame):
        # Empty CSV cells are read as NaN floats; drop them so that no
        # non-string value reaches the os.path functions below.
        paths = set(data['path'].dropna().astype(str))
        if 'audio_path' in data:
            paths |= set(data['audio_path'].dropna().astype(str))
        return paths

    def _path_key(path: str) -> str:
        # Compare paths in normalized absolute form so that different
        # spellings of the same location ("./a", "a//b", trailing slash on
        # data_root) do not make a listed file look unlisted.
        return os.path.normcase(os.path.abspath(path))

    valid_keys = {_path_key(p) for p in _get_path_from_meta(pd.read_csv(meta_file))}

    if ref_file:
        total = _get_path_from_meta(pd.read_csv(ref_file))
    else:
        pattern = os.path.join(data_root, '**', '*.*')
        # The pattern also matches directories whose name contains a dot;
        # os.remove() cannot delete those, so keep regular files only.
        total = {p for p in glob(pattern, recursive=True) if os.path.isfile(p)}

    invalid = [p for p in total if _path_key(p) not in valid_keys]

    deleted = 0
    for file in tqdm(invalid):
        # Entries taken from ref_file may already be gone from disk.
        if os.path.isfile(file):
            os.remove(file)
            deleted += 1
    # Report what was actually removed, not merely what was unlisted.
    print(f"{deleted}/{len(total)} files have been deleted.")
if __name__ == '__main__':
    # Command-line entry point: collect the three paths and run the cleanup.
    parser = argparse.ArgumentParser(
        description="Delete files that are not listed in a metadata CSV.")
    parser.add_argument(
        '--meta_file', type=str, default='/path/to/meta.csv',
        help="CSV whose 'path' (and optional 'audio_path') column lists the files to keep")
    parser.add_argument(
        '--data_root', type=str, default='/path/to/videos',
        help="directory scanned recursively for candidate files; "
             "only used when --ref_file is not given")
    parser.add_argument(
        '--ref_file', type=str, default=None,
        help="optional CSV listing the candidate files; "
             "used instead of scanning --data_root")
    args = parser.parse_args()
    delete_invalid_video(args.meta_file, args.data_root, args.ref_file)