|
|
import argparse |
|
|
import html |
|
|
import json |
|
|
import os |
|
|
import random |
|
|
import re |
|
|
from functools import partial |
|
|
from glob import glob |
|
|
import subprocess |
|
|
import soundfile as sf |
|
|
import librosa |
|
|
from pydub.utils import mediainfo |
|
|
|
|
|
import cv2 |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from PIL import Image |
|
|
from tqdm import tqdm |
|
|
import torchvision.transforms as T |
|
|
from torchvision.io import write_video |
|
|
|
|
|
from javisdit.datasets.read_video import read_video |
|
|
from javisdit.datasets.read_audio import read_audio |
|
|
|
|
|
from .utils import IMG_EXTENSIONS |
|
|
|
|
|
tqdm.pandas() |
|
|
|
|
|
try: |
|
|
from pandarallel import pandarallel |
|
|
|
|
|
PANDA_USE_PARALLEL = True |
|
|
except ImportError: |
|
|
PANDA_USE_PARALLEL = False |
|
|
|
|
|
|
|
|
def apply(df, func, **kwargs): |
|
|
if PANDA_USE_PARALLEL: |
|
|
return df.parallel_apply(func, **kwargs) |
|
|
return df.progress_apply(func, **kwargs) |
|
|
|
|
|
|
|
|
TRAIN_COLUMNS = ["path", "text", "num_frames", "fps", "height", "width", "aspect_ratio", "resolution", "text_len"] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_video_length(cap, method="header"): |
|
|
assert method in ["header", "set"] |
|
|
if method == "header": |
|
|
length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) |
|
|
else: |
|
|
cap.set(cv2.CAP_PROP_POS_AVI_RATIO, 1) |
|
|
length = int(cap.get(cv2.CAP_PROP_POS_FRAMES)) |
|
|
return length |
|
|
|
|
|
|
|
|
def get_info_old(path): |
|
|
try: |
|
|
ext = os.path.splitext(path)[1].lower() |
|
|
if ext in IMG_EXTENSIONS: |
|
|
im = cv2.imread(path) |
|
|
if im is None: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
height, width = im.shape[:2] |
|
|
num_frames, fps = 1, np.nan |
|
|
else: |
|
|
cap = cv2.VideoCapture(path) |
|
|
num_frames, height, width, fps = ( |
|
|
get_video_length(cap, method="header"), |
|
|
int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), |
|
|
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), |
|
|
float(cap.get(cv2.CAP_PROP_FPS)), |
|
|
) |
|
|
hw = height * width |
|
|
aspect_ratio = height / width if width > 0 else np.nan |
|
|
return num_frames, height, width, aspect_ratio, fps, hw |
|
|
except: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
|
|
|
|
|
|
def get_info(path): |
|
|
try: |
|
|
ext = os.path.splitext(path)[1].lower() |
|
|
if ext in IMG_EXTENSIONS: |
|
|
return get_image_info(path) |
|
|
else: |
|
|
return get_video_info(path) |
|
|
except: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
|
|
|
|
|
|
def get_image_info(path, backend="pillow"): |
|
|
if backend == "pillow": |
|
|
try: |
|
|
with open(path, "rb") as f: |
|
|
img = Image.open(f) |
|
|
img = img.convert("RGB") |
|
|
width, height = img.size |
|
|
num_frames, fps = 1, np.nan |
|
|
hw = height * width |
|
|
aspect_ratio = height / width if width > 0 else np.nan |
|
|
return num_frames, height, width, aspect_ratio, fps, hw |
|
|
except: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
elif backend == "cv2": |
|
|
try: |
|
|
im = cv2.imread(path) |
|
|
if im is None: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
height, width = im.shape[:2] |
|
|
num_frames, fps = 1, np.nan |
|
|
hw = height * width |
|
|
aspect_ratio = height / width if width > 0 else np.nan |
|
|
return num_frames, height, width, aspect_ratio, fps, hw |
|
|
except: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
else: |
|
|
raise ValueError |
|
|
|
|
|
|
|
|
def get_video_info(path, backend="cv2"): |
|
|
if backend == "torchvision": |
|
|
try: |
|
|
vframes, infos = read_video(path) |
|
|
num_frames, height, width = vframes.shape[0], vframes.shape[2], vframes.shape[3] |
|
|
if "video_fps" in infos: |
|
|
fps = infos["video_fps"] |
|
|
else: |
|
|
fps = np.nan |
|
|
hw = height * width |
|
|
aspect_ratio = height / width if width > 0 else np.nan |
|
|
return num_frames, height, width, aspect_ratio, fps, hw |
|
|
except: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
elif backend == "cv2": |
|
|
try: |
|
|
cap = cv2.VideoCapture(path) |
|
|
num_frames, height, width, fps = ( |
|
|
get_video_length(cap, method="set"), |
|
|
int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), |
|
|
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), |
|
|
float(round(cap.get(cv2.CAP_PROP_FPS))), |
|
|
) |
|
|
hw = height * width |
|
|
aspect_ratio = height / width if width > 0 else np.nan |
|
|
return num_frames, height, width, aspect_ratio, fps, hw |
|
|
except: |
|
|
return 0, 0, 0, np.nan, np.nan, np.nan |
|
|
else: |
|
|
raise ValueError |
|
|
|
|
|
|
|
|
def get_audio_info(audio_path, backend="pydub"): |
|
|
if backend == "pydub": |
|
|
try: |
|
|
info = mediainfo(audio_path) |
|
|
duration = float(info['duration']) |
|
|
sample_rate = int(info['sample_rate']) |
|
|
channels = int(info['channels']) |
|
|
return duration, sample_rate, channels |
|
|
except: |
|
|
return 0, 0, 0 |
|
|
else: |
|
|
raise ValueError |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
LLAVA_PREFIX = [ |
|
|
"The video shows", |
|
|
"The video captures", |
|
|
"The video features", |
|
|
"The video depicts", |
|
|
"The video presents", |
|
|
"The video features", |
|
|
"The video is ", |
|
|
"In the video,", |
|
|
"The image shows", |
|
|
"The image captures", |
|
|
"The image features", |
|
|
"The image depicts", |
|
|
"The image presents", |
|
|
"The image features", |
|
|
"The image is ", |
|
|
"The image portrays", |
|
|
"In the image,", |
|
|
] |
|
|
|
|
|
|
|
|
def remove_caption_prefix(caption): |
|
|
for prefix in LLAVA_PREFIX: |
|
|
if caption.startswith(prefix) or caption.startswith(prefix.lower()): |
|
|
caption = caption[len(prefix) :].strip() |
|
|
if caption[0].islower(): |
|
|
caption = caption[0].upper() + caption[1:] |
|
|
return caption |
|
|
return caption |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CMOTION_TEXT = { |
|
|
"static": "static", |
|
|
"pan_right": "pan right", |
|
|
"pan_left": "pan left", |
|
|
"zoom_in": "zoom in", |
|
|
"zoom_out": "zoom out", |
|
|
"tilt_up": "tilt up", |
|
|
"tilt_down": "tilt down", |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
CMOTION_PROBS = { |
|
|
|
|
|
"static": 1.0, |
|
|
"zoom_in": 1.0, |
|
|
"zoom_out": 1.0, |
|
|
"pan_left": 1.0, |
|
|
"pan_right": 1.0, |
|
|
"tilt_up": 1.0, |
|
|
"tilt_down": 1.0, |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
def merge_cmotion(caption, cmotion): |
|
|
text = CMOTION_TEXT[cmotion] |
|
|
prob = CMOTION_PROBS[cmotion] |
|
|
if text is not None and random.random() < prob: |
|
|
caption = f"{caption} Camera motion: {text}." |
|
|
return caption |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_lang_detector(lang_to_detect): |
|
|
from lingua import Language, LanguageDetectorBuilder |
|
|
|
|
|
lang_dict = dict(en=Language.ENGLISH) |
|
|
assert lang_to_detect in lang_dict |
|
|
valid_lang = lang_dict[lang_to_detect] |
|
|
detector = LanguageDetectorBuilder.from_all_spoken_languages().with_low_accuracy_mode().build() |
|
|
|
|
|
def detect_lang(caption): |
|
|
confidence_values = detector.compute_language_confidence_values(caption) |
|
|
confidence = [x.language for x in confidence_values[:5]] |
|
|
if valid_lang not in confidence: |
|
|
return False |
|
|
return True |
|
|
|
|
|
return detect_lang |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def basic_clean(text): |
|
|
import ftfy |
|
|
|
|
|
text = ftfy.fix_text(text) |
|
|
text = html.unescape(html.unescape(text)) |
|
|
return text.strip() |
|
|
|
|
|
|
|
|
BAD_PUNCT_REGEX = re.compile( |
|
|
r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" |
|
|
) |
|
|
|
|
|
|
|
|
def clean_caption(caption): |
|
|
import urllib.parse as ul |
|
|
|
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
caption = str(caption) |
|
|
caption = ul.unquote_plus(caption) |
|
|
caption = caption.strip().lower() |
|
|
caption = re.sub("<person>", "person", caption) |
|
|
|
|
|
caption = re.sub( |
|
|
r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", |
|
|
"", |
|
|
caption, |
|
|
) |
|
|
caption = re.sub( |
|
|
r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", |
|
|
"", |
|
|
caption, |
|
|
) |
|
|
|
|
|
caption = BeautifulSoup(caption, features="html.parser").text |
|
|
|
|
|
|
|
|
caption = re.sub(r"@[\w\d]+\b", "", caption) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) |
|
|
caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) |
|
|
caption = re.sub(r"[\u3200-\u32ff]+", "", caption) |
|
|
caption = re.sub(r"[\u3300-\u33ff]+", "", caption) |
|
|
caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) |
|
|
caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) |
|
|
caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) |
|
|
|
|
|
|
|
|
|
|
|
caption = re.sub( |
|
|
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", |
|
|
"-", |
|
|
caption, |
|
|
) |
|
|
|
|
|
|
|
|
caption = re.sub(r"[`´«»“”¨]", '"', caption) |
|
|
caption = re.sub(r"[‘’]", "'", caption) |
|
|
|
|
|
|
|
|
caption = re.sub(r""?", "", caption) |
|
|
|
|
|
caption = re.sub(r"&", "", caption) |
|
|
|
|
|
|
|
|
caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) |
|
|
|
|
|
|
|
|
caption = re.sub(r"\d:\d\d\s+$", "", caption) |
|
|
|
|
|
|
|
|
caption = re.sub(r"\\n", " ", caption) |
|
|
|
|
|
|
|
|
caption = re.sub(r"#\d{1,3}\b", "", caption) |
|
|
|
|
|
caption = re.sub(r"#\d{5,}\b", "", caption) |
|
|
|
|
|
caption = re.sub(r"\b\d{6,}\b", "", caption) |
|
|
|
|
|
caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) |
|
|
|
|
|
|
|
|
caption = re.sub(r"[\"\']{2,}", r'"', caption) |
|
|
caption = re.sub(r"[\.]{2,}", r" ", caption) |
|
|
|
|
|
caption = re.sub(BAD_PUNCT_REGEX, r" ", caption) |
|
|
caption = re.sub(r"\s+\.\s+", r" ", caption) |
|
|
|
|
|
|
|
|
regex2 = re.compile(r"(?:\-|\_)") |
|
|
if len(re.findall(regex2, caption)) > 3: |
|
|
caption = re.sub(regex2, " ", caption) |
|
|
|
|
|
caption = basic_clean(caption) |
|
|
|
|
|
caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) |
|
|
caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) |
|
|
caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) |
|
|
|
|
|
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) |
|
|
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) |
|
|
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) |
|
|
caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) |
|
|
caption = re.sub(r"\bpage\s+\d+\b", "", caption) |
|
|
|
|
|
caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) |
|
|
|
|
|
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) |
|
|
|
|
|
caption = re.sub(r"\b\s+\:\s+", r": ", caption) |
|
|
caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) |
|
|
caption = re.sub(r"\s+", " ", caption) |
|
|
|
|
|
caption.strip() |
|
|
|
|
|
caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) |
|
|
caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) |
|
|
caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) |
|
|
caption = re.sub(r"^\.\S+$", "", caption) |
|
|
|
|
|
return caption.strip() |
|
|
|
|
|
|
|
|
def text_preprocessing(text, use_text_preprocessing: bool = True): |
|
|
if use_text_preprocessing: |
|
|
|
|
|
text = clean_caption(text) |
|
|
text = clean_caption(text) |
|
|
return text |
|
|
else: |
|
|
return text.lower().strip() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_caption(path, ext): |
|
|
try: |
|
|
assert ext in ["json"] |
|
|
json_path = '.'.join(path.split(".")[:-1]) + ".json" |
|
|
with open(json_path, "r") as f: |
|
|
data = json.load(f) |
|
|
caption = data["caption"] |
|
|
return caption |
|
|
except: |
|
|
return "" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
DROP_SCORE_PROB = 0.2 |
|
|
|
|
|
|
|
|
def score_to_text(data): |
|
|
text = data["text"] |
|
|
scores = [] |
|
|
|
|
|
if "aes" in data: |
|
|
aes = data["aes"] |
|
|
if random.random() > DROP_SCORE_PROB: |
|
|
score_text = f"aesthetic score: {aes:.1f}" |
|
|
scores.append(score_text) |
|
|
if "flow" in data: |
|
|
flow = data["flow"] |
|
|
if random.random() > DROP_SCORE_PROB: |
|
|
score_text = f"motion score: {flow:.1f}" |
|
|
scores.append(score_text) |
|
|
if len(scores) > 0: |
|
|
text = f"{text} [{', '.join(scores)}]" |
|
|
return text |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_file(input_path): |
|
|
if input_path.endswith(".csv"): |
|
|
return pd.read_csv(input_path) |
|
|
elif input_path.endswith(".parquet"): |
|
|
return pd.read_parquet(input_path) |
|
|
else: |
|
|
raise NotImplementedError(f"Unsupported file format: {input_path}") |
|
|
|
|
|
|
|
|
def save_file(data, output_path): |
|
|
output_dir = os.path.dirname(output_path) |
|
|
if not os.path.exists(output_dir) and output_dir != "": |
|
|
os.makedirs(output_dir) |
|
|
if output_path.endswith(".csv"): |
|
|
return data.to_csv(output_path, index=False) |
|
|
elif output_path.endswith(".parquet"): |
|
|
return data.to_parquet(output_path, index=False) |
|
|
else: |
|
|
raise NotImplementedError(f"Unsupported file format: {output_path}") |
|
|
|
|
|
|
|
|
def read_data(input_paths): |
|
|
data = [] |
|
|
input_name = "" |
|
|
input_list = [] |
|
|
for input_path in input_paths: |
|
|
input_list.extend(glob(input_path)) |
|
|
print("Input files:", input_list) |
|
|
for i, input_path in enumerate(input_list): |
|
|
if not os.path.exists(input_path): |
|
|
continue |
|
|
data.append(read_file(input_path)) |
|
|
input_name += os.path.splitext(os.path.basename(input_path))[0] |
|
|
if i != len(input_list) - 1: |
|
|
input_name += "+" |
|
|
print(f"Loaded {len(data[-1])} samples from '{input_path}'.") |
|
|
if len(data) == 0: |
|
|
print(f"No samples to process. Exit.") |
|
|
exit() |
|
|
data = pd.concat(data, ignore_index=True, sort=False) |
|
|
print(f"Total number of samples: {len(data)}") |
|
|
return data, input_name |
|
|
|
|
|
|
|
|
def unify_fps(input_path, dst_fps, overwrite=True): |
|
|
src_fps = get_video_info(input_path)[-2] |
|
|
if src_fps != dst_fps: |
|
|
ext = os.path.splitext(input_path)[1].lower() |
|
|
output_path = input_path.replace(ext, f'_fps{dst_fps}' + ext) |
|
|
ffmpeg_command = [ |
|
|
"ffmpeg", |
|
|
"-y", |
|
|
"-i", input_path, |
|
|
"-r", f"{dst_fps}", |
|
|
output_path |
|
|
] |
|
|
result = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) |
|
|
if result.returncode == 0: |
|
|
if overwrite: |
|
|
os.rename(output_path, input_path) |
|
|
else: |
|
|
input_path = output_path |
|
|
|
|
|
src_fps = dst_fps |
|
|
else: |
|
|
print(result.stderr) |
|
|
|
|
|
num_frames = get_video_info(input_path, backend='torchvision')[0] |
|
|
return input_path, src_fps, num_frames |
|
|
|
|
|
|
|
|
def extract_audio(input_path, dst_sr=16000, backend='ffmpeg'): |
|
|
ext = os.path.splitext(input_path)[1] |
|
|
save_path = input_path.replace(ext, '.wav') |
|
|
audio_id = os.path.splitext(os.path.basename(save_path))[0] |
|
|
|
|
|
if os.path.exists(save_path): |
|
|
try: |
|
|
audio, ainfo = read_audio(save_path, backend='sf') |
|
|
assert len(audio) > 0 |
|
|
sr = int(ainfo['audio_fps']) |
|
|
if dst_sr is not None: |
|
|
assert sr == dst_sr |
|
|
return save_path, audio_id, sr |
|
|
except: |
|
|
pass |
|
|
|
|
|
if backend == 'torch': |
|
|
audio, ainfo = read_audio(input_path, backend='torch') |
|
|
audio = audio.numpy() |
|
|
sr = int(ainfo['audio_fps']) |
|
|
if sr != dst_sr: |
|
|
audio = librosa.resample(audio, orig_sr=sr, target_sr=dst_sr) |
|
|
sr = dst_sr |
|
|
sf.write(save_path, audio, sr) |
|
|
else: |
|
|
ffmpeg_command = [ |
|
|
"ffmpeg", |
|
|
"-i", input_path, |
|
|
"-f", "wav" |
|
|
] |
|
|
if dst_sr is not None: |
|
|
ffmpeg_command.extend(["-ar", f"{dst_sr}"]) |
|
|
ffmpeg_command.append(save_path) |
|
|
result = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) |
|
|
sr = dst_sr |
|
|
|
|
|
return save_path, audio_id, sr |
|
|
|
|
|
|
|
|
def trim_audio(input_path, max_sec): |
|
|
result = subprocess.run( |
|
|
["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", input_path], |
|
|
stdout=subprocess.PIPE, |
|
|
stderr=subprocess.PIPE, |
|
|
text=True |
|
|
) |
|
|
try: |
|
|
duration = float(result.stdout.strip()) |
|
|
except: |
|
|
return 0 |
|
|
|
|
|
if duration > max_sec: |
|
|
ext = os.path.splitext(input_path)[1] |
|
|
temp_path = input_path.replace(ext, '_tmp' + ext) |
|
|
subprocess.run([ |
|
|
"ffmpeg", "-i", input_path, "-t", str(max_sec), temp_path, "-y" |
|
|
], check=True) |
|
|
subprocess.run(["mv", temp_path, input_path], check=True) |
|
|
duration = max_sec |
|
|
return duration |
|
|
|
|
|
|
|
|
def trim_video(input_path, max_sec): |
|
|
vinfo = get_video_info(input_path, backend='torchvision') |
|
|
num_frames, fps = vinfo[0], vinfo[-2] |
|
|
|
|
|
duration = num_frames / fps |
|
|
|
|
|
if duration > max_sec: |
|
|
ext = os.path.splitext(input_path)[1] |
|
|
temp_path = input_path.replace(ext, '_tmp' + ext) |
|
|
subprocess.run([ |
|
|
"ffmpeg", "-i", input_path, "-t", str(max_sec), temp_path, "-y" |
|
|
], check=True) |
|
|
subprocess.run(["mv", temp_path, input_path], check=True) |
|
|
duration = max_sec |
|
|
num_frames = get_video_info(input_path, backend='torchvision')[0] |
|
|
|
|
|
return num_frames |
|
|
|
|
|
|
|
|
def resample_audio(input_path, dst_sr=16000): |
|
|
try: |
|
|
audio, ainfo = read_audio(input_path, backend='sf') |
|
|
src_sr = int(ainfo['audio_fps']) |
|
|
if src_sr == dst_sr: |
|
|
return src_sr |
|
|
audio = audio.cpu().numpy() |
|
|
audio = librosa.resample(audio, orig_sr=src_sr, target_sr=dst_sr) |
|
|
sf.write(input_path, audio, dst_sr) |
|
|
return dst_sr |
|
|
except: |
|
|
return 0 |
|
|
|
|
|
|
|
|
def set_dummy_video(input_path): |
|
|
result = subprocess.run( |
|
|
["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", input_path], |
|
|
stdout=subprocess.PIPE, |
|
|
stderr=subprocess.PIPE, |
|
|
text=True |
|
|
) |
|
|
try: |
|
|
duration = float(result.stdout.strip()) |
|
|
except: |
|
|
duration = 0 |
|
|
|
|
|
path, relpath = ['placeholder.mp4'] * 2 |
|
|
_id = 'placeholder' |
|
|
fps = 24 |
|
|
num_frames = int(duration * fps) |
|
|
height, width, aspect_ratio, resolution = 720, 1280, 0.5625, 921600 |
|
|
text = 'placeholder' |
|
|
speech = -1 |
|
|
return path, _id, relpath, num_frames, \ |
|
|
height, width, aspect_ratio, fps, resolution,\ |
|
|
speech, text |
|
|
|
|
|
|
|
|
def video_crop_resize(input_path, target_wh): |
|
|
""" |
|
|
Resize and crop a video to the target width and height while maintaining as much original information as possible. |
|
|
|
|
|
Args: |
|
|
input_path (str): Path to the input video. |
|
|
output_path (str): Path to save the processed video. |
|
|
target_wh (tuple): Target width and height as (width, height). |
|
|
""" |
|
|
|
|
|
dst_w, dst_h = target_wh |
|
|
dst_as = dst_w / dst_h |
|
|
resizer = T.Resize((dst_h, dst_w)) |
|
|
dst_res = dst_w * dst_h |
|
|
|
|
|
|
|
|
video, vinfo = read_video(input_path, backend='av') |
|
|
fps = int(round(vinfo['video_fps'])) |
|
|
_, _, H, W = video.shape |
|
|
|
|
|
|
|
|
if W / H > dst_as: |
|
|
new_width = int(round(H * dst_as)) |
|
|
new_height = H |
|
|
x_crop = (W - new_width) // 2 |
|
|
y_crop = 0 |
|
|
else: |
|
|
new_width = W |
|
|
new_height = int(round(W / dst_as)) |
|
|
x_crop = 0 |
|
|
y_crop = (H - new_height) // 2 |
|
|
|
|
|
|
|
|
cropped_video = video[:, :, y_crop:y_crop+new_height, x_crop:x_crop+new_width] |
|
|
resized_video = resizer(cropped_video).permute(0, 2, 3, 1) |
|
|
|
|
|
|
|
|
ext = os.path.splitext(input_path)[-1] |
|
|
audio_path = input_path.replace(ext, '.wav') |
|
|
audio, ainfo = read_audio(audio_path, backend='sf') |
|
|
audio_fps = int(ainfo['audio_fps']) |
|
|
if audio is not None and len(audio.shape) == 1: |
|
|
audio = audio[None].repeat(2, 1) |
|
|
temp_path = input_path.replace(ext, '_tmp'+ext) |
|
|
write_video(temp_path, resized_video, fps=fps, video_codec="h264", |
|
|
audio_array=audio, audio_fps=audio_fps, audio_codec='aac') |
|
|
subprocess.run(["mv", temp_path, input_path], check=True) |
|
|
|
|
|
return dst_w, dst_h, dst_as, dst_res |
|
|
|
|
|
|
|
|
def fix_video(input_path): |
|
|
cap = cv2.VideoCapture(input_path) |
|
|
if get_video_length(cap, 'header') != get_video_length(cap, 'set'): |
|
|
ext = os.path.splitext(input_path)[-1] |
|
|
cache_path = input_path.replace(ext, f'_fix{ext}') |
|
|
ffmpeg_command = [ |
|
|
"ffmpeg", |
|
|
"-i", input_path, |
|
|
"-c:v", "libx264", |
|
|
"-c:a", "aac", |
|
|
"-crf", "18" , |
|
|
"-preset", "veryfast", |
|
|
"-y", cache_path |
|
|
] |
|
|
result = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) |
|
|
os.rename(cache_path, input_path) |
|
|
print('fixed:', input_path) |
|
|
cap.release() |
|
|
|
|
|
num_frames, _, _, _, fps, _ = get_video_info(input_path) |
|
|
|
|
|
return num_frames, fps |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(args): |
|
|
|
|
|
data, input_name = read_data(args.input) |
|
|
|
|
|
|
|
|
if args.difference is not None: |
|
|
data_diff = pd.read_csv(args.difference) |
|
|
print(f"Difference csv contains {len(data_diff)} samples.") |
|
|
data = data[~data["path"].isin(data_diff["path"])] |
|
|
input_name += f"-{os.path.basename(args.difference).split('.')[0]}" |
|
|
print(f"Filtered number of samples: {len(data)}.") |
|
|
|
|
|
|
|
|
if args.intersection is not None: |
|
|
data_new = pd.read_csv(args.intersection) |
|
|
print(f"Intersection csv contains {len(data_new)} samples.") |
|
|
cols_to_use = data_new.columns.difference(data.columns) |
|
|
|
|
|
col_on = "path" |
|
|
|
|
|
|
|
|
cols_to_use = cols_to_use.insert(0, col_on) |
|
|
data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner") |
|
|
print(f"Intersection number of samples: {len(data)}.") |
|
|
|
|
|
|
|
|
output_path = get_output_path(args, input_name) |
|
|
|
|
|
|
|
|
if args.lang is not None: |
|
|
detect_lang = build_lang_detector(args.lang) |
|
|
if args.count_num_token == "t5": |
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl") |
|
|
|
|
|
|
|
|
if args.load_caption is not None: |
|
|
assert "path" in data.columns |
|
|
data["text"] = apply(data["path"], load_caption, ext=args.load_caption) |
|
|
if args.info: |
|
|
info = apply(data["path"], get_info) |
|
|
( |
|
|
data["num_frames"], |
|
|
data["height"], |
|
|
data["width"], |
|
|
data["aspect_ratio"], |
|
|
data["fps"], |
|
|
data["resolution"], |
|
|
) = zip(*info) |
|
|
if args.video_info: |
|
|
info = apply(data["path"], get_video_info) |
|
|
( |
|
|
data["num_frames"], |
|
|
data["height"], |
|
|
data["width"], |
|
|
data["aspect_ratio"], |
|
|
data["fps"], |
|
|
data["resolution"], |
|
|
) = zip(*info) |
|
|
if args.audio_info: |
|
|
info = apply(data["audio_path"], get_audio_info) |
|
|
( |
|
|
data["audio_length_s"], |
|
|
data["audio_fps"], |
|
|
data["audio_channels"], |
|
|
) = zip(*info) |
|
|
if args.ext: |
|
|
assert "path" in data.columns |
|
|
data = data[apply(data["path"], os.path.exists)] |
|
|
if args.uni_fps is not None: |
|
|
info = apply(data["path"], unify_fps, dst_fps=args.uni_fps, overwrite=args.overwrite) |
|
|
data["path"], data["fps"], data["num_frames"] = zip(*info) |
|
|
if args.extract_audio: |
|
|
info = apply(data["path"], extract_audio, dst_sr=args.audio_sr) |
|
|
data["audio_path"], data['audio_id'], data["audio_fps"] = zip(*info) |
|
|
if args.trim_audio: |
|
|
duration = apply(data["audio_path"], trim_audio, max_sec=args.trim_audio) |
|
|
data = data[np.array(duration) > 0] |
|
|
if args.trim: |
|
|
data['num_frames'] = apply(data["path"], trim_video, max_sec=args.trim) |
|
|
audio_duration = apply(data["audio_path"], trim_audio, max_sec=args.trim) |
|
|
data = data[np.array(audio_duration) > 0] |
|
|
if args.crop_resize: |
|
|
assert len(args.crop_resize) == 2 |
|
|
info = apply(data["path"], video_crop_resize, target_wh=args.crop_resize) |
|
|
data['width'], data['height'], data['aspect_ratio'], data['resolution'] = zip(*info) |
|
|
if args.resample_audio: |
|
|
data["audio_fps"] = apply(data["audio_path"], resample_audio, dst_sr=args.audio_sr) |
|
|
data = data[data["audio_fps"] == args.audio_sr] |
|
|
if args.dummy_video: |
|
|
info = apply(data["audio_path"], set_dummy_video) |
|
|
path, _id, relpath, num_frames, height, width, aspect_ratio, \ |
|
|
fps, resolution, speech, text = zip(*info) |
|
|
data = {'path': path, 'id': _id, 'relpath': relpath, 'num_frames': num_frames, |
|
|
'height': height, 'width': width, 'aspect_ratio': aspect_ratio, |
|
|
'fps': fps, 'resolution': resolution, |
|
|
'audio_path': data['audio_path'], 'audio_fps': data['audio_fps'], |
|
|
'speech': speech, 'text': text, 'audio_text': data['audio_text']} |
|
|
data = pd.DataFrame(data) |
|
|
if args.fix_video: |
|
|
info = apply(data["path"], fix_video) |
|
|
data["num_frames"], data['fps'] = zip(*info) |
|
|
|
|
|
|
|
|
if args.remove_url: |
|
|
assert "text" in data.columns |
|
|
data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)] |
|
|
if args.lang is not None: |
|
|
assert "text" in data.columns |
|
|
data = data[data["text"].progress_apply(detect_lang)] |
|
|
if args.remove_empty_path: |
|
|
assert "path" in data.columns |
|
|
data = data[data["path"].str.len() > 0] |
|
|
data = data[~data["path"].isna()] |
|
|
if args.remove_empty_caption: |
|
|
assert "text" in data.columns |
|
|
data = data[data["text"].str.len() > 0] |
|
|
data = data[~data["text"].isna()] |
|
|
if args.remove_path_duplication: |
|
|
assert "path" in data.columns |
|
|
data = data.drop_duplicates(subset=["path"]) |
|
|
if args.path_subset: |
|
|
data = data[data["path"].str.contains(args.path_subset)] |
|
|
|
|
|
|
|
|
if args.relpath is not None: |
|
|
data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath)) |
|
|
if args.abspath is not None: |
|
|
data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x)) |
|
|
if args.path_to_id: |
|
|
data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0]) |
|
|
if args.merge_cmotion: |
|
|
data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1) |
|
|
if args.refine_llm_caption: |
|
|
assert "text" in data.columns |
|
|
data["text"] = apply(data["text"], remove_caption_prefix) |
|
|
if args.append_text is not None: |
|
|
assert "text" in data.columns |
|
|
data["text"] = data["text"] + args.append_text |
|
|
if args.score_to_text: |
|
|
data["text"] = apply(data, score_to_text, axis=1) |
|
|
if args.clean_caption: |
|
|
assert "text" in data.columns |
|
|
data["text"] = apply( |
|
|
data["text"], |
|
|
partial(text_preprocessing, use_text_preprocessing=True), |
|
|
) |
|
|
if args.count_num_token is not None: |
|
|
assert "text" in data.columns |
|
|
data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"])) |
|
|
if args.update_text is not None: |
|
|
data_new = pd.read_csv(args.update_text) |
|
|
num_updated = data.path.isin(data_new.path).sum() |
|
|
print(f"Number of updated samples: {num_updated}.") |
|
|
data = data.set_index("path") |
|
|
data_new = data_new[["path", "text"]].set_index("path") |
|
|
data.update(data_new) |
|
|
data = data.reset_index() |
|
|
|
|
|
|
|
|
if args.sort is not None: |
|
|
data = data.sort_values(by=args.sort, ascending=False) |
|
|
if args.sort_ascending is not None: |
|
|
data = data.sort_values(by=args.sort_ascending, ascending=True) |
|
|
|
|
|
|
|
|
if args.filesize: |
|
|
assert "path" in data.columns |
|
|
data["filesize"] = apply(data["path"], lambda x: os.stat(x).st_size / 1024 / 1024) |
|
|
if args.fsmax is not None: |
|
|
assert "filesize" in data.columns |
|
|
data = data[data["filesize"] <= args.fsmax] |
|
|
if args.remove_empty_caption: |
|
|
assert "text" in data.columns |
|
|
data = data[data["text"].str.len() > 0] |
|
|
data = data[~data["text"].isna()] |
|
|
if args.fmin is not None: |
|
|
assert "num_frames" in data.columns |
|
|
data = data[data["num_frames"] >= args.fmin] |
|
|
if args.fmax is not None: |
|
|
assert "num_frames" in data.columns |
|
|
data = data[data["num_frames"] <= args.fmax] |
|
|
if args.fpsmax is not None: |
|
|
assert "fps" in data.columns |
|
|
data = data[(data["fps"] <= args.fpsmax) | np.isnan(data["fps"])] |
|
|
if args.hwmax is not None: |
|
|
if "resolution" not in data.columns: |
|
|
height = data["height"] |
|
|
width = data["width"] |
|
|
data["resolution"] = height * width |
|
|
data = data[data["resolution"] <= args.hwmax] |
|
|
if args.aesmin is not None: |
|
|
assert "aes" in data.columns |
|
|
data = data[data["aes"] >= args.aesmin] |
|
|
if args.matchmin is not None: |
|
|
assert "match" in data.columns |
|
|
data = data[data["match"] >= args.matchmin] |
|
|
if args.flowmin is not None: |
|
|
assert "flow" in data.columns |
|
|
data = data[data["flow"] >= args.flowmin] |
|
|
if args.ocrmax is not None: |
|
|
assert "ocr" in data.columns |
|
|
data = data[data["ocr"] <= args.ocrmax] |
|
|
if args.remove_text_duplication: |
|
|
data = data.drop_duplicates(subset=["text"], keep="first") |
|
|
if args.img_only: |
|
|
data = data[data["path"].str.lower().str.endswith(IMG_EXTENSIONS)] |
|
|
if args.vid_only: |
|
|
data = data[~data["path"].str.lower().str.endswith(IMG_EXTENSIONS)] |
|
|
if args.nospeech: |
|
|
assert "speech" in data.columns |
|
|
data = data[data["speech"] == 0] |
|
|
|
|
|
|
|
|
if args.shuffle: |
|
|
data = data.sample(frac=1).reset_index(drop=True) |
|
|
if args.head is not None: |
|
|
data = data.head(args.head) |
|
|
|
|
|
|
|
|
if args.train_column: |
|
|
all_columns = data.columns |
|
|
columns_to_drop = all_columns.difference(TRAIN_COLUMNS) |
|
|
data = data.drop(columns=columns_to_drop) |
|
|
|
|
|
print(f"Filtered number of samples: {len(data)}.") |
|
|
|
|
|
|
|
|
if args.shard is not None: |
|
|
sharded_data = np.array_split(data, args.shard) |
|
|
for i in range(args.shard): |
|
|
output_path_part = output_path.split(".") |
|
|
output_path_s = ".".join(output_path_part[:-1]) + f"_{i}." + output_path_part[-1] |
|
|
save_file(sharded_data[i], output_path_s) |
|
|
print(f"Saved {len(sharded_data[i])} samples to {output_path_s}.") |
|
|
else: |
|
|
save_file(data, output_path) |
|
|
print(f"Saved {len(data)} samples to {output_path}.") |
|
|
|
|
|
|
|
|
def parse_args(): |
|
|
parser = argparse.ArgumentParser() |
|
|
parser.add_argument("input", type=str, nargs="+", help="path to the input dataset") |
|
|
parser.add_argument("--output", type=str, default=None, help="output path") |
|
|
parser.add_argument("--format", type=str, default="csv", help="output format", choices=["csv", "parquet"]) |
|
|
parser.add_argument("--disable-parallel", action="store_true", help="disable parallel processing") |
|
|
parser.add_argument("--num-workers", type=int, default=None, help="number of workers") |
|
|
parser.add_argument("--seed", type=int, default=42, help="random seed") |
|
|
|
|
|
|
|
|
parser.add_argument("--shard", type=int, default=None, help="shard the dataset") |
|
|
parser.add_argument("--sort", type=str, default=None, help="sort by column") |
|
|
parser.add_argument("--sort-ascending", type=str, default=None, help="sort by column (ascending order)") |
|
|
parser.add_argument("--difference", type=str, default=None, help="get difference from the dataset") |
|
|
parser.add_argument( |
|
|
"--intersection", type=str, default=None, help="keep the paths in csv from the dataset and merge columns" |
|
|
) |
|
|
parser.add_argument("--train-column", action="store_true", help="only keep the train column") |
|
|
|
|
|
|
|
|
parser.add_argument("--info", action="store_true", help="get the basic information of each video and image") |
|
|
parser.add_argument("--video-info", action="store_true", help="get the basic information of each video") |
|
|
parser.add_argument("--audio-info", action="store_true", help="get the basic information of each audio") |
|
|
parser.add_argument("--ext", action="store_true", help="check if the file exists") |
|
|
parser.add_argument( |
|
|
"--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt" |
|
|
) |
|
|
parser.add_argument("--uni-fps", type=int, default=None, help="unify video fps") |
|
|
parser.add_argument("--extract-audio", action="store_true", help="extract and save audios from videos") |
|
|
parser.add_argument("--trim-audio", type=float, default=None, help="trim audios for the first x seconds") |
|
|
parser.add_argument("--trim", type=float, default=None, help="trim videos and audios for the first x seconds") |
|
|
parser.add_argument("--crop-resize", type=int, nargs='+', default=None, help="crop and resize videos to fixed shapes (width, height)") |
|
|
parser.add_argument("--resample-audio", action="store_true", help="resample audios to a specific Hz") |
|
|
parser.add_argument("--audio-sr", type=int, default=None, help="pre-defined audio sample rate") |
|
|
parser.add_argument("--dummy-video", action="store_true", help="set dummy videos for given audios") |
|
|
parser.add_argument("--overwrite", action="store_true", help="overwrite data file") |
|
|
parser.add_argument("--fix-video", action="store_true", help="fix video reading problem") |
|
|
|
|
|
|
|
|
parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given") |
|
|
parser.add_argument("--abspath", type=str, default=None, help="modify the path to absolute path by root given") |
|
|
parser.add_argument("--path-to-id", action="store_true", help="add id based on path") |
|
|
parser.add_argument( |
|
|
"--path-subset", type=str, default=None, help="extract a subset data containing the given `path-subset` value" |
|
|
) |
|
|
parser.add_argument( |
|
|
"--remove-empty-path", |
|
|
action="store_true", |
|
|
help="remove rows with empty path", |
|
|
) |
|
|
|
|
|
|
|
|
parser.add_argument( |
|
|
"--remove-empty-caption", |
|
|
action="store_true", |
|
|
help="remove rows with empty caption", |
|
|
) |
|
|
parser.add_argument("--remove-url", action="store_true", help="remove rows with url in caption") |
|
|
parser.add_argument("--lang", type=str, default=None, help="remove rows with other language") |
|
|
parser.add_argument("--remove-path-duplication", action="store_true", help="remove rows with duplicated path") |
|
|
parser.add_argument("--remove-text-duplication", action="store_true", help="remove rows with duplicated caption") |
|
|
|
|
|
|
|
|
parser.add_argument("--refine-llm-caption", action="store_true", help="modify the caption generated by LLM") |
|
|
parser.add_argument( |
|
|
"--clean-caption", action="store_true", help="modify the caption according to T5 pipeline to suit training" |
|
|
) |
|
|
parser.add_argument("--merge-cmotion", action="store_true", help="merge the camera motion to the caption") |
|
|
parser.add_argument( |
|
|
"--count-num-token", type=str, choices=["t5"], default=None, help="Count the number of tokens in the caption" |
|
|
) |
|
|
parser.add_argument("--append-text", type=str, default=None, help="append text to the caption") |
|
|
parser.add_argument("--score-to-text", action="store_true", help="convert score to text") |
|
|
parser.add_argument("--update-text", type=str, default=None, help="update the text with the given text") |
|
|
|
|
|
|
|
|
parser.add_argument("--filesize", action="store_true", help="get the filesize of each video and image in MB") |
|
|
parser.add_argument("--fsmax", type=int, default=None, help="filter the dataset by maximum filesize") |
|
|
parser.add_argument("--fmin", type=int, default=None, help="filter the dataset by minimum number of frames") |
|
|
parser.add_argument("--fmax", type=int, default=None, help="filter the dataset by maximum number of frames") |
|
|
parser.add_argument("--hwmax", type=int, default=None, help="filter the dataset by maximum resolution") |
|
|
parser.add_argument("--aesmin", type=float, default=None, help="filter the dataset by minimum aes score") |
|
|
parser.add_argument("--matchmin", type=float, default=None, help="filter the dataset by minimum match score") |
|
|
parser.add_argument("--flowmin", type=float, default=None, help="filter the dataset by minimum flow score") |
|
|
parser.add_argument("--fpsmax", type=float, default=None, help="filter the dataset by maximum fps") |
|
|
parser.add_argument("--img-only", action="store_true", help="only keep the image data") |
|
|
parser.add_argument("--vid-only", action="store_true", help="only keep the video data") |
|
|
parser.add_argument("--ocrmax", type=int, default=None, help="filter the dataset by maximum orc score") |
|
|
parser.add_argument("--nospeech", action="store_true", help="filter the dataset by speech detection results") |
|
|
|
|
|
|
|
|
parser.add_argument("--shuffle", default=False, action="store_true", help="shuffle the dataset") |
|
|
parser.add_argument("--head", type=int, default=None, help="return the first n rows of data") |
|
|
|
|
|
return parser.parse_args() |
|
|
|
|
|
|
|
|
def get_output_path(args, input_name): |
|
|
if args.output is not None: |
|
|
return args.output |
|
|
name = input_name |
|
|
dir_path = os.path.dirname(args.input[0]) |
|
|
|
|
|
|
|
|
if args.sort is not None: |
|
|
assert args.sort_ascending is None |
|
|
name += "_sort" |
|
|
if args.sort_ascending is not None: |
|
|
assert args.sort is None |
|
|
name += "_sort" |
|
|
|
|
|
|
|
|
|
|
|
if args.info: |
|
|
name += "_info" |
|
|
if args.video_info: |
|
|
name += "_vinfo" |
|
|
if args.audio_info: |
|
|
name += "_ainfo" |
|
|
if args.ext: |
|
|
name += "_ext" |
|
|
if args.load_caption: |
|
|
name += f"_load{args.load_caption}" |
|
|
if args.uni_fps is not None: |
|
|
name += f"_fps{args.uni_fps}" |
|
|
if args.extract_audio: |
|
|
name += "_au" |
|
|
if args.audio_sr is not None: |
|
|
name += f"_sr{args.audio_sr}" |
|
|
if args.trim_audio is not None: |
|
|
name += f"_trim{args.trim_audio}s" |
|
|
if args.trim is not None: |
|
|
name += f"_trim{args.trim}s" |
|
|
if args.crop_resize is not None: |
|
|
name += f"_crop_resize{args.crop_resize[0]}x{args.crop_resize[1]}" |
|
|
if args.resample_audio: |
|
|
assert args.audio_sr is not None |
|
|
name += f"_sr{args.audio_sr}" |
|
|
if args.dummy_video: |
|
|
name += f"_dummy_videos" |
|
|
if args.fix_video: |
|
|
name += f"_videofix" |
|
|
|
|
|
|
|
|
if args.relpath is not None: |
|
|
name += "_relpath" |
|
|
if args.abspath is not None: |
|
|
name += "_abspath" |
|
|
if args.remove_empty_path: |
|
|
name += "_noemptypath" |
|
|
|
|
|
|
|
|
if args.remove_empty_caption: |
|
|
name += "_noempty" |
|
|
if args.remove_url: |
|
|
name += "_nourl" |
|
|
if args.lang is not None: |
|
|
name += f"_{args.lang}" |
|
|
if args.remove_path_duplication: |
|
|
name += "_noduppath" |
|
|
if args.remove_text_duplication: |
|
|
name += "_noduptext" |
|
|
if args.path_subset: |
|
|
name += "_subset" |
|
|
|
|
|
|
|
|
if args.refine_llm_caption: |
|
|
name += "_llm" |
|
|
if args.clean_caption: |
|
|
name += "_clean" |
|
|
if args.merge_cmotion: |
|
|
name += "_cmcaption" |
|
|
if args.count_num_token: |
|
|
name += "_ntoken" |
|
|
if args.append_text is not None: |
|
|
name += "_appendtext" |
|
|
if args.score_to_text: |
|
|
name += "_score2text" |
|
|
if args.update_text is not None: |
|
|
name += "_update" |
|
|
|
|
|
|
|
|
if args.filesize: |
|
|
name += "_filesize" |
|
|
if args.fsmax is not None: |
|
|
name += f"_fsmax{args.fsmax}" |
|
|
if args.fmin is not None: |
|
|
name += f"_fmin{args.fmin}" |
|
|
if args.fmax is not None: |
|
|
name += f"_fmax{args.fmax}" |
|
|
if args.fpsmax is not None: |
|
|
name += f"_fpsmax{args.fpsmax}" |
|
|
if args.hwmax is not None: |
|
|
name += f"_hwmax{args.hwmax}" |
|
|
if args.aesmin is not None: |
|
|
name += f"_aesmin{args.aesmin}" |
|
|
if args.matchmin is not None: |
|
|
name += f"_matchmin{args.matchmin}" |
|
|
if args.flowmin is not None: |
|
|
name += f"_flowmin{args.flowmin}" |
|
|
if args.ocrmax is not None: |
|
|
name += f"_ocrmax{args.ocrmax}" |
|
|
if args.img_only: |
|
|
name += "_img" |
|
|
if args.vid_only: |
|
|
name += "_vid" |
|
|
if args.nospeech: |
|
|
name += "_nospeech" |
|
|
|
|
|
|
|
|
if args.shuffle: |
|
|
name += f"_shuffled_seed{args.seed}" |
|
|
if args.head is not None: |
|
|
name += f"_first_{args.head}_data" |
|
|
|
|
|
output_path = os.path.join(dir_path, f"{name}.{args.format}") |
|
|
return output_path |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
args = parse_args() |
|
|
if args.disable_parallel: |
|
|
PANDA_USE_PARALLEL = False |
|
|
if PANDA_USE_PARALLEL: |
|
|
if args.num_workers is not None: |
|
|
pandarallel.initialize(nb_workers=args.num_workers, progress_bar=True) |
|
|
else: |
|
|
pandarallel.initialize(progress_bar=True) |
|
|
if args.seed is not None: |
|
|
random.seed(args.seed) |
|
|
np.random.seed(args.seed) |
|
|
main(args) |
|
|
|