#!/bin/bash
# data_prepare.sh: filter and caption TAVGBench videos (JAV-Gen data pipeline).
ROOT_VIDEO="/ssd2/kailiu/TAVGBench/TAVGBench"
ROOT_META="debug/meta/tavgbench_v03"
aesmin=4.0 # aesthetic score threshold; worth tuning further
flowmin=0.1 # optical-flow score threshold; worth tuning further
ocrmax=5 # OCR threshold: filter out videos with more than ocrmax detected text characters
fmin=1
ngpus=2
export CUDA_VISIBLE_DEVICES="1,7"
# # 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv
# python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv #\
# # --difference data/meta/st_prior/meta_info_fmin10_fmax1000_au_sr16000_mmtrail136k_tavgbench240k_h100_unpaired_audios_aes.csv
# # --difference data/meta/mmtrail_v01/meta_info_fmin1_fmax1000_au_sr16000_nospeech_fps24_training.csv
# # # 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin1.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin ${fmin} # alternative bounds: --fmin 0 --fmax 1000
# # 1.3 Delete invalid videos
# python data_pipeline/sync_va/src/utils.py --meta_file ${ROOT_META}/meta_info_fmin${fmin}.csv --ref_file ${ROOT_META}/meta.csv
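# A quick sanity check after deletion (assuming standard CSVs with one header row):
# compare row counts before and after to see how many clips were dropped.
# echo "before: $(($(wc -l < ${ROOT_META}/meta.csv) - 1)) clips"
# echo "after:  $(($(wc -l < ${ROOT_META}/meta_info_fmin${fmin}.csv) - 1)) clips"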
# # 2.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes.csv
# torchrun --nproc_per_node ${ngpus} -m tools.scoring.aesthetic.inference \
# ${ROOT_META}/meta_info_fmin${fmin}.csv \
# --bs 256 --num_workers 16
# # # 2.1.1 Filter by aesthetic scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_aesmin${aesmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes.csv --aesmin ${aesmin}
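# For reference, the --aesmin filter presumably reduces to a one-line pandas
# threshold; the "aes" column name and output filename below are assumptions,
# not confirmed behavior of datautil:
# python -c "import pandas as pd; df = pd.read_csv('${ROOT_META}/meta_info_fmin${fmin}_aes.csv'); df[df['aes'] >= ${aesmin}].to_csv('filtered.csv', index=False)"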
# # 2.2 Compute OCR score
# # 2.2.1 download
# mkdir -p ./pretrained_models
# wget https://download.openmmlab.com/mmocr/textdet/dbnetpp/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth -P ./pretrained_models/
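# Sanity-check the checkpoint before inference; an interrupted wget can leave a
# partial file behind, so verify it exists and is non-empty:
# ckpt=./pretrained_models/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth
# [ -s "$ckpt" ] || echo "OCR checkpoint missing or empty: $ckpt"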
# 2.2.2 inference
# torchrun --standalone --nproc_per_node ${ngpus} -m tools.scoring.ocr.inference \
# ${ROOT_META}/meta_info_fmin${fmin}_aes.csv \
# --bs 8 --num_workers 8
# # 2.3.1 Extract audio from videos and unify the sample rate to 16 kHz
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr.csv --extract-audio --audio-sr 16000
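# Per file, the extraction step presumably amounts to an ffmpeg call like the
# sketch below; mono output and the WAV container are assumptions, not
# confirmed behavior of the tool:
# ffmpeg -i input.mp4 -vn -ac 1 -ar 16000 output.wav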
# # # 2.3.2 Detect speech audios
# python -m tools.scoring.speech.speech_det ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000.csv --ngpus 4
# # 2.3.3 Filter by aesthetic and OCR scores, and by speech detection.
# This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech.csv --aesmin ${aesmin} --ocrmax ${ocrmax} --nospeech
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_nospeech.csv --aesmin ${aesmin} --ocrmax ${ocrmax}
# # 2.4 Compute optical flow to measure motion quality. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv
# # 2.4.1 download
# mkdir -p ./pretrained_models/unimatch/
# wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P ./pretrained_models/unimatch/
# # 2.4.2 inference
# torchrun --standalone --nproc_per_node ${ngpus} tools/scoring/optical_flow/inference.py \
# ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech.csv \
# --bs 4 --num_workers 8
# # 2.4.3 Filter by flow scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv --flowmin ${flowmin}
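# Since flowmin is worth tuning (see the note at the top), inspecting the score
# distribution helps pick a cutoff; the "flow" column name is an assumption:
# python -c "import pandas as pd; print(pd.read_csv('${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv')['flow'].describe())"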
# # 2.5 Delete invalid videos
# python -m tools.datasets.del_invalid \
# --meta_file ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv \
# --ref_file ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv
# python -m tools.datasets.datautil /root/workspace/datasets/JavisDiT/train/video/meta_new.csv --uni-fps 16 --overwrite --num-workers 8
# 3.1 Generate caption. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption.csv
# pip install -U vllm
# export VLLM_WORKER_MULTIPROC_METHOD="spawn"
# export DECORD_DUPLICATE_WARNING_THRESHOLD=1.0
# export FORCE_QWENVL_VIDEO_READER="decord" # "decord" or "torchvision"
# export VLLM_USE_V1=0
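# A quick smoke test that the vLLM install imports cleanly before launching the
# long-running caption jobs:
# python -c "import vllm; print(vllm.__version__)"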
# declare -a GPUS=(1 4) # 6
# ngpus=2 #${#GPUS[@]}
# for i in "${!GPUS[@]}"; do
# gpu=${GPUS[$i]}
# # i=1
# CUDA_VISIBLE_DEVICES=$gpu nohup python -m tools.caption.caption_qwen25omni_vllm \
# --model_name_or_path ./pretrained_models/Qwen2.5-Omni-7B \
# --batch-size 8 --num-workers 8 \
# --part_num ${ngpus} --part_idx ${i} \
# --input-file ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv \
# --output-file ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption_part${i}.csv \
# >nohup.infer.part${i}.log 2>&1 &
# done
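# The caption jobs above are launched with nohup in the background, so block
# until all of them finish before merging the parts (a simple approach,
# assuming no other background jobs in this shell):
# wait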
# python -m tools.datasets.datautil ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_*csv \
# --output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption.csv \
# --remove-path-duplication
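# --remove-path-duplication presumably drops rows that share a video path; a
# minimal pandas sketch, where the "path" column name and the filenames are
# assumptions for illustration:
# python -c "import pandas as pd; df = pd.read_csv('merged.csv'); df.drop_duplicates(subset='path').to_csv('merged_dedup.csv', index=False)"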
# TODO: 3.2 Clean captions.
# 4.1 Fix broken videos
# python -m tools.datasets.datautil data/meta/tavgbench_filter_360k.csv --fix-video
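# One common repair that --fix-video may correspond to is a lossless remux,
# which rebuilds container metadata without re-encoding (a sketch, not the
# tool's confirmed behavior):
# ffmpeg -i broken.mp4 -c copy fixed.mp4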