ROOT_VIDEO="/ssd2/kailiu/TAVGBench/TAVGBench"
ROOT_META="debug/meta/tavgbench_v03"
aesmin=4.0   # aesthetic-score threshold; worth trying other values
flowmin=0.1  # optical-flow-score threshold; worth trying other values
ocrmax=5     # OCR threshold: clips with more than ${ocrmax} detected characters are filtered out
fmin=1
ngpus=2
export CUDA_VISIBLE_DEVICES="1,7"

## 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv
# python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv #\
##     --difference data/meta/st_prior/meta_info_fmin10_fmax1000_au_sr16000_mmtrail136k_tavgbench240k_h100_unpaired_audios_aes.csv
##     --difference data/meta/mmtrail_v01/meta_info_fmin1_fmax1000_au_sr16000_nospeech_fps24_training.csv

## 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin${fmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin ${fmin} #0 --fmax 1000

## 1.3 Delete invalid videos
# python data_pipeline/sync_va/src/utils.py --meta_file ${ROOT_META}/meta_info_fmin${fmin}.csv --ref_file ${ROOT_META}/meta.csv

## 2.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes.csv
# torchrun --nproc_per_node ${ngpus} -m tools.scoring.aesthetic.inference \
#     ${ROOT_META}/meta_info_fmin${fmin}.csv \
#     --bs 256 --num_workers 16

## 2.1.1 Filter by aesthetic scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_aesmin${aesmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes.csv --aesmin ${aesmin}

## 2.2 Compute OCR scores
## 2.2.1 Download the pretrained text detector
# mkdir -p ./pretrained_models
# wget https://download.openmmlab.com/mmocr/textdet/dbnetpp/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth -P ./pretrained_models/
## 2.2.2 Inference
# torchrun --standalone --nproc_per_node ${ngpus} -m tools.scoring.ocr.inference \
#     ${ROOT_META}/meta_info_fmin${fmin}_aes.csv \
#     --bs 8 --num_workers 8

## 2.3.1 Extract audio from the videos and unify the sample rate to 16 kHz
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr.csv --extract-audio --audio-sr 16000

## 2.3.2 Detect speech in the extracted audio
# python -m tools.scoring.speech.speech_det ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000.csv --ngpus ${ngpus}

## 2.3.3 Filter by aesthetic scores, OCR scores, and speech detection.
## This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech.csv --aesmin ${aesmin} --ocrmax ${ocrmax} --nospeech
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_nospeech.csv --aesmin ${aesmin} --ocrmax ${ocrmax}

## 2.4 Compute optical flow to measure motion quality.
## This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv
## 2.4.1 Download the pretrained model
# mkdir -p ./pretrained_models/unimatch/
# wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P ./pretrained_models/unimatch/
## 2.4.2 Inference
# torchrun --standalone --nproc_per_node ${ngpus} tools/scoring/optical_flow/inference.py \
#     ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech.csv \
#     --bs 4 --num_workers 8
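
## Optional sanity check (added sketch, not part of the original pipeline): report
## how many samples survive a given stage before applying the next filter.
## Assumes every meta CSV has exactly one header row.
count_rows() {
    local csv="$1"
    if [ -f "${csv}" ]; then
        echo "$(( $(wc -l < "${csv}") - 1 )) rows: ${csv}"
    else
        echo "missing: ${csv}"
    fi
}
# count_rows ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv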
## 2.4.3 Filter by flow scores.
## This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv --flowmin ${flowmin}

## 2.5 Delete invalid videos
# python -m tools.datasets.del_invalid \
#     --meta_file ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv \
#     --ref_file ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv

## One-off (separate meta file): unify video FPS to 16
# python -m tools.datasets.datautil /root/workspace/datasets/JavisDiT/train/video/meta_new.csv --uni-fps 16 --overwrite --num-workers 8

## 3.1 Generate captions. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption.csv
# pip install -U vllm
# export VLLM_WORKER_MULTIPROC_METHOD="spawn"
# export DECORD_DUPLICATE_WARNING_THRESHOLD=1.0
# export FORCE_QWENVL_VIDEO_READER="decord"  # "decord" or "torchvision"
# export VLLM_USE_V1=0
# declare -a GPUS=(1 4)  # 6
# ngpus=2  # ${#GPUS[@]}
# for i in "${!GPUS[@]}"; do
#     gpu=${GPUS[$i]}
#     # i=1
#     CUDA_VISIBLE_DEVICES=$gpu nohup python -m tools.caption.caption_qwen25omni_vllm \
#         --model_name_or_path ./pretrained_models/Qwen2.5-Omni-7B \
#         --batch-size 8 --num-workers 8 \
#         --part_num ${ngpus} --part_idx ${i} \
#         --input-file ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv \
#         --output-file ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption_part${i}.csv \
#         > nohup.infer.part${i}.log 2>&1 &
# done
## Merge the per-GPU caption parts into a single CSV
# python -m tools.datasets.datautil ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_*csv \
#     --output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption.csv \
#     --remove-path-duplication

## TODO: 3.2 Clean captions.

## 4.1 Fix broken videos
# python -m tools.datasets.datautil data/meta/tavgbench_filter_360k.csv --fix-video
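
## Optional sketch (not part of the original pipeline; assumes ffprobe from FFmpeg
## is installed): flag videos in a meta CSV that cannot be decoded, e.g. to decide
## which rows to drop or to pass through --fix-video. Assumes the first CSV column
## is the video path and the file has one header row; paths containing commas
## would need a real CSV parser instead of cut.
check_videos() {
    local csv="$1"
    tail -n +2 "${csv}" | cut -d, -f1 | while read -r path; do
        ffprobe -v error -select_streams v:0 -show_entries stream=codec_name \
            -of csv=p=0 "${path}" > /dev/null 2>&1 || echo "broken: ${path}"
    done
}
# check_videos data/meta/tavgbench_filter_360k.csv > broken_videos.txt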