ROOT_VIDEO="/ssd2/kailiu/TAVGBench/TAVGBench"
ROOT_META="debug/meta/tavgbench_v03"
aesmin=4.0  # aesthetic-score threshold; worth tuning further
flowmin=0.1 # optical-flow score threshold; worth tuning further
ocrmax=5    # OCR threshold: clips with more than ocrmax detected characters are filtered out
fmin=1
ngpus=2
export CUDA_VISIBLE_DEVICES="1,7"
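# (Optional) sanity-check the GPU setup before launching any of the steps below; a minimal
# sketch, not part of the original pipeline, assuming torch is installed in this environment.
# nvidia-smi --query-gpu=index,name,memory.total --format=csv
# python -c "import torch; print(torch.cuda.device_count())"  # should match ${ngpus}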
# # 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv
# python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv #\
# # --difference data/meta/st_prior/meta_info_fmin10_fmax1000_au_sr16000_mmtrail136k_tavgbench240k_h100_unpaired_audios_aes.csv
# # --difference data/meta/mmtrail_v01/meta_info_fmin1_fmax1000_au_sr16000_nospeech_fps24_training.csv
# # 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin${fmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin ${fmin} #0 --fmax 1000
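# (Optional) quick sanity check on the generated meta file before moving on; a minimal
# sketch using standard shell tools, assuming the CSV keeps a header row.
# head -n 1 ${ROOT_META}/meta_info_fmin${fmin}.csv              # inspect column names
# tail -n +2 ${ROOT_META}/meta_info_fmin${fmin}.csv | wc -l     # count surviving clips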
# # 1.3 Delete invalid videos
# python data_pipeline/sync_va/src/utils.py --meta_file ${ROOT_META}/meta_info_fmin${fmin}.csv --ref_file ${ROOT_META}/meta.csv
# # 2.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes.csv
# torchrun --nproc_per_node ${ngpus} -m tools.scoring.aesthetic.inference \
#     ${ROOT_META}/meta_info_fmin${fmin}.csv \
#     --bs 256 --num_workers 16
# # 2.1.1 Filter by aesthetic scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_aesmin${aesmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes.csv --aesmin ${aesmin}
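# (Optional) eyeball the aesthetic-score distribution before committing to aesmin; a minimal
# sketch, assuming a simple comma-separated layout with a column literally named "aes"
# (adjust the header lookup if the scoring tool uses a different column name).
# awk -F',' 'NR==1{for(i=1;i<=NF;i++) if($i=="aes") c=i; next} c{print $c}' \
#     ${ROOT_META}/meta_info_fmin${fmin}_aes.csv | sort -n | \
#     awk '{v[NR]=$1} END{print "min="v[1], "median="v[int(NR/2)+1], "max="v[NR]}'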
# # 2.2 Compute OCR score
# # 2.2.1 download
# mkdir -p ./pretrained_models
# wget https://download.openmmlab.com/mmocr/textdet/dbnetpp/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth -P ./pretrained_models/
# # 2.2.2 inference
# torchrun --standalone --nproc_per_node ${ngpus} -m tools.scoring.ocr.inference \
#     ${ROOT_META}/meta_info_fmin${fmin}_aes.csv \
#     --bs 8 --num_workers 8
# # 2.3.1 Extract audio from videos and unify the sample rate to 16 kHz
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr.csv --extract-audio --audio-sr 16000
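# (Reference) roughly what the extraction step does for a single clip; a minimal sketch
# using plain ffmpeg with a placeholder path, assuming mono 16 kHz wav output -- the actual
# channel/format handling inside tools.datasets.datautil may differ.
# ffmpeg -i /path/to/clip.mp4 -vn -ac 1 -ar 16000 /path/to/clip.wav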
# # 2.3.2 Detect speech in the extracted audio
# python -m tools.scoring.speech.speech_det ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000.csv --ngpus 4
# # 2.3.3 Filter by aesthetic and OCR scores and by speech detection.
# #       This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech.csv --aesmin ${aesmin} --ocrmax ${ocrmax} --nospeech
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_nospeech.csv --aesmin ${aesmin} --ocrmax ${ocrmax}
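# (Optional) check how many clips survive the aesthetic/OCR/speech filters; a minimal
# sketch, assuming the filtered CSV keeps a header row.
# tail -n +2 ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech.csv | wc -l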
# # 2.4 Compute optical flow to measure motion quality. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv
# # 2.4.1 download
# mkdir -p ./pretrained_models/unimatch/
# wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P ./pretrained_models/unimatch/
# # 2.4.2 inference
# torchrun --standalone --nproc_per_node ${ngpus} tools/scoring/optical_flow/inference.py \
#     ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech.csv \
#     --bs 4 --num_workers 8
# # 2.4.3 Filter by flow scores. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv
# python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv --flowmin ${flowmin}
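# (Optional) gauge how aggressive the flowmin cut is by comparing row counts before and
# after filtering; a minimal sketch, assuming both CSVs keep a header row.
# for f in ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv \
#          ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv; do
#     echo "$(tail -n +2 $f | wc -l) clips in $f"
# done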
# # 2.5 Delete invalid videos
# python -m tools.datasets.del_invalid \
#     --meta_file ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv \
#     --ref_file ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow.csv
# python -m tools.datasets.datautil /root/workspace/datasets/JavisDiT/train/video/meta_new.csv --uni-fps 16 --overwrite --num-workers 8
# 3.1 Generate caption. This should output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption.csv
# pip install -U vllm
# export VLLM_WORKER_MULTIPROC_METHOD="spawn"
# export DECORD_DUPLICATE_WARNING_THRESHOLD=1.0
# export FORCE_QWENVL_VIDEO_READER="decord" # "decord" or "torchvision"
# export VLLM_USE_V1=0
# declare -a GPUS=(1 4) # 6
# ngpus=${#GPUS[@]}  # keep in sync with the GPUS array (currently 2)
# for i in "${!GPUS[@]}"; do
#     gpu=${GPUS[$i]}
#     # i=1
#     CUDA_VISIBLE_DEVICES=$gpu nohup python -m tools.caption.caption_qwen25omni_vllm \
#         --model_name_or_path ./pretrained_models/Qwen2.5-Omni-7B \
#         --batch-size 8 --num-workers 8 \
#         --part_num ${ngpus} --part_idx ${i} \
#         --input-file ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv \
#         --output-file ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption_part${i}.csv \
#         > nohup.infer.part${i}.log 2>&1 &
# done
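# (Optional) monitor the captioning workers; a minimal sketch, assuming each worker appends
# finished rows to its _caption_part${i}.csv as it goes (if the tool only writes the file at
# the end, the row count will stay empty until completion).
# tail -n 5 nohup.infer.part*.log
# wc -l ${ROOT_META}/parts/*_caption_part*.csv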
# python -m tools.datasets.datautil ${ROOT_META}/parts/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption_part*.csv \
#     --output ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption.csv \
#     --remove-path-duplication
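# (Optional) verify the merge by comparing the pre-caption CSV against the merged caption CSV;
# a minimal sketch, assuming both files keep a header row (note --remove-path-duplication may
# drop rows, so the caption file can be slightly smaller).
# wc -l ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}.csv \
#       ${ROOT_META}/meta_info_fmin${fmin}_aes_ocr_au_sr16000_detspeech_aesmin${aesmin}_ocrmax${ocrmax}_nospeech_flow_flowmin${flowmin}_caption.csv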
# TODO: 3.2 Clean caption.
# 4.1 Fix broken videos
# python -m tools.datasets.datautil data/meta/tavgbench_filter_360k.csv --fix-video
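# (Optional) spot-check a repaired clip with ffprobe; a minimal sketch with a placeholder
# path -- error output (or a missing duration) usually indicates the file is still broken.
# ffprobe -v error -show_entries format=duration -of csv=p=0 /path/to/fixed_clip.mp4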