Spaces: Running on Zero
Commit e528a23 (parent: c9e5c19)
add gradio app and dependencies
This view is limited to 50 files because it contains too many changes.
- app.py +102 -0
- config/infer.yaml +19 -0
- depth_anything_utils.py +249 -0
- depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov3_adpther.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/__pycache__/dpt.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2.py +415 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__init__.py +11 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc +0 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/attention.py +83 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/block.py +252 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/layer_scale.py +28 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/mlp.py +41 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/patch_embed.py +89 -0
- depth_anything_v2_metric/depth_anything_v2/dinov2_layers/swiglu_ffn.py +63 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/.docstr.yaml +6 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/.github/workflows/lint.yaml +47 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/.gitignore +18 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/CODE_OF_CONDUCT.md +80 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/CONTRIBUTING.md +31 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/LICENSE.md +66 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/MODEL_CARD.md +432 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/README.md +734 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/conda.yaml +23 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/__init__.py +6 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/__init__.py +18 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/checkpointer.py +352 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/__init__.py +16 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/config.py +222 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/ssl_default_config.yaml +205 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_gram_anchor.yaml +203 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_high_res_adapt.yaml +224 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_pretrain.yaml +172 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vitl16_lvd1689m_distilled.yaml +251 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multi_distillation_test.yaml +27 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vitb_p16.yaml +7 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vits_p16.yaml +6 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/vitl_im1k_lin834.yaml +143 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/__init__.py +12 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/adapters.py +68 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/augmentations.py +227 -0
- depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/collate.py +125 -0
app.py
ADDED
@@ -0,0 +1,102 @@
from __future__ import absolute_import, division, print_function

import os, sys
import cv2
import yaml
import torch
import numpy as np
import torch.nn as nn
import gradio as gr
from huggingface_hub import hf_hub_download

# ========== Make the Space able to import this project ==========
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))  # app.py sits at the repo root
sys.path.append(PROJECT_ROOT)

from networks.models import make

device = "cuda" if torch.cuda.is_available() else "cpu"

# ====== HF weights repo configuration (already uploaded) ======
WEIGHTS_REPO = "Insta360-Research/DAP-weights"
WEIGHTS_FILE = "model.pth"

# ========== Visualization ==========
def colorize_depth(depth, colormap=cv2.COLORMAP_JET):
    depth = depth.astype(np.float32)
    depth_norm = (depth - depth.min()) / (depth.max() - depth.min() + 1e-6)
    depth_u8 = (depth_norm * 255).astype(np.uint8)
    return cv2.applyColorMap(depth_u8, colormap)  # BGR

# ========== Load the model (only once) ==========
def load_model(config_path: str):
    with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    print(f"Downloading weights from HF: {WEIGHTS_REPO}/{WEIGHTS_FILE}")
    model_path = hf_hub_download(repo_id=WEIGHTS_REPO, filename=WEIGHTS_FILE)
    print(f"✅ Weights downloaded to: {model_path}")

    state = torch.load(model_path, map_location=device)

    model = make(config["model"])
    if any(k.startswith("module") for k in state.keys()):
        model = nn.DataParallel(model)

    model = model.to(device)

    model_state = model.state_dict()
    model.load_state_dict({k: v for k, v in state.items() if k in model_state}, strict=False)
    model.eval()
    print("✅ Model loaded.")
    return model

# Change this to the config path in your repo
CONFIG_PATH = "config/infer.yaml"
model = load_model(CONFIG_PATH)

# ========== Single-image inference ==========
@torch.no_grad()
def predict(img_rgb: np.ndarray):
    """
    img_rgb: H x W x 3 (RGB), uint8
    return: depth_color_rgb, depth_gray
    """
    if img_rgb is None:
        return None, None

    img = img_rgb.astype(np.float32) / 255.0
    tensor = torch.from_numpy(img.transpose(2, 0, 1)).unsqueeze(0).to(device)

    outputs = model(tensor)

    if isinstance(outputs, dict) and "pred_depth" in outputs:
        # original mask logic
        if "pred_mask" in outputs:
            outputs["pred_mask"] = 1 - outputs["pred_mask"]
            outputs["pred_mask"] = (outputs["pred_mask"] > 0.5)
            outputs["pred_depth"][~outputs["pred_mask"]] = 1
        pred = outputs["pred_depth"][0].detach().cpu().squeeze().numpy()
    else:
        pred = outputs[0].detach().cpu().squeeze().numpy()

    pred_clip = np.clip(pred, 0.001, 1.0)
    depth_gray = (pred_clip * 255).astype(np.uint8)

    depth_color_bgr = colorize_depth(pred_clip, cv2.COLORMAP_JET)
    depth_color_rgb = cv2.cvtColor(depth_color_bgr, cv2.COLOR_BGR2RGB)

    return depth_color_rgb, depth_gray

demo = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="numpy", label="Input Image"),
    outputs=[
        gr.Image(type="numpy", label="Depth (Color)"),
        gr.Image(type="numpy", label="Depth (Gray)"),
    ],
    title="DAP Depth Prediction Demo",
    description="Upload an image and get depth prediction."
)

demo.launch()
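The colorize_depth / predict pair above normalizes the raw prediction into [0, 1] before rendering. A minimal, self-contained sketch of that post-processing step, using only NumPy and OpenCV with a synthetic depth map standing in for the model output:

import cv2
import numpy as np

# Synthetic stand-in for outputs["pred_depth"] after clipping to [0.001, 1.0]
pred_clip = np.clip(np.random.rand(512, 1024).astype(np.float32), 0.001, 1.0)

depth_gray = (pred_clip * 255).astype(np.uint8)  # 8-bit grayscale output

depth_norm = (pred_clip - pred_clip.min()) / (pred_clip.max() - pred_clip.min() + 1e-6)
depth_color_bgr = cv2.applyColorMap((depth_norm * 255).astype(np.uint8), cv2.COLORMAP_JET)
depth_color_rgb = cv2.cvtColor(depth_color_bgr, cv2.COLOR_BGR2RGB)  # Gradio displays RGB

print(depth_gray.shape, depth_color_rgb.shape)  # (512, 1024) (512, 1024, 3)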
config/infer.yaml
ADDED
@@ -0,0 +1,19 @@
model:
  name: dap
  args:
    midas_model_type: vitl
    fine_tune_type: hypersim
    min_depth: 0.01
    max_depth: 1.0
    train_decoder: True

    median_align: False
  load_weights_dir: checkpoints
input:
  height: 512
  width: 1024
inference:
  batch_size: 1
  num_workers: 1
  save_colormap: True
  colormap_type: jet
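For reference, this is roughly how load_model in app.py consumes the file; a short sketch using yaml.safe_load. The indentation of the nested keys was reconstructed from the flattened diff above, so the exact nesting of median_align and load_weights_dir is an assumption:

import yaml

with open("config/infer.yaml", "r") as f:
    config = yaml.safe_load(f)

# config["model"] is the block handed to networks.models.make(...) in app.py
print(config["model"]["name"])                               # dap
print(config["model"]["args"]["midas_model_type"])           # vitl
print(config["input"]["height"], config["input"]["width"])   # 512 1024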
depth_anything_utils.py
ADDED
@@ -0,0 +1,249 @@
import os
import random
from PIL import Image, ImageOps, ImageFilter
import torch
from torchvision import transforms
import torch.nn.functional as F

import numpy as np
import cv2
import math


def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
    """Rezise the sample to ensure the given size. Keeps aspect ratio.

    Args:
        sample (dict): sample
        size (tuple): image size

    Returns:
        tuple: new size
    """
    shape = list(sample["disparity"].shape)

    if shape[0] >= size[0] and shape[1] >= size[1]:
        return sample

    scale = [0, 0]
    scale[0] = size[0] / shape[0]
    scale[1] = size[1] / shape[1]

    scale = max(scale)

    shape[0] = math.ceil(scale * shape[0])
    shape[1] = math.ceil(scale * shape[1])

    # resize
    sample["image"] = cv2.resize(
        sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
    )

    sample["disparity"] = cv2.resize(
        sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
    )
    sample["mask"] = cv2.resize(
        sample["mask"].astype(np.float32),
        tuple(shape[::-1]),
        interpolation=cv2.INTER_NEAREST,
    )
    sample["mask"] = sample["mask"].astype(bool)

    return tuple(shape)


class Resize(object):
    """Resize sample to given size (width, height).
    """

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as least as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as least as possbile
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(
                    f"resize_method {self.__resize_method} not implemented"
                )

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, min_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, min_val=self.__width
            )
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, max_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, max_val=self.__width
            )
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        width, height = self.get_size(
            sample["image"].shape[1], sample["image"].shape[0]
        )

        # resize sample
        sample["image"] = cv2.resize(
            sample["image"],
            (width, height),
            interpolation=self.__image_interpolation_method,
        )

        if self.__resize_target:
            if "disparity" in sample:
                sample["disparity"] = cv2.resize(
                    sample["disparity"],
                    (width, height),
                    interpolation=cv2.INTER_NEAREST,
                )

            if "depth" in sample:
                sample["depth"] = cv2.resize(
                    sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
                )

            if "semseg_mask" in sample:
                # sample["semseg_mask"] = cv2.resize(
                #     sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST
                # )
                sample["semseg_mask"] = F.interpolate(torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...], (height, width), mode='nearest').numpy()[0, 0]

            if "mask" in sample:
                sample["mask"] = cv2.resize(
                    sample["mask"].astype(np.float32),
                    (width, height),
                    interpolation=cv2.INTER_NEAREST,
                )
                # sample["mask"] = sample["mask"].astype(bool)

        # print(sample['image'].shape, sample['depth'].shape)
        return sample


class NormalizeImage(object):
    """Normlize image by given mean and std.
    """

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        sample["image"] = (sample["image"] - self.__mean) / self.__std

        return sample


class PrepareForNet(object):
    """Prepare sample for usage as network input.
    """

    def __init__(self):
        pass

    def __call__(self, sample):
        image = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(image).astype(np.float32)

        if "mask" in sample:
            sample["mask"] = sample["mask"].astype(np.float32)
            sample["mask"] = np.ascontiguousarray(sample["mask"])

        if "depth" in sample:
            depth = sample["depth"].astype(np.float32)
            sample["depth"] = np.ascontiguousarray(depth)

        if "semseg_mask" in sample:
            sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32)
            sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"])

        return sample
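These classes are designed to be chained on a dict-style sample. A small usage sketch follows; the target size 518 and the multiple-of-14 constraint mirror typical Depth-Anything preprocessing and are assumptions here, not values taken from this commit:

import cv2
import numpy as np
from torchvision.transforms import Compose
from depth_anything_utils import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method="lower_bound",
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

sample = {"image": np.random.rand(480, 640, 3).astype(np.float32)}  # H x W x 3 in [0, 1]
out = transform(sample)
print(out["image"].shape)  # (3, 518, 686): CHW, both sides multiples of 14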
depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc
ADDED
Binary file (12.2 kB)
depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov3_adpther.cpython-310.pyc
ADDED
Binary file (2.54 kB)
depth_anything_v2_metric/depth_anything_v2/__pycache__/dpt.cpython-310.pyc
ADDED
Binary file (6.62 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2.py
ADDED
@@ -0,0 +1,415 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

from functools import partial
import math
import logging
from typing import Sequence, Tuple, Union, Callable

import torch
import torch.nn as nn
import torch.utils.checkpoint
from torch.nn.init import trunc_normal_

from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block


logger = logging.getLogger("dinov2")


def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
    if not depth_first and include_root:
        fn(module=module, name=name)
    for child_name, child_module in module.named_children():
        child_name = ".".join((name, child_name)) if name else child_name
        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
    if depth_first and include_root:
        fn(module=module, name=name)
    return module


class BlockChunk(nn.ModuleList):
    def forward(self, x):
        for b in self:
            x = b(x)
        return x


class DinoVisionTransformer(nn.Module):
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()

    def init_weights(self):
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, w, h):
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
        w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
        # w0, h0 = w0 + 0.1, h0 + 0.1

        sqrt_N = math.sqrt(N)
        sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
            scale_factor=(sx, sy),
            # (int(w0), int(h0)), # to solve the upsampling shape issue
            mode="bicubic",
            antialias=self.interpolate_antialias
        )

        assert int(w0) == patch_pos_embed.shape[-2]
        assert int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_tokens_with_masks(self, x, masks=None):
        B, nc, w, h = x.shape
        x = self.patch_embed(x)
        if masks is not None:
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)

        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self.interpolate_pos_encoding(x, w, h)

        if self.register_tokens is not None:
            x = torch.cat(
                (
                    x[:, :1],
                    self.register_tokens.expand(x.shape[0], -1, -1),
                    x[:, 1:],
                ),
                dim=1,
            )

        return x

    def forward_features_list(self, x_list, masks_list):
        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
        for blk in self.blocks:
            x = blk(x)

        all_x = x
        output = []
        for x, masks in zip(all_x, masks_list):
            x_norm = self.norm(x)
            output.append(
                {
                    "x_norm_clstoken": x_norm[:, 0],
                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
                    "x_prenorm": x,
                    "masks": masks,
                }
            )
        return output

    def forward_features(self, x, masks=None):
        if isinstance(x, list):
            return self.forward_features_list(x, masks)

        x = self.prepare_tokens_with_masks(x, masks)

        for blk in self.blocks:
            x = blk(x)

        x_norm = self.norm(x)
        return {
            "x_norm_clstoken": x_norm[:, 0],
            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
            "x_prenorm": x,
            "masks": masks,
        }

    def _get_intermediate_layers_not_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        # If n is an int, take the n last blocks. If it's a list, take them
        output, total_block_len = [], len(self.blocks)
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if i in blocks_to_take:
                output.append(x)
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def _get_intermediate_layers_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        output, i, total_block_len = [], 0, len(self.blocks[-1])
        # If n is an int, take the n last blocks. If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def get_intermediate_layers(
        self,
        x: torch.Tensor,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        reshape: bool = False,
        return_class_token: bool = False,
        norm=True
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
        if self.chunked_blocks:
            outputs = self._get_intermediate_layers_chunked(x, n)
        else:
            outputs = self._get_intermediate_layers_not_chunked(x, n)
        if norm:
            outputs = [self.norm(out) for out in outputs]
        class_tokens = [out[:, 0] for out in outputs]
        outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
        if reshape:
            B, _, w, h = x.shape
            outputs = [
                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]
        if return_class_token:
            return tuple(zip(outputs, class_tokens))
        return tuple(outputs)

    def forward(self, *args, is_training=False, **kwargs):
        ret = self.forward_features(*args, **kwargs)
        if is_training:
            return ret
        else:
            return self.head(ret["x_norm_clstoken"])


def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)"""
    if isinstance(module, nn.Linear):
        trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)


def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
    """
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    """
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1536,
        depth=40,
        num_heads=24,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def DINOv2(model_name):
    model_zoo = {
        "vits": vit_small,
        "vitb": vit_base,
        "vitl": vit_large,
        "vitg": vit_giant2
    }

    return model_zoo[model_name](
        img_size=518,
        patch_size=14,
        init_values=1.0,
        ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
        block_chunks=0,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1
    )
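A brief sketch of how this backbone is typically queried for dense features (randomly initialized here; the import path assumes the repo root is on sys.path, and if xFormers is installed the memory-efficient attention path expects the tensors to live on GPU):

import torch
from depth_anything_v2_metric.depth_anything_v2.dinov2 import DINOv2

device = "cuda" if torch.cuda.is_available() else "cpu"
encoder = DINOv2("vits").to(device).eval()      # ViT-S/14, img_size=518, no register tokens

x = torch.randn(1, 3, 518, 518, device=device)  # 518 = 37 * 14, matches the default patch grid
with torch.no_grad():
    feats = encoder.get_intermediate_layers(x, n=4, reshape=True)

for f in feats:
    print(f.shape)  # torch.Size([1, 384, 37, 37]) for each of the last 4 blocks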
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from .mlp import Mlp
from .patch_embed import PatchEmbed
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
from .block import NestedTensorBlock
from .attention import MemEffAttention
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (456 Bytes)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc
ADDED
Binary file (2.42 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc
ADDED
Binary file (8.03 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc
ADDED
Binary file (1.26 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc
ADDED
Binary file (1.06 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc
ADDED
Binary file (1.25 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc
ADDED
Binary file (2.7 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc
ADDED
Binary file (2.05 kB)
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/attention.py
ADDED
@@ -0,0 +1,83 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging

from torch import Tensor
from torch import nn


logger = logging.getLogger("dinov2")


try:
    from xformers.ops import memory_efficient_attention, unbind, fmha

    XFORMERS_AVAILABLE = True
except ImportError:
    logger.warning("xFormers not available")
    XFORMERS_AVAILABLE = False


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
        attn = q @ k.transpose(-2, -1)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MemEffAttention(Attention):
    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            assert attn_bias is None, "xFormers is required for nested tensors usage"
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
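The base Attention module is plain multi-head self-attention over a token sequence; MemEffAttention swaps in xformers.ops.memory_efficient_attention when the library is importable and otherwise falls back to this path. A tiny, CPU-safe sketch using the base class:

import torch
from depth_anything_v2_metric.depth_anything_v2.dinov2_layers.attention import Attention

attn = Attention(dim=384, num_heads=6, qkv_bias=True)
tokens = torch.randn(2, 1370, 384)   # [batch, 1 cls + 37*37 patch tokens, embed_dim]
with torch.no_grad():
    out = attn(tokens)
print(out.shape)                     # torch.Size([2, 1370, 384]): shape is preserved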
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/block.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from typing import Callable, List, Any, Tuple, Dict
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
from torch import nn, Tensor
|
| 16 |
+
|
| 17 |
+
from .attention import Attention, MemEffAttention
|
| 18 |
+
from .drop_path import DropPath
|
| 19 |
+
from .layer_scale import LayerScale
|
| 20 |
+
from .mlp import Mlp
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger("dinov2")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
from xformers.ops import fmha
|
| 28 |
+
from xformers.ops import scaled_index_add, index_select_cat
|
| 29 |
+
|
| 30 |
+
XFORMERS_AVAILABLE = True
|
| 31 |
+
except ImportError:
|
| 32 |
+
logger.warning("xFormers not available")
|
| 33 |
+
XFORMERS_AVAILABLE = False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Block(nn.Module):
|
| 37 |
+
def __init__(
|
| 38 |
+
self,
|
| 39 |
+
dim: int,
|
| 40 |
+
num_heads: int,
|
| 41 |
+
mlp_ratio: float = 4.0,
|
| 42 |
+
qkv_bias: bool = False,
|
| 43 |
+
proj_bias: bool = True,
|
| 44 |
+
ffn_bias: bool = True,
|
| 45 |
+
drop: float = 0.0,
|
| 46 |
+
attn_drop: float = 0.0,
|
| 47 |
+
init_values=None,
|
| 48 |
+
drop_path: float = 0.0,
|
| 49 |
+
act_layer: Callable[..., nn.Module] = nn.GELU,
|
| 50 |
+
norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
|
| 51 |
+
attn_class: Callable[..., nn.Module] = Attention,
|
| 52 |
+
ffn_layer: Callable[..., nn.Module] = Mlp,
|
| 53 |
+
) -> None:
|
| 54 |
+
super().__init__()
|
| 55 |
+
# print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
|
| 56 |
+
self.norm1 = norm_layer(dim)
|
| 57 |
+
self.attn = attn_class(
|
| 58 |
+
dim,
|
| 59 |
+
num_heads=num_heads,
|
| 60 |
+
qkv_bias=qkv_bias,
|
| 61 |
+
proj_bias=proj_bias,
|
| 62 |
+
attn_drop=attn_drop,
|
| 63 |
+
proj_drop=drop,
|
| 64 |
+
)
|
| 65 |
+
self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
|
| 66 |
+
self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 67 |
+
|
| 68 |
+
self.norm2 = norm_layer(dim)
|
| 69 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 70 |
+
self.mlp = ffn_layer(
|
| 71 |
+
in_features=dim,
|
| 72 |
+
hidden_features=mlp_hidden_dim,
|
| 73 |
+
act_layer=act_layer,
|
| 74 |
+
drop=drop,
|
| 75 |
+
bias=ffn_bias,
|
| 76 |
+
)
|
| 77 |
+
self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
|
| 78 |
+
self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 79 |
+
|
| 80 |
+
self.sample_drop_ratio = drop_path
|
| 81 |
+
|
| 82 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 83 |
+
def attn_residual_func(x: Tensor) -> Tensor:
|
| 84 |
+
return self.ls1(self.attn(self.norm1(x)))
|
| 85 |
+
|
| 86 |
+
def ffn_residual_func(x: Tensor) -> Tensor:
|
| 87 |
+
return self.ls2(self.mlp(self.norm2(x)))
|
| 88 |
+
|
| 89 |
+
if self.training and self.sample_drop_ratio > 0.1:
|
| 90 |
+
# the overhead is compensated only for a drop path rate larger than 0.1
|
| 91 |
+
x = drop_add_residual_stochastic_depth(
|
| 92 |
+
x,
|
| 93 |
+
residual_func=attn_residual_func,
|
| 94 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 95 |
+
)
|
| 96 |
+
x = drop_add_residual_stochastic_depth(
|
| 97 |
+
x,
|
| 98 |
+
residual_func=ffn_residual_func,
|
| 99 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 100 |
+
)
|
| 101 |
+
elif self.training and self.sample_drop_ratio > 0.0:
|
| 102 |
+
x = x + self.drop_path1(attn_residual_func(x))
|
| 103 |
+
x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
|
| 104 |
+
else:
|
| 105 |
+
x = x + attn_residual_func(x)
|
| 106 |
+
x = x + ffn_residual_func(x)
|
| 107 |
+
return x
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def drop_add_residual_stochastic_depth(
|
| 111 |
+
x: Tensor,
|
| 112 |
+
residual_func: Callable[[Tensor], Tensor],
|
| 113 |
+
sample_drop_ratio: float = 0.0,
|
| 114 |
+
) -> Tensor:
|
| 115 |
+
# 1) extract subset using permutation
|
| 116 |
+
b, n, d = x.shape
|
| 117 |
+
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
|
| 118 |
+
brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
|
| 119 |
+
x_subset = x[brange]
|
| 120 |
+
|
| 121 |
+
# 2) apply residual_func to get residual
|
| 122 |
+
residual = residual_func(x_subset)
|
| 123 |
+
|
| 124 |
+
x_flat = x.flatten(1)
|
| 125 |
+
residual = residual.flatten(1)
|
| 126 |
+
|
| 127 |
+
residual_scale_factor = b / sample_subset_size
|
| 128 |
+
|
| 129 |
+
# 3) add the residual
|
| 130 |
+
x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
|
| 131 |
+
return x_plus_residual.view_as(x)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def get_branges_scales(x, sample_drop_ratio=0.0):
|
| 135 |
+
b, n, d = x.shape
|
| 136 |
+
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
|
| 137 |
+
brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
|
| 138 |
+
residual_scale_factor = b / sample_subset_size
|
| 139 |
+
return brange, residual_scale_factor
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
|
| 143 |
+
if scaling_vector is None:
|
| 144 |
+
x_flat = x.flatten(1)
|
| 145 |
+
residual = residual.flatten(1)
|
| 146 |
+
x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
|
| 147 |
+
else:
|
| 148 |
+
x_plus_residual = scaled_index_add(
|
| 149 |
+
x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
|
| 150 |
+
)
|
| 151 |
+
return x_plus_residual
|
attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors


def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs


class NestedTensorBlock(Block):
    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
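Editor's note: a minimal sketch of the per-sample stochastic depth idea that `drop_add_residual_stochastic_depth_list` above implements, written in plain PyTorch so it runs without xFormers or the helpers (`get_branges_scales`, `add_residual`) defined earlier in this file. The function and variable names below are illustrative, not part of the repository.

```python
# Sketch only: keep a random subset of the batch, run the residual branch on that
# subset, and rescale the kept residuals so the expected update stays unchanged.
import torch

def toy_stochastic_depth_residual(x: torch.Tensor, residual_func, sample_drop_ratio: float = 0.5):
    b = x.shape[0]
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = torch.randperm(b, device=x.device)[:sample_subset_size]  # surviving sample indices
    residual = residual_func(x[brange])                               # branch runs on the subset only
    scale = b / sample_subset_size                                    # compensate for dropped samples
    return torch.index_add(x, 0, brange, residual, alpha=scale)

x = torch.randn(8, 197, 384)  # (batch, tokens, dim)
out = toy_stochastic_depth_residual(x, torch.nn.Linear(384, 384), sample_drop_ratio=0.5)
print(out.shape)  # torch.Size([8, 197, 384])
```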
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/drop_path.py
ADDED
@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py


from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)
    output = x * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
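Editor's note: a small illustration (plain PyTorch, toy numbers assumed) of the mask that `drop_path()` above builds: each sample keeps its residual with probability `keep_prob`, and survivors are scaled by `1/keep_prob` so the expectation is preserved.

```python
import torch

torch.manual_seed(0)
x = torch.ones(10000, 1)        # one "residual" value per sample
keep_prob = 0.9                 # i.e. drop_prob = 0.1
mask = x.new_empty(x.shape[0], 1).bernoulli_(keep_prob).div_(keep_prob)
out = x * mask

print((out == 0).float().mean())  # ~0.1 of samples have their residual dropped
print(out.mean())                 # ~1.0: the expected value is unchanged
```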
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/layer_scale.py
ADDED
@@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110

from typing import Union

import torch
from torch import Tensor
from torch import nn


class LayerScale(nn.Module):
    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma
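Editor's note: a minimal sketch (plain PyTorch, toy dimensions) of what LayerScale does inside a residual branch: a learnable per-channel gain, initialised near zero, so each block starts close to the identity and learns how much of its branch to mix in. The `branch` module is a stand-in, not the repository's attention or MLP.

```python
import torch
from torch import nn

dim = 8
gamma = nn.Parameter(1e-5 * torch.ones(dim))  # same initialisation as LayerScale above
branch = nn.Linear(dim, dim)                  # stand-in for an attention / MLP branch

x = torch.randn(2, 4, dim)                    # (batch, tokens, dim)
out = x + gamma * branch(x)                   # residual update is heavily damped at init

print(torch.allclose(out, x, atol=1e-3))      # True: near-identity at initialisation
```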
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/mlp.py
ADDED
@@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py


from typing import Callable, Optional

from torch import Tensor, nn


class Mlp(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
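Editor's note: a usage sketch showing that the Mlp above acts independently on each token and preserves the sequence shape. The sizes and the 4x expansion ratio are assumed (typical for ViT blocks), and the class is restated inline so the snippet is self-contained.

```python
import torch
from torch import nn

class TinyMlp(nn.Module):
    # Same expand-then-project structure as Mlp above, restated for a runnable snippet.
    def __init__(self, dim: int, hidden: int):
        super().__init__()
        self.fc1, self.act, self.fc2 = nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim)

    def forward(self, x):
        return self.fc2(self.act(self.fc1(x)))

tokens = torch.randn(2, 197, 384)        # (batch, 1 CLS + 196 patch tokens, embed dim)
mlp = TinyMlp(dim=384, hidden=4 * 384)   # assumed 4x expansion
print(mlp(tokens).shape)                 # torch.Size([2, 197, 384]) — per-token, shape-preserving
```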
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/patch_embed.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

from typing import Callable, Optional, Tuple, Union

from torch import Tensor
import torch.nn as nn


def make_2tuple(x):
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops
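Editor's note: a minimal sketch (plain PyTorch, ViT-B-like sizes assumed) of the patch-embedding arithmetic above: a stride-16 convolution turns a 224x224 image into a 14x14 grid of patch tokens, i.e. 196 tokens of dimension `embed_dim`.

```python
import torch
from torch import nn

proj = torch.nn.Conv2d(3, 768, kernel_size=16, stride=16)  # same projection as PatchEmbed.proj
img = torch.randn(1, 3, 224, 224)

feat = proj(img)                          # (1, 768, 14, 14)
tokens = feat.flatten(2).transpose(1, 2)  # (1, 196, 768): B x HW x C, as in forward()
print(feat.shape, tokens.shape)
```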
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/swiglu_ffn.py
ADDED
@@ -0,0 +1,63 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Callable, Optional

from torch import Tensor, nn
import torch.nn.functional as F


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        x12 = self.w12(x)
        x1, x2 = x12.chunk(2, dim=-1)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)


try:
    from xformers.ops import SwiGLU

    XFORMERS_AVAILABLE = True
except ImportError:
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False


class SwiGLUFFNFused(SwiGLU):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=out_features,
            bias=bias,
        )
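Editor's note: an illustration (plain PyTorch, toy sizes assumed) of the SwiGLU gating used above: a single fused Linear produces `2 * hidden` features, which are split into a SiLU-activated half and a gate half. The `2/3` scaling with rounding to a multiple of 8 in `SwiGLUFFNFused` keeps the parameter count comparable to a standard MLP of the same nominal width.

```python
import torch
import torch.nn.functional as F
from torch import nn

dim, hidden = 8, 16
w12 = nn.Linear(dim, 2 * hidden)   # fused projection, as in SwiGLUFFN.w12
w3 = nn.Linear(hidden, dim)        # output projection, as in SwiGLUFFN.w3

x = torch.randn(2, 5, dim)
x1, x2 = w12(x).chunk(2, dim=-1)   # split into value and gate halves
out = w3(F.silu(x1) * x2)          # SwiGLU: silu(x1) gated by x2
print(out.shape)                   # torch.Size([2, 5, 8])
```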
depth_anything_v2_metric/depth_anything_v2/dinov3/.docstr.yaml
ADDED
@@ -0,0 +1,6 @@
paths:
  - dinov3
exclude: dinov3/tests
skip_init: True
skip_private: True
fail_under: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/.github/workflows/lint.yaml
ADDED
@@ -0,0 +1,47 @@
name: Lint

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  run-linters:
    name: Run linters
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: 3.11
          cache: 'pip'
          cache-dependency-path: '**/requirements*.txt'
      - name: Install Python (development) dependencies
        run: |
          pip install -r requirements-dev.txt
      - name: Run ruff (linter)
        run: |
          ruff check dinov3
      - name: Run ruff (formatter)
        if: always()
        run: |
          ruff format --diff dinov3
      - name: Report docstring coverage
        if: always()
        run: |
          docstr-coverage dinov3
      - name: Run mypy
        if: always()
        run: |
          mypy --txt-report .
          [ -f index.txt ] && cat index.txt
      - name: Run pylint
        if: always()
        run: |
          pylint --exit-zero dinov3
depth_anything_v2_metric/depth_anything_v2/dinov3/.gitignore
ADDED
@@ -0,0 +1,18 @@
build/
dist/
*.egg-info/
**/__pycache__/

**/.ipynb_checkpoints
**/.ipynb_checkpoints/**

**/notebooks

# Ignore shell scripts
*.sh

# Ignore swap files
*.swp

# Ignore vscode directory
.vscode/
depth_anything_v2_metric/depth_anything_v2/dinov3/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,80 @@
# Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.

This Code of Conduct also applies outside the project spaces when there is a
reasonable belief that an individual's behavior may have a negative impact on
the project or its community.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <[email protected]>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
depth_anything_v2_metric/depth_anything_v2/dinov3/CONTRIBUTING.md
ADDED
@@ -0,0 +1,31 @@
# Contributing to DINOv3
We want to make contributing to this project as easy and transparent as
possible.

## Pull Requests
We actively welcome your pull requests.

1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").

## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Meta's open source projects.

Complete your CLA here: <https://code.facebook.com/cla>

## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.

Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.

## License
By contributing to DINOv3, you agree that your contributions will be licensed
under the LICENSE.md file in the root directory of this source tree.
depth_anything_v2_metric/depth_anything_v2/dinov3/LICENSE.md
ADDED
|
@@ -0,0 +1,66 @@
| 1 |
+
# DINOv3 License
|
| 2 |
+
|
| 3 |
+
*Last Updated: August 19, 2025*
|
| 4 |
+
|
| 5 |
+
**“Agreement”** means the terms and conditions for use, reproduction, distribution and modification of the DINO Materials set forth herein.
|
| 6 |
+
|
| 7 |
+
**“DINO Materials”** means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, and other elements of the foregoing distributed by Meta and made available under this Agreement.
|
| 8 |
+
|
| 9 |
+
**“Documentation”** means the specifications, manuals and documentation accompanying
|
| 10 |
+
DINO Materials distributed by Meta.
|
| 11 |
+
|
| 12 |
+
**“Licensee”** or **“you”** means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
|
| 13 |
+
|
| 14 |
+
**“Meta”** or **“we”** means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) or Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
|
| 15 |
+
|
| 16 |
+
**“Sanctions”** means any economic or trade sanctions or restrictions administered or enforced by the United States (including the Office of Foreign Assets Control of the U.S. Department of the Treasury (“OFAC”), the U.S. Department of State and the U.S. Department of Commerce), the United Nations, the European Union, or the United Kingdom.
|
| 17 |
+
|
| 18 |
+
**“Trade Controls”** means any of the following: Sanctions and applicable export and import controls.
|
| 19 |
+
|
| 20 |
+
By clicking “I Accept” below or by using or distributing any portion or element of the DINO Materials, you agree to be bound by this Agreement.
|
| 21 |
+
|
| 22 |
+
## 1. License Rights and Redistribution.
|
| 23 |
+
|
| 24 |
+
a. <ins>Grant of Rights</ins>. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the DINO Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the DINO Materials.
|
| 25 |
+
|
| 26 |
+
b. <ins>Redistribution and Use</ins>.
|
| 27 |
+
|
| 28 |
+
i. Distribution of DINO Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the DINO Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement and you shall provide a copy of this Agreement with any such DINO Materials.
|
| 29 |
+
|
| 30 |
+
ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with DINO Materials, you must acknowledge the use of DINO Materials in your publication.
|
| 31 |
+
|
| 32 |
+
iii. Your use of the DINO Materials must comply with applicable laws and regulations, including Trade Control Laws and applicable privacy and data protection laws.
|
| 33 |
+
|
| 34 |
+
iv. Your use of the DINO Materials will not involve or encourage others to reverse engineer, decompile or discover the underlying components of the DINO Materials.
|
| 35 |
+
|
| 36 |
+
v. You are not the target of Trade Controls and your use of DINO Materials must comply with Trade Controls. You agree not to use, or permit others to use, DINO Materials for any activities subject to the International Traffic in Arms Regulations (ITAR) or end uses prohibited by Trade Controls, including those related to military or warfare purposes, nuclear industries or applications, espionage, or the development or use of guns or illegal weapons.
|
| 37 |
+
|
| 38 |
+
## 2. User Support.
|
| 39 |
+
|
| 40 |
+
Your use of the DINO Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the DINO Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
|
| 41 |
+
|
| 42 |
+
## 3. Disclaimer of Warranty.
|
| 43 |
+
|
| 44 |
+
UNLESS REQUIRED BY APPLICABLE LAW, THE DINO MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE DINO MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE DINO MATERIALS AND ANY OUTPUT AND RESULTS.
|
| 45 |
+
|
| 46 |
+
## 4. Limitation of Liability.
|
| 47 |
+
|
| 48 |
+
IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
|
| 49 |
+
|
| 50 |
+
## 5. Intellectual Property.
|
| 51 |
+
|
| 52 |
+
a. Subject to Meta’s ownership of DINO Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the DINO Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
|
| 53 |
+
|
| 54 |
+
b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the DINO Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the DINO Materials.
|
| 55 |
+
|
| 56 |
+
## 6. Term and Termination.
|
| 57 |
+
|
| 58 |
+
The term of this Agreement will commence upon your acceptance of this Agreement or access to the DINO Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the DINO Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
|
| 59 |
+
|
| 60 |
+
## 7. Governing Law and Jurisdiction.
|
| 61 |
+
|
| 62 |
+
This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
|
| 63 |
+
|
| 64 |
+
## 8. Modifications and Amendments.
|
| 65 |
+
|
| 66 |
+
Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the DINO Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
|
depth_anything_v2_metric/depth_anything_v2/dinov3/MODEL_CARD.md
ADDED
|
@@ -0,0 +1,432 @@
| 1 |
+
# Model Card for DINOv3
|
| 2 |
+
|
| 3 |
+
DINOv3 is a family of versatile vision foundation models that outperforms the specialized state of the art across a broad range of settings, without fine-tuning. DINOv3 produces high-quality dense features that achieve outstanding performance on various vision tasks, significantly surpassing previous self- and weakly-supervised foundation models.
|
| 4 |
+
|
| 5 |
+
## Model Details
|
| 6 |
+
|
| 7 |
+
These are Vision Transformer and ConvNeXt models trained following the method described in the DINOv3 paper. 12 models are provided:
|
| 8 |
+
|
| 9 |
+
- 10 models pretrained on web data (LVD-1689M dataset)
|
| 10 |
+
- 1 ViT-7B trained from scratch,
|
| 11 |
+
- 5 ViT-S/S+/B/L/H+ models distilled from the ViT-7B,
|
| 12 |
+
- 4 ConvNeXt-{T/S/B/L} models distilled from the ViT-7B,
|
| 13 |
+
- 2 models pretrained on satellite data (SAT-493M dataset)
|
| 14 |
+
- 1 ViT-7B trained from scratch
|
| 15 |
+
- 1 ViT-L distilled from the ViT-7B
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
Each Transformer-based model takes an image as input and returns a class token, patch tokens (and register tokens). These models follow a ViT architecture, with a patch size of 16. For a 224x224 image, this results in 1 class token + 4 register tokens + 196 patch tokens = 201 tokens (for DINOv2 with registers this resulted in 1 + 4 + 256 = 261 tokens).
|
| 19 |
+
|
| 20 |
+
The models can accept larger images provided the image shapes are multiples of the patch size (16). If this condition is not verified, the model will crop to the closest smaller multiple of the patch size.
|
| 21 |
+
|
| 22 |
+
### Model Description
|
| 23 |
+
|
| 24 |
+
- **Developed by:** Meta AI
|
| 25 |
+
- **Model type:** Vision Transformer, ConvNeXt
|
| 26 |
+
- **License:** [DINOv3 License](https://ai.meta.com/resources/models-and-libraries/dinov3-license/)
|
| 27 |
+
|
| 28 |
+
### Model Sources
|
| 29 |
+
|
| 30 |
+
- **Repository:** [https://github.com/facebookresearch/dinov3](https://github.com/facebookresearch/dinov3)
|
| 31 |
+
- **Paper:** [https://arxiv.org/abs/2508.10104](https://arxiv.org/abs/2508.10104)
|
| 32 |
+
|
| 33 |
+
## Uses
|
| 34 |
+
|
| 35 |
+
The models are vision backbones providing multi-purpose features for downstream tasks.
|
| 36 |
+
|
| 37 |
+
### Direct Use
|
| 38 |
+
|
| 39 |
+
The models can be used without fine-tuning, with downstream classifiers as simple as linear layers, to obtain competitive results:
|
| 40 |
+
|
| 41 |
+
- on image classification, using k-NN classifiers on the class token
|
| 42 |
+
- on image classification, with logistic regression classifiers applied on the class token
|
| 43 |
+
- on image classification, with a linear layer applied on the class token and the average of the patch tokens
|
| 44 |
+
- on image retrieval using nearest neighbors
|
| 45 |
+
- on geometric and semantic 3D keypoint correspondances
|
| 46 |
+
- on depth estimation, semantic segmentation, using linear layers
|
| 47 |
+
- on unsupervised object discovery
|
| 48 |
+
- on video segmentation tracking
|
| 49 |
+
- on video classification, using a small 4-layer attentive probe
|
| 50 |
+
|
| 51 |
+
### Downstream Use
|
| 52 |
+
|
| 53 |
+
While fine-tuning the models can yield some gains, it is recommended to keep this option as a last resort: the frozen features are expected to provide good performance out-of-the-box.
|
| 54 |
+
|
| 55 |
+
## Bias, Risks, and Limitations
|
| 56 |
+
|
| 57 |
+
Compared to DINOv2 and SEERv2, DINOv3 delivers somewhat consistent performance across income categories on geographical fairness and diversity, although with a notable performance drop in the low-income bucket compared to the highest-income bucket.
|
| 58 |
+
|
| 59 |
+
DINOv3 also achieves relatively good scores across different regions, improving over its predecessor DINOv2. However, a relative difference is still observed between Europe and Africa.
|
| 60 |
+
|
| 61 |
+
### Recommendations
|
| 62 |
+
|
| 63 |
+
Fine-tuning is expected to increase the biases in the features produced by the model as they will be tuned to the fine-tuning labels.
|
| 64 |
+
|
| 65 |
+
## How to Get Started with the Model
|
| 66 |
+
|
| 67 |
+
Use the code below to get started with the model.
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
import torch
|
| 71 |
+
|
| 72 |
+
model = torch.hub.load(
|
| 73 |
+
repo_or_dir='facebookresearch/dinov3',
|
| 74 |
+
model='<MODEL_NAME>',
|
| 75 |
+
weights='<PATH/OR/URL/TO/CHECKPOINT>',
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
# where MODEL_NAME can be one of:
|
| 79 |
+
# - dinov3_vits16
|
| 80 |
+
# - dinov3_vits16plus
|
| 81 |
+
# - dinov3_vitb16
|
| 82 |
+
# - dinov3_vitl16
|
| 83 |
+
# - dinov3_vith16plus
|
| 84 |
+
# - dinov3_vit7b16
|
| 85 |
+
# - dinov3_convnext_tiny
|
| 86 |
+
# - dinov3_convnext_small
|
| 87 |
+
# - dinov3_convnext_base
|
| 88 |
+
# - dinov3_convnext_large
|
| 89 |
+
|
| 90 |
+
# For instance
|
| 91 |
+
dinov3_vits16 = torch.hub.load(
|
| 92 |
+
repo_or_dir='facebookresearch/dinov3',
|
| 93 |
+
model='dinov3_vits16',
|
| 94 |
+
weights='<PATH/OR/URL/TO/DINOV3/VITS16/LVD1689M/CHECKPOINT>',
|
| 95 |
+
)
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
## Training Details
|
| 99 |
+
|
| 100 |
+
### Training Data
|
| 101 |
+
|
| 102 |
+
- Web dataset (LVD-1689M): a curated dataset of 1,689 millions of images extracted from a large data
|
| 103 |
+
pool of 17 billions web images collected from public posts on Instagram
|
| 104 |
+
|
| 105 |
+
- Satellite dataset (SAT-493M): a dataset of 493 millions of 512x512 images sampled randomly from Maxar RGB ortho-rectified imagery at 0.6 meter resolution
|
| 106 |
+
|
| 107 |
+
### Training Procedure
|
| 108 |
+
|
| 109 |
+
**Training objective:**
|
| 110 |
+
|
| 111 |
+
- DINO self-distillation loss with multi-crop
|
| 112 |
+
- iBOT masked-image modeling loss
|
| 113 |
+
- KoLeo regularization on [CLS] tokens
|
| 114 |
+
- Gram anchoring
|
| 115 |
+
|
| 116 |
+
- **Training regime:** PyTorch FSDP2 (with bf16 and fp8 matrix multiplications)
|
| 117 |
+
|
| 118 |
+
**Distillation:**
|
| 119 |
+
|
| 120 |
+
- Distillation follows the standard DINOv3 pretraining procedure, except the teacher is a frozen pretrained ViT-7B.
|
| 121 |
+
|
| 122 |
+
## Evaluation
|
| 123 |
+
|
| 124 |
+
**Results**
|
| 125 |
+
|
| 126 |
+
The reader is referred to the associated paper for details on the evaluation protocols
|
| 127 |
+
|
| 128 |
+
*Results for ViT backbones pretrained (or distilled) on web (LVD-1689M)*
|
| 129 |
+
|
| 130 |
+
<table>
|
| 131 |
+
<tr>
|
| 132 |
+
<th></th>
|
| 133 |
+
<!-- <th></th> -->
|
| 134 |
+
<th colspan="4">Global Tasks</th>
|
| 135 |
+
<th colspan="5">Dense Tasks</th>
|
| 136 |
+
</tr>
|
| 137 |
+
<tr>
|
| 138 |
+
<th>Model</th>
|
| 139 |
+
<!-- <th>Dataset</th> -->
|
| 140 |
+
<th>IN-ReaL</th>
|
| 141 |
+
<th>IN-R</th>
|
| 142 |
+
<th>Obj.Net</th>
|
| 143 |
+
<th>Ox.-H</th>
|
| 144 |
+
<th>ADE20k</th>
|
| 145 |
+
<th>NYU↓</th>
|
| 146 |
+
<th>DAVIS</th>
|
| 147 |
+
<th>NAVI</th>
|
| 148 |
+
<th>SPair</th>
|
| 149 |
+
</tr>
|
| 150 |
+
<tr>
|
| 151 |
+
<td>DINOv3 ViT-S/16</td>
|
| 152 |
+
<!-- <td>LVD-1689M</td> -->
|
| 153 |
+
<td align="right">87.0</td>
|
| 154 |
+
<td align="right">60.4</td>
|
| 155 |
+
<td align="right">50.9</td>
|
| 156 |
+
<td align="right">49.5</td>
|
| 157 |
+
<td align="right">47.0</td>
|
| 158 |
+
<td align="right">0.403</td>
|
| 159 |
+
<td align="right">72.7</td>
|
| 160 |
+
<td align="right">56.3</td>
|
| 161 |
+
<td align="right">50.4</td>
|
| 162 |
+
</tr>
|
| 163 |
+
<tr>
|
| 164 |
+
<td>DINOv3 ViT-S+/16</td>
|
| 165 |
+
<!-- <td>LVD-1689M</td> -->
|
| 166 |
+
<td align="right">88.0</td>
|
| 167 |
+
<td align="right">68.8</td>
|
| 168 |
+
<td align="right">54.6</td>
|
| 169 |
+
<td align="right">50.0</td>
|
| 170 |
+
<td align="right">48.8</td>
|
| 171 |
+
<td align="right">0.399</td>
|
| 172 |
+
<td align="right">75.5</td>
|
| 173 |
+
<td align="right">57.1</td>
|
| 174 |
+
<td align="right">55.2</td>
|
| 175 |
+
</tr>
|
| 176 |
+
<tr>
|
| 177 |
+
<td>DINOv3 ViT-B/16</td>
|
| 178 |
+
<!-- <td>LVD-1689M</td> -->
|
| 179 |
+
<td align="right">89.3</td>
|
| 180 |
+
<td align="right">76.7</td>
|
| 181 |
+
<td align="right">64.1</td>
|
| 182 |
+
<td align="right">58.5</td>
|
| 183 |
+
<td align="right">51.8</td>
|
| 184 |
+
<td align="right">0.373</td>
|
| 185 |
+
<td align="right">77.2</td>
|
| 186 |
+
<td align="right">58.8</td>
|
| 187 |
+
<td align="right">57.2</td>
|
| 188 |
+
</tr>
|
| 189 |
+
<tr>
|
| 190 |
+
<td>DINOv3 ViT-L/16</td>
|
| 191 |
+
<!-- <td>LVD-1689M</td> -->
|
| 192 |
+
<td align="right">90.2</td>
|
| 193 |
+
<td align="right">88.1</td>
|
| 194 |
+
<td align="right">74.8</td>
|
| 195 |
+
<td align="right">63.1</td>
|
| 196 |
+
<td align="right">54.9</td>
|
| 197 |
+
<td align="right">0.352</td>
|
| 198 |
+
<td align="right">79.9</td>
|
| 199 |
+
<td align="right">62.3</td>
|
| 200 |
+
<td align="right">61.3</td>
|
| 201 |
+
</tr>
|
| 202 |
+
<tr>
|
| 203 |
+
<td>DINOv3 ViT-H+/16</td>
|
| 204 |
+
<!-- <td>LVD-1689M</td> -->
|
| 205 |
+
<td align="right">90.3</td>
|
| 206 |
+
<td align="right">90.0</td>
|
| 207 |
+
<td align="right">78.6</td>
|
| 208 |
+
<td align="right">64.5</td>
|
| 209 |
+
<td align="right">54.8</td>
|
| 210 |
+
<td align="right">0.352</td>
|
| 211 |
+
<td align="right">79.3</td>
|
| 212 |
+
<td align="right">63.3</td>
|
| 213 |
+
<td align="right">56.3</td>
|
| 214 |
+
</tr>
|
| 215 |
+
<tr>
|
| 216 |
+
<td>DINOv3 ViT-7B/16</td>
|
| 217 |
+
<!-- <td>LVD-1689M</td> -->
|
| 218 |
+
<td align="right">90.4</td>
|
| 219 |
+
<td align="right">91.1</td>
|
| 220 |
+
<td align="right">91.1</td>
|
| 221 |
+
<td align="right">72.8</td>
|
| 222 |
+
<td align="right">55.9</td>
|
| 223 |
+
<td align="right">0.309</td>
|
| 224 |
+
<td align="right">79.7</td>
|
| 225 |
+
<td align="right">64.4</td>
|
| 226 |
+
<td align="right">58.7</td>
|
| 227 |
+
</tr>
|
| 228 |
+
</table>
|
| 229 |
+
|
| 230 |
+
*Results for ConvNeXt backbones distilled on web (LVD-1689M)*
|
| 231 |
+
|
| 232 |
+
<table>
|
| 233 |
+
<tr>
|
| 234 |
+
<th></th>
|
| 235 |
+
<th colspan="6">Global Tasks</th>
|
| 236 |
+
<th colspan="2">Dense Tasks</th>
|
| 237 |
+
</tr>
|
| 238 |
+
<tr>
|
| 239 |
+
<th>Model</th>
|
| 240 |
+
<th colspan="2">IN-ReaL</th>
|
| 241 |
+
<th colspan="2">IN-R</th>
|
| 242 |
+
<th colspan="2">Obj.Net</th>
|
| 243 |
+
<th>ADE20k</th>
|
| 244 |
+
<th>NYU↓</th>
|
| 245 |
+
</tr>
|
| 246 |
+
<tr>
|
| 247 |
+
<td></th>
|
| 248 |
+
<td>@256px</td>
|
| 249 |
+
<td>@512px</td>
|
| 250 |
+
<td>@256px</td>
|
| 251 |
+
<td>@512px</td>
|
| 252 |
+
<td>@256px</td>
|
| 253 |
+
<td>@512px</td>
|
| 254 |
+
<td colspan="2"></td>
|
| 255 |
+
</tr>
|
| 256 |
+
<tr>
|
| 257 |
+
<td>DINOv3 ConvNeXt Tiny</td>
|
| 258 |
+
<td align="right">86.6</td>
|
| 259 |
+
<td align="right">87.7</td>
|
| 260 |
+
<td align="right">73.7</td>
|
| 261 |
+
<td align="right">74.1</td>
|
| 262 |
+
<td align="right">52.6</td>
|
| 263 |
+
<td align="right">58.7</td>
|
| 264 |
+
<td align="right">42.7</td>
|
| 265 |
+
<td align="right">0.448</td>
|
| 266 |
+
</tr>
|
| 267 |
+
<tr>
|
| 268 |
+
<td>DINOv3 ConvNeXt Small</td>
|
| 269 |
+
<td align="right">87.9</td>
|
| 270 |
+
<td align="right">88.7</td>
|
| 271 |
+
<td align="right">73.7</td>
|
| 272 |
+
<td align="right">74.1</td>
|
| 273 |
+
<td align="right">52.6</td>
|
| 274 |
+
<td align="right">58.7</td>
|
| 275 |
+
<td align="right">44.8</td>
|
| 276 |
+
<td align="right">0.432</td>
|
| 277 |
+
</tr>
|
| 278 |
+
<tr>
|
| 279 |
+
<td>DINOv3 ConvNeXt Base</td>
|
| 280 |
+
<td align="right">88.5</td>
|
| 281 |
+
<td align="right">89.2</td>
|
| 282 |
+
<td align="right">77.2</td>
|
| 283 |
+
<td align="right">78.2</td>
|
| 284 |
+
<td align="right">56.2</td>
|
| 285 |
+
<td align="right">61.3</td>
|
| 286 |
+
<td align="right">46.3</td>
|
| 287 |
+
<td align="right">0.420</td>
|
| 288 |
+
</tr>
|
| 289 |
+
<tr>
|
| 290 |
+
<td>DINOv3 ConvNeXt Large</td>
|
| 291 |
+
<td align="right">88.9</td>
|
| 292 |
+
<td align="right">89.4</td>
|
| 293 |
+
<td align="right">81.3</td>
|
| 294 |
+
<td align="right">82.4</td>
|
| 295 |
+
<td align="right">59.3</td>
|
| 296 |
+
<td align="right">65.2</td>
|
| 297 |
+
<td align="right">47.8</td>
|
| 298 |
+
<td align="right">0.403</td>
|
| 299 |
+
</tr>
|
| 300 |
+
</table>
|
| 301 |
+
|
| 302 |
+
*Results for ViT backbones pretrained (or distilled) on satellite (SAT-493M)*
|
| 303 |
+
|
| 304 |
+
<table>
|
| 305 |
+
<tr>
|
| 306 |
+
<th></th>
|
| 307 |
+
<th colspan="7">(GEO-Bench) Classification</th>
|
| 308 |
+
</tr>
|
| 309 |
+
<tr>
|
| 310 |
+
<th>Model</ht>
|
| 311 |
+
<th>m-BEnet</th>
|
| 312 |
+
<th>m-brick-kiln
|
| 313 |
+
<th>m-eurosat</th>
|
| 314 |
+
<th>m-forestnet</th>
|
| 315 |
+
<th>m-pv4ger</th>
|
| 316 |
+
<th>m-so2sat</th>
|
| 317 |
+
<th>mean</th>
|
| 318 |
+
</tr>
|
| 319 |
+
<tr>
|
| 320 |
+
<td>DINOv3 ViT-L/16</td>
|
| 321 |
+
<td>73.0</td>
|
| 322 |
+
<td>96.5</td>
|
| 323 |
+
<td>94.1</td>
|
| 324 |
+
<td>60.6</td>
|
| 325 |
+
<td>96.0</td>
|
| 326 |
+
<td>57.4</td>
|
| 327 |
+
<td>79.6</td>
|
| 328 |
+
</tr>
|
| 329 |
+
<tr>
|
| 330 |
+
<td>DINOv3 ViT-7B/16</td>
|
| 331 |
+
<td>74.0</td>
|
| 332 |
+
<td>97.2</td>
|
| 333 |
+
<td>94.8</td>
|
| 334 |
+
<td>62.3</td>
|
| 335 |
+
<td>96.1</td>
|
| 336 |
+
<td>62.1</td>
|
| 337 |
+
<td>81.1</td>
|
| 338 |
+
</tr>
|
| 339 |
+
<tr>
|
| 340 |
+
<th></th>
|
| 341 |
+
<th colspan="7">(GEO-Bench) Segmentation</th>
|
| 342 |
+
</tr>
|
| 343 |
+
<tr>
|
| 344 |
+
<th>Model</th>
|
| 345 |
+
<th>m-cashew</th>
|
| 346 |
+
<th>m-chesapeake</th>
|
| 347 |
+
<th>m-NeonTree</th>
|
| 348 |
+
<th>m-nz-cattle</th>
|
| 349 |
+
<th>m-pv4ger-seg</th>
|
| 350 |
+
<th>m-SA-crop</th>
|
| 351 |
+
<th>mean</th>
|
| 352 |
+
</tr>
|
| 353 |
+
<tr>
|
| 354 |
+
<td>DINOv3 ViT-L/16</td>
|
| 355 |
+
<td>94.2</td>
|
| 356 |
+
<td>75.6</td>
|
| 357 |
+
<td>61.8</td>
|
| 358 |
+
<td>83.7</td>
|
| 359 |
+
<td>95.2</td>
|
| 360 |
+
<td>36.8</td>
|
| 361 |
+
<td>74.5</td>
|
| 362 |
+
</tr>
|
| 363 |
+
<tr>
|
| 364 |
+
<td>DINOv3 ViT-7B/16</td>
|
| 365 |
+
<td>94.1</td>
|
| 366 |
+
<td>76.6</td>
|
| 367 |
+
<td>62.6</td>
|
| 368 |
+
<td>83.4</td>
|
| 369 |
+
<td>95.5</td>
|
| 370 |
+
<td>37.6</td>
|
| 371 |
+
<td>75.0</td>
|
| 372 |
+
</tr>
|
| 373 |
+
</table>
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
## Environmental Impact
|
| 377 |
+
|
| 378 |
+
- **Hardware Type:** Nvidia H100
|
| 379 |
+
- **Hours used:** 61,440 hours for ViT-7B model training
|
| 380 |
+
- **Cloud Provider:** Private infrastructure
|
| 381 |
+
- **Compute Region:** USA
|
| 382 |
+
- **Carbon Emitted:** 18t CO2eq
|
| 383 |
+
|
| 384 |
+
## Technical Specifications
|
| 385 |
+
|
| 386 |
+
### Model Architecture and Objective
|
| 387 |
+
|
| 388 |
+
Vision Transformer models:
|
| 389 |
+
|
| 390 |
+
- ViT-S (21M parameters): patch size 16, embedding dimension 384, 4 register tokens, 6 heads, MLP FFN, RoPE
|
| 391 |
+
- ViT-S+ (29M parameters): patch size 16, embedding dimension 384, 4 register tokens, 6 heads, SwiGLU FFN, RoPE
|
| 392 |
+
- ViT-B (86M parameters): patch size 16, embedding dimension 768, 4 register tokens, 12 heads, MLP FFN, RoPE
|
| 393 |
+
- ViT-L (300M parameters): patch size 16, embedding dimension 1024, 4 register tokens, 16 heads, MLP FFN, RoPE
|
| 394 |
+
- ViT-H+ (840M parameters): patch size 16, embedding dimension 1280, 4 register tokens, 20 heads, SwiGLU FFN, RoPE
|
| 395 |
+
- ViT-7B (6716M parameters): patch size 16, embedding dimension 4096, 4 register tokens, 32 heads, SwiGLU FFN, RoPE
|
| 396 |
+
|
| 397 |
+
ConvNeXt models:
|
| 398 |
+
|
| 399 |
+
- ConvNeXt Tiny (29M parameters)
|
| 400 |
+
- ConvNeXt Small (50M parameters)
|
| 401 |
+
- ConvNeXt Base (89M parameters)
|
| 402 |
+
- ConvNeXt Large (198M parameters)
|
| 403 |
+
|
| 404 |
+
### Compute Infrastructure
|
| 405 |
+
|
| 406 |
+
#### Hardware
|
| 407 |
+
|
| 408 |
+
Nvidia H100 GPUs
|
| 409 |
+
|
| 410 |
+
#### Software
|
| 411 |
+
|
| 412 |
+
PyTorch 2.7
|
| 413 |
+
|
| 414 |
+
## More Information
|
| 415 |
+
|
| 416 |
+
See the [blog post](https://ai.meta.com/blog/dinov3-self-supervised-vision-model/) and the associated [website](https://ai.meta.com/dinov3/).
|
| 417 |
+
|
| 418 |
+
## Citation
|
| 419 |
+
|
| 420 |
+
**BibTeX**
|
| 421 |
+
|
| 422 |
+
```
|
| 423 |
+
@misc{simeoni2025dinov3,
|
| 424 |
+
title={{DINOv3}},
|
| 425 |
+
author={Sim{\'e}oni, Oriane and Vo, Huy V. and Seitzer, Maximilian and Baldassarre, Federico and Oquab, Maxime and Jose, Cijo and Khalidov, Vasil and Szafraniec, Marc and Yi, Seungeun and Ramamonjisoa, Micha{\"e}l and Massa, Francisco and Haziza, Daniel and Wehrstedt, Luca and Wang, Jianyuan and Darcet, Timoth{\'e}e and Moutakanni, Th{\'e}o and Sentana, Leonel and Roberts, Claire and Vedaldi, Andrea and Tolan, Jamie and Brandt, John and Couprie, Camille and Mairal, Julien and J{\'e}gou, Herv{\'e} and Labatut, Patrick and Bojanowski, Piotr},
|
| 426 |
+
year={2025},
|
| 427 |
+
eprint={2508.10104},
|
| 428 |
+
archivePrefix={arXiv},
|
| 429 |
+
primaryClass={cs.CV},
|
| 430 |
+
url={https://arxiv.org/abs/2508.10104},
|
| 431 |
+
}
|
| 432 |
+
```
|
depth_anything_v2_metric/depth_anything_v2/dinov3/README.md
ADDED
|
@@ -0,0 +1,734 @@
| 1 |
+
🆕 [2025-08-14] :fire: DINOv3 backbones are now available in [Hugging Face Hub](https://huggingface.co/collections/facebook/dinov3-68924841bd6b561778e31009) and [supported](https://huggingface.co/docs/transformers/model_doc/dinov3) by the Hugging Face [Transformers](https://huggingface.co/docs/transformers/index) library
|
| 2 |
+
|
| 3 |
+
# DINOv3 🦖🦖🦖
|
| 4 |
+
|
| 5 |
+
**[Meta AI Research, FAIR](https://ai.meta.com/research/)**
|
| 6 |
+
|
| 7 |
+
Oriane Siméoni, Huy V. Vo, Maximilian Seitzer, Federico Baldassarre, Maxime Oquab, <br/>
|
| 8 |
+
Cijo Jose, Vasil Khalidov, Marc Szafraniec, Seungeun Yi, Michaël Ramamonjisoa, <br/>
|
| 9 |
+
Francisco Massa, Daniel Haziza, Luca Wehrstedt, Jianyuan Wang, <br/>
|
| 10 |
+
Timothée Darcet, Théo Moutakanni, Leonel Sentana, Claire Roberts, <br/>
|
| 11 |
+
Andrea Vedaldi, Jamie Tolan, John Brandt, Camille Couprie, <br/>
|
| 12 |
+
Julien Mairal, Hervé Jégou, Patrick Labatut, Piotr Bojanowski
|
| 13 |
+
|
| 14 |
+
[ :scroll: [`Paper`](https://arxiv.org/abs/2508.10104)] [ :newspaper: [`Blog`](https://ai.meta.com/blog/dinov3-self-supervised-vision-model/)] [ :globe_with_meridians: [`Website`](https://ai.meta.com/dinov3/)] [ :book: [`BibTeX`](#citing-dinov3)]
|
| 15 |
+
|
| 16 |
+
Reference PyTorch implementation and models for DINOv3. For details, see the **[DINOv3](https://arxiv.org/abs/2508.10104)** paper.
|
| 17 |
+
|
| 18 |
+
## Overview
|
| 19 |
+
|
| 20 |
+
<div align="center">
|
| 21 |
+
<img width="1364" height="1024" alt="market" src="https://github.com/user-attachments/assets/1411f491-988e-49cb-95ae-d03fe6e3c268" />
|
| 22 |
+
|
| 23 |
+
<i></em><b>High-resolution dense features.</b><br/>We visualize the cosine similarity maps obtained with DINOv3 output features<br/> between the patches marked with a red cross and all other patches.</i>
|
| 24 |
+
</div>
|
| 25 |
+
|
| 26 |
+
<br/>
|
| 27 |
+
|
| 28 |
+
An extended family of versatile vision foundation models producing high-quality dense features and achieving outstanding performance on various vision tasks including outperforming the specialized state of the art across a broad range of settings, without fine-tuning
|
| 29 |
+
|
| 30 |
+
## Pretrained models
|
| 31 |
+
|
| 32 |
+
:information_source: Please follow the link provided below to request access to the model weights: once the request is accepted, an e-mail will be sent with the complete list of URLs pointing to all the available model weights (both backbones and adapters). These URLs can then be used to either:
|
| 33 |
+
- download the model or adapter weights to a local filesystem and point `torch.hub.load()` to these local weights via the `weights` or `backbone_weights` parameters, or
|
| 34 |
+
- directly invoke `torch.hub.load()` to download and load a backbone or an adapter from its URL, again via the `weights` or `backbone_weights` parameters.
|
| 35 |
+
|
| 36 |
+
See the example code snippets below.
|
| 37 |
+
|
| 38 |
+
:warning: Please use `wget` instead of a web browser to download the weights.
|
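For example (a minimal sketch with placeholder URL and file names), the weights can be downloaded programmatically with `torch.hub.download_url_to_file` (the Python equivalent of `wget`) and then loaded from the local file:

```python
import torch

# Placeholders: one of the signed URLs received by e-mail, and an arbitrary local file name
weights_url = "<CHECKPOINT/URL>"
local_weights = "dinov3_vitl16_pretrain.pth"

# Download the checkpoint once to the local filesystem (equivalent to using wget)
torch.hub.download_url_to_file(weights_url, local_weights)

# Load the backbone from the locally cloned DINOv3 repo, pointing `weights` to the local file
REPO_DIR = "<PATH/TO/A/LOCAL/DIRECTORY/WHERE/THE/DINOV3/REPO/WAS/CLONED>"
dinov3_vitl16 = torch.hub.load(REPO_DIR, "dinov3_vitl16", source="local", weights=local_weights)
```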
| 39 |
+
|
| 40 |
+
ViT models pretrained on web dataset (LVD-1689M):
|
| 41 |
+
<table style="margin: auto">
|
| 42 |
+
<thead>
|
| 43 |
+
<tr>
|
| 44 |
+
<th>Model</th>
|
| 45 |
+
<th>Parameters</th>
|
| 46 |
+
<th>Pretraining<br/>Dataset</th>
|
| 47 |
+
<th>Download</th>
|
| 48 |
+
</tr>
|
| 49 |
+
</thead>
|
| 50 |
+
<tbody>
|
| 51 |
+
<tr>
|
| 52 |
+
<td>ViT-S/16 distilled </td>
|
| 53 |
+
<td align="right">21M</td>
|
| 54 |
+
<td align="center">LVD-1689M</td>
|
| 55 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 56 |
+
</tr>
|
| 57 |
+
<tr>
|
| 58 |
+
<td>ViT-S+/16 distilled</td>
|
| 59 |
+
<td align="right">29M</td>
|
| 60 |
+
<td align="center">LVD-1689M</td>
|
| 61 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 62 |
+
</tr>
|
| 63 |
+
<tr>
|
| 64 |
+
<td>ViT-B/16 distilled</td>
|
| 65 |
+
<td align="right">86M</td>
|
| 66 |
+
<td align="center">LVD-1689M</td>
|
| 67 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 68 |
+
</tr>
|
| 69 |
+
<tr>
|
| 70 |
+
<td>ViT-L/16 distilled</td>
|
| 71 |
+
<td align="right">300M</td>
|
| 72 |
+
<td align="center">LVD-1689M</td>
|
| 73 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 74 |
+
</tr>
|
| 75 |
+
<tr>
|
| 76 |
+
<td>ViT-H+/16 distilled</td>
|
| 77 |
+
<td align="right">840M</td>
|
| 78 |
+
<td align="center">LVD-1689M</td>
|
| 79 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 80 |
+
</tr>
|
| 81 |
+
<tr>
|
| 82 |
+
<td>ViT-7B/16</td>
|
| 83 |
+
<td align="right">6,716M</td>
|
| 84 |
+
<td align="center">LVD-1689M</td>
|
| 85 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 86 |
+
</tr>
|
| 87 |
+
</tbody>
|
| 88 |
+
</table>
|
| 89 |
+
|
| 90 |
+
ConvNeXt models pretrained on web dataset (LVD-1689M):
|
| 91 |
+
<table style="margin: auto">
|
| 92 |
+
<thead>
|
| 93 |
+
<tr>
|
| 94 |
+
<th>Model</th>
|
| 95 |
+
<th>Parameters</th>
|
| 96 |
+
<th>Pretraining<br/>Dataset</th>
|
| 97 |
+
<th>Download</th>
|
| 98 |
+
</tr>
|
| 99 |
+
</thead>
|
| 100 |
+
<tbody>
|
| 101 |
+
<tr>
|
| 102 |
+
<td>ConvNeXt Tiny</td>
|
| 103 |
+
<td align="right">29M</td>
|
| 104 |
+
<td align="center">LVD-1689M</td>
|
| 105 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 106 |
+
</tr>
|
| 107 |
+
<tr>
|
| 108 |
+
<td>ConvNeXt Small</td>
|
| 109 |
+
<td align="right">50M</td>
|
| 110 |
+
<td align="center">LVD-1689M</td>
|
| 111 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 112 |
+
</tr>
|
| 113 |
+
<tr>
|
| 114 |
+
<td>ConvNeXt Base</td>
|
| 115 |
+
<td align="right">89M</td>
|
| 116 |
+
<td align="center">LVD-1689M</td>
|
| 117 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 118 |
+
</tr>
|
| 119 |
+
<tr>
|
| 120 |
+
<td>ConvNeXt Large</td>
|
| 121 |
+
<td align="right">198M</td>
|
| 122 |
+
<td align="center">LVD-1689M</td>
|
| 123 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 124 |
+
</tr>
|
| 125 |
+
</tbody>
|
| 126 |
+
</table>
|
| 127 |
+
|
| 128 |
+
ViT models pretrained on satellite dataset (SAT-493M):
|
| 129 |
+
<table style="margin: auto">
|
| 130 |
+
<thead>
|
| 131 |
+
<tr>
|
| 132 |
+
<th>Model</th>
|
| 133 |
+
<th>Parameters</th>
|
| 134 |
+
<th>Pretraining<br/>Dataset</th>
|
| 135 |
+
<th>Download</th>
|
| 136 |
+
</tr>
|
| 137 |
+
</thead>
|
| 138 |
+
<tbody>
|
| 139 |
+
<tr>
|
| 140 |
+
<td>ViT-L/16 distilled</td>
|
| 141 |
+
<td align="right">300M</td>
|
| 142 |
+
<td align="center">SAT-493M</td>
|
| 143 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 144 |
+
</tr>
|
| 145 |
+
<tr>
|
| 146 |
+
<td>ViT-7B/16</td>
|
| 147 |
+
<td align="right">6,716M</td>
|
| 148 |
+
<td align="center">SAT-493M</td>
|
| 149 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 150 |
+
</tr>
|
| 151 |
+
</tbody>
|
| 152 |
+
</table>
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
### Pretrained backbones (via PyTorch [Hub](https://docs.pytorch.org/docs/stable/hub.html))
|
| 156 |
+
|
| 157 |
+
Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install PyTorch (the only required dependency for loading the model). Installing PyTorch with CUDA support is strongly recommended.
|
| 158 |
+
|
| 159 |
+
```python
|
| 160 |
+
import torch
|
| 161 |
+
|
| 162 |
+
REPO_DIR = <PATH/TO/A/LOCAL/DIRECTORY/WHERE/THE/DINOV3/REPO/WAS/CLONED>
|
| 163 |
+
|
| 164 |
+
# DINOv3 ViT models pretrained on web images
|
| 165 |
+
dinov3_vits16 = torch.hub.load(REPO_DIR, 'dinov3_vits16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 166 |
+
dinov3_vits16plus = torch.hub.load(REPO_DIR, 'dinov3_vits16plus', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 167 |
+
dinov3_vitb16 = torch.hub.load(REPO_DIR, 'dinov3_vitb16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 168 |
+
dinov3_vitl16 = torch.hub.load(REPO_DIR, 'dinov3_vitl16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 169 |
+
dinov3_vith16plus = torch.hub.load(REPO_DIR, 'dinov3_vith16plus', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 170 |
+
dinov3_vit7b16 = torch.hub.load(REPO_DIR, 'dinov3_vit7b16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 171 |
+
|
| 172 |
+
# DINOv3 ConvNeXt models pretrained on web images
|
| 173 |
+
dinov3_convnext_tiny = torch.hub.load(REPO_DIR, 'dinov3_convnext_tiny', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 174 |
+
dinov3_convnext_small = torch.hub.load(REPO_DIR, 'dinov3_convnext_small', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 175 |
+
dinov3_convnext_base = torch.hub.load(REPO_DIR, 'dinov3_convnext_base', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 176 |
+
dinov3_convnext_large = torch.hub.load(REPO_DIR, 'dinov3_convnext_large', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 177 |
+
|
| 178 |
+
# DINOv3 ViT models pretrained on satellite imagery
|
| 179 |
+
dinov3_vitl16 = torch.hub.load(REPO_DIR, 'dinov3_vitl16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 180 |
+
dinov3_vit7b16 = torch.hub.load(REPO_DIR, 'dinov3_vit7b16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
### Pretrained backbones (via Hugging Face [Transformers](https://huggingface.co/docs/transformers/))
|
| 184 |
+
|
| 185 |
+
All the backbones are available in the [DINOv3](https://huggingface.co/collections/facebook/dinov3-68924841bd6b561778e31009) collection on the Hugging Face Hub and supported via the Hugging Face [Transformers](https://huggingface.co/docs/transformers/index) library. Please refer to the corresponding documentation for usage details; the short examples below demonstrate how to obtain an image embedding with either the `Pipeline` API or the `AutoModel` class.
|
| 186 |
+
|
| 187 |
+
```python
|
| 188 |
+
from transformers import pipeline
|
| 189 |
+
from transformers.image_utils import load_image
|
| 190 |
+
|
| 191 |
+
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
| 192 |
+
image = load_image(url)
|
| 193 |
+
|
| 194 |
+
feature_extractor = pipeline(
|
| 195 |
+
model="facebook/dinov3-convnext-tiny-pretrain-lvd1689m",
|
| 196 |
+
task="image-feature-extraction",
|
| 197 |
+
)
|
| 198 |
+
features = feature_extractor(image)
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
```python
|
| 202 |
+
import torch
|
| 203 |
+
from transformers import AutoImageProcessor, AutoModel
|
| 204 |
+
from transformers.image_utils import load_image
|
| 205 |
+
|
| 206 |
+
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 207 |
+
image = load_image(url)
|
| 208 |
+
|
| 209 |
+
pretrained_model_name = "facebook/dinov3-convnext-tiny-pretrain-lvd1689m"
|
| 210 |
+
processor = AutoImageProcessor.from_pretrained(pretrained_model_name)
|
| 211 |
+
model = AutoModel.from_pretrained(
|
| 212 |
+
pretrained_model_name,
|
| 213 |
+
device_map="auto",
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
inputs = processor(images=image, return_tensors="pt").to(model.device)
|
| 217 |
+
with torch.inference_mode():
|
| 218 |
+
outputs = model(**inputs)
|
| 219 |
+
|
| 220 |
+
pooled_output = outputs.pooler_output
|
| 221 |
+
print("Pooled output shape:", pooled_output.shape)
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
where the checkpoint name (`model` in the pipeline example, `pretrained_model_name` in the `AutoModel` example) can be any of the following (a sketch for extracting patch-level features follows the list):
|
| 225 |
+
- `facebook/dinov3-vits16-pretrain-lvd1689m`
|
| 226 |
+
- `facebook/dinov3-vits16plus-pretrain-lvd1689m`
|
| 227 |
+
- `facebook/dinov3-vitb16-pretrain-lvd1689m`
|
| 228 |
+
- `facebook/dinov3-vitl16-pretrain-lvd1689m`
|
| 229 |
+
- `facebook/dinov3-vith16plus-pretrain-lvd1689m`
|
| 230 |
+
- `facebook/dinov3-vit7b16-pretrain-lvd1689m`
|
| 231 |
+
- `facebook/dinov3-convnext-base-pretrain-lvd1689m`
|
| 232 |
+
- `facebook/dinov3-convnext-large-pretrain-lvd1689m`
|
| 233 |
+
- `facebook/dinov3-convnext-small-pretrain-lvd1689m`
|
| 234 |
+
- `facebook/dinov3-convnext-tiny-pretrain-lvd1689m`
|
| 235 |
+
- `facebook/dinov3-vitl16-pretrain-sat493m`
|
| 236 |
+
- `facebook/dinov3-vit7b16-pretrain-sat493m`
|
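For dense downstream tasks it can also be useful to keep patch-level features instead of only the pooled embedding. A minimal sketch, assuming the standard Transformers output fields (in particular `last_hidden_state`):

```python
import torch
from transformers import AutoImageProcessor, AutoModel
from transformers.image_utils import load_image

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = load_image(url)

pretrained_model_name = "facebook/dinov3-vits16-pretrain-lvd1689m"
processor = AutoImageProcessor.from_pretrained(pretrained_model_name)
model = AutoModel.from_pretrained(pretrained_model_name)

inputs = processor(images=image, return_tensors="pt")
with torch.inference_mode():
    outputs = model(**inputs)

# Token-level features: for the ViT backbones this is (batch, num_tokens, hidden_dim),
# where the tokens include the class/register tokens followed by the patch tokens
print("Last hidden state shape:", outputs.last_hidden_state.shape)
```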
| 237 |
+
|
| 238 |
+
### Image transforms
|
| 239 |
+
|
| 240 |
+
For models using the LVD-1689M weights (pretrained on web images), please use the following transform (standard ImageNet evaluation transform):
|
| 241 |
+
|
| 242 |
+
```python
|
| 243 |
+
from torchvision import transforms
|
| 244 |
+
|
| 245 |
+
def make_transform(resize_size: int = 224):
|
| 246 |
+
to_tensor = transforms.ToTensor()
|
| 247 |
+
resize = transforms.Resize((resize_size, resize_size), antialias=True)
|
| 248 |
+
normalize = transforms.Normalize(
|
| 249 |
+
mean=(0.485, 0.456, 0.406),
|
| 250 |
+
std=(0.229, 0.224, 0.225),
|
| 251 |
+
)
|
| 252 |
+
return transforms.Compose([to_tensor, resize, normalize])
|
| 253 |
+
```
|
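A usage sketch combining this transform with a backbone loaded via `torch.hub.load` as shown earlier (the image path is a placeholder, and the output is only inspected here since its exact format depends on the backbone):

```python
import torch
from PIL import Image

# `make_transform` as defined above, `dinov3_vitl16` loaded via torch.hub.load as shown earlier
transform = make_transform(resize_size=224)

img = Image.open("<PATH/TO/AN/RGB/IMAGE>").convert("RGB")
batch = transform(img).unsqueeze(0)  # shape: (1, 3, 224, 224)

with torch.inference_mode():
    out = dinov3_vitl16(batch)

print(type(out), getattr(out, "shape", None))
```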
| 254 |
+
|
| 255 |
+
|
| 256 |
+
For models using the SAT-493M weights (pretrained on satellite imagery), please use the following transform:
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
```python
|
| 260 |
+
from torchvision import transforms
|
| 261 |
+
|
| 262 |
+
def make_transform(resize_size: int = 224):
|
| 263 |
+
to_tensor = transforms.ToTensor()
|
| 264 |
+
resize = transforms.Resize((resize_size, resize_size), antialias=True)
|
| 265 |
+
normalize = transforms.Normalize(
|
| 266 |
+
mean=(0.430, 0.411, 0.296),
|
| 267 |
+
std=(0.213, 0.156, 0.143),
|
| 268 |
+
)
|
| 269 |
+
return transforms.Compose([to_tensor, resize, normalize])
|
| 270 |
+
```
|
| 271 |
+
|
| 272 |
+
### Pretrained heads - Image classification
|
| 273 |
+
|
| 274 |
+
<table style="margin: auto">
|
| 275 |
+
<thead>
|
| 276 |
+
<tr>
|
| 277 |
+
<th>Backbone</th>
|
| 278 |
+
<th>Pretraining<br/>Dataset</th>
|
| 279 |
+
<th>Head<br/>Dataset</th>
|
| 280 |
+
<th>Download</th>
|
| 281 |
+
</tr>
|
| 282 |
+
</thead>
|
| 283 |
+
<tbody>
|
| 284 |
+
<tr>
|
| 285 |
+
<td>ViT-7B/16</td>
|
| 286 |
+
<td align="center">LVD-1689M</td>
|
| 287 |
+
<td align="center">ImageNet</td>
|
| 288 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 289 |
+
</tr>
|
| 290 |
+
</tbody>
|
| 291 |
+
</table>
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
The (full) classifier models can be loaded via PyTorch Hub:
|
| 295 |
+
|
| 296 |
+
```python
|
| 297 |
+
import torch
|
| 298 |
+
|
| 299 |
+
# DINOv3
|
| 300 |
+
dinov3_vit7b16_lc = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_lc', source="local", weights=<CLASSIFIER/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
|
| 301 |
+
|
| 302 |
+
```
|
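A minimal usage sketch, assuming the classifier returns ImageNet logits for a batch preprocessed with the transform from the "Image transforms" section (the image path is a placeholder):

```python
import torch
from PIL import Image

# `dinov3_vit7b16_lc` loaded as above, `make_transform` as in the "Image transforms" section
transform = make_transform(resize_size=224)
img = Image.open("<PATH/TO/AN/RGB/IMAGE>").convert("RGB")
batch = transform(img).unsqueeze(0)

with torch.inference_mode():
    logits = dinov3_vit7b16_lc(batch)      # assumed shape: (1, 1000)
    top5 = logits.softmax(dim=-1).topk(5)  # top-5 ImageNet class probabilities and indices

print(top5.indices, top5.values)
```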
| 303 |
+
|
| 304 |
+
### Pretrained heads - Depther trained on SYNTHMIX dataset
|
| 305 |
+
|
| 306 |
+
<table style="margin: auto">
|
| 307 |
+
<thead>
|
| 308 |
+
<tr>
|
| 309 |
+
<th>Backbone</th>
|
| 310 |
+
<th>Pretraining<br/>Dataset</th>
|
| 311 |
+
<th>Head<br/>Dataset</th>
|
| 312 |
+
<th>Download</th>
|
| 313 |
+
</tr>
|
| 314 |
+
</thead>
|
| 315 |
+
<tbody>
|
| 316 |
+
<tr>
|
| 317 |
+
<td>ViT-7B/16</td>
|
| 318 |
+
<td align="center">LVD-1689M</td>
|
| 319 |
+
<td align="center">SYNTHMIX</td>
|
| 320 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 321 |
+
</tr>
|
| 322 |
+
</tbody>
|
| 323 |
+
</table>
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
```python
|
| 327 |
+
depther = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_dd', source="local", weights=<DEPTHER/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
|
| 328 |
+
```
|
| 329 |
+
|
| 330 |
+
Full example code for running the depther on an image:
|
| 331 |
+
|
| 332 |
+
```python
|
| 333 |
+
from PIL import Image
|
| 334 |
+
import torch
|
| 335 |
+
from torchvision import transforms
|
| 336 |
+
import matplotlib.pyplot as plt
|
| 337 |
+
from matplotlib import colormaps
|
| 338 |
+
|
| 339 |
+
def get_img():
|
| 340 |
+
import requests
|
| 341 |
+
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 342 |
+
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
|
| 343 |
+
return image
|
| 344 |
+
|
| 345 |
+
def make_transform(resize_size: int | list[int] = 768):
|
| 346 |
+
to_tensor = transforms.ToTensor()
|
| 347 |
+
resize = transforms.Resize((resize_size, resize_size), antialias=True)
|
| 348 |
+
normalize = transforms.Normalize(
|
| 349 |
+
mean=(0.485, 0.456, 0.406),
|
| 350 |
+
std=(0.229, 0.224, 0.225),
|
| 351 |
+
)
|
| 352 |
+
return transforms.Compose([to_tensor, resize, normalize])
|
| 353 |
+
|
| 354 |
+
depther = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_dd', source="local", weights=<DEPTHER/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
|
| 355 |
+
|
| 356 |
+
img_size = 1024
|
| 357 |
+
img = get_img()
|
| 358 |
+
transform = make_transform(img_size)
|
| 359 |
+
with torch.inference_mode():
|
| 360 |
+
with torch.autocast('cuda', dtype=torch.bfloat16):
|
| 361 |
+
batch_img = transform(img)[None]
|
| 362 |
+
# if running on GPU, move the inputs (and the depther) to CUDA, e.g. batch_img = batch_img.cuda()
|
| 363 |
+
depths = depther(batch_img)
|
| 364 |
+
|
| 365 |
+
plt.figure(figsize=(12, 6))
|
| 366 |
+
plt.subplot(121)
|
| 367 |
+
plt.imshow(img)
|
| 368 |
+
plt.axis("off")
|
| 369 |
+
plt.subplot(122)
|
| 370 |
+
plt.imshow(depths[0,0].cpu(), cmap=colormaps["Spectral"])
|
| 371 |
+
plt.axis("off")
|
| 372 |
+
|
| 373 |
+
```
|
| 374 |
+
|
| 375 |
+
### Pretrained heads - Detector trained on COCO2017 dataset
|
| 376 |
+
|
| 377 |
+
<table style="margin: auto">
|
| 378 |
+
<thead>
|
| 379 |
+
<tr>
|
| 380 |
+
<th>Backbone</th>
|
| 381 |
+
<th>Pretraining<br/>Dataset</th>
|
| 382 |
+
<th>Head<br/>Dataset</th>
|
| 383 |
+
<th>Download</th>
|
| 384 |
+
</tr>
|
| 385 |
+
</thead>
|
| 386 |
+
<tbody>
|
| 387 |
+
<tr>
|
| 388 |
+
<td>ViT-7B/16</td>
|
| 389 |
+
<td align="center">LVD-1689M</td>
|
| 390 |
+
<td align="center">COCO2017</td>
|
| 391 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 392 |
+
</tr>
|
| 393 |
+
</tbody>
|
| 394 |
+
</table>
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
```python
|
| 398 |
+
detector = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_de', source="local", weights=<DETECTOR/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
|
| 399 |
+
```
|
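The detector's output format is not documented here, so the sketch below only runs a forward pass on a preprocessed image and inspects the result (the image path and input resolution are illustrative):

```python
import torch
from PIL import Image

# `detector` loaded as above, `make_transform` as in the "Image transforms" section
transform = make_transform(resize_size=1024)
img = Image.open("<PATH/TO/AN/RGB/IMAGE>").convert("RGB")
batch = transform(img).unsqueeze(0)

with torch.inference_mode():
    predictions = detector(batch)

print(type(predictions))
```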
| 400 |
+
|
| 401 |
+
### Pretrained heads - Segmentor trained on ADE20K dataset
|
| 402 |
+
|
| 403 |
+
<table style="margin: auto">
|
| 404 |
+
<thead>
|
| 405 |
+
<tr>
|
| 406 |
+
<th>Backbone</th>
|
| 407 |
+
<th>Pretraining<br/>Dataset</th>
|
| 408 |
+
<th>Head<br/>Dataset</th>
|
| 409 |
+
<th>Download</th>
|
| 410 |
+
</tr>
|
| 411 |
+
</thead>
|
| 412 |
+
<tbody>
|
| 413 |
+
<tr>
|
| 414 |
+
<td>ViT-7B/16</td>
|
| 415 |
+
<td align="center">LVD-1689M</td>
|
| 416 |
+
<td align="center">ADE20K</td>
|
| 417 |
+
<td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
|
| 418 |
+
</tr>
|
| 419 |
+
</tbody>
|
| 420 |
+
</table>
|
| 421 |
+
|
| 422 |
+
```python
|
| 423 |
+
segmentor = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_ms', source="local", weights=<SEGMENTOR/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
|
| 424 |
+
```
|
| 425 |
+
|
| 426 |
+
Full example code for running the segmentor on an image:
|
| 427 |
+
|
| 428 |
+
```python
|
| 429 |
+
import sys
|
| 430 |
+
sys.path.append(REPO_DIR)
|
| 431 |
+
|
| 432 |
+
from PIL import Image
|
| 433 |
+
import torch
|
| 434 |
+
from torchvision import transforms
|
| 435 |
+
import matplotlib.pyplot as plt
|
| 436 |
+
from matplotlib import colormaps
|
| 437 |
+
from functools import partial
|
| 438 |
+
from dinov3.eval.segmentation.inference import make_inference
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
def get_img():
|
| 442 |
+
import requests
|
| 443 |
+
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 444 |
+
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
|
| 445 |
+
return image
|
| 446 |
+
|
| 447 |
+
def make_transform(resize_size: int | list[int] = 768):
|
| 448 |
+
to_tensor = transforms.ToTensor()
|
| 449 |
+
resize = transforms.Resize((resize_size, resize_size), antialias=True)
|
| 450 |
+
normalize = transforms.Normalize(
|
| 451 |
+
mean=(0.485, 0.456, 0.406),
|
| 452 |
+
std=(0.229, 0.224, 0.225),
|
| 453 |
+
)
|
| 454 |
+
return transforms.Compose([to_tensor, resize, normalize])
|
| 455 |
+
|
| 456 |
+
segmentor = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_ms', source="local", weights=<SEGMENTOR/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
|
| 457 |
+
|
| 458 |
+
img_size = 896
|
| 459 |
+
img = get_img()
|
| 460 |
+
transform = make_transform(img_size)
|
| 461 |
+
with torch.inference_mode():
|
| 462 |
+
with torch.autocast('cuda', dtype=torch.bfloat16):
|
| 463 |
+
batch_img = transform(img)[None]
|
| 464 |
+
pred_vit7b = segmentor(batch_img) # raw predictions
|
| 465 |
+
# actual segmentation map
|
| 466 |
+
segmentation_map_vit7b = make_inference(
|
| 467 |
+
batch_img,
|
| 468 |
+
segmentor,
|
| 469 |
+
inference_mode="slide",
|
| 470 |
+
decoder_head_type="m2f",
|
| 471 |
+
rescale_to=(img.size[-1], img.size[-2]),
|
| 472 |
+
n_output_channels=150,
|
| 473 |
+
crop_size=(img_size, img_size),
|
| 474 |
+
stride=(img_size, img_size),
|
| 475 |
+
output_activation=partial(torch.nn.functional.softmax, dim=1),
|
| 476 |
+
).argmax(dim=1, keepdim=True)
|
| 477 |
+
plt.figure(figsize=(12, 6))
|
| 478 |
+
plt.subplot(121)
|
| 479 |
+
plt.imshow(img)
|
| 480 |
+
plt.axis("off")
|
| 481 |
+
plt.subplot(122)
|
| 482 |
+
plt.imshow(segmentation_map_vit7b[0,0].cpu(), cmap=colormaps["Spectral"])
|
| 483 |
+
plt.axis("off")
|
| 484 |
+
```
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
### Pretrained heads - Zero-shot tasks with `dino.txt`
|
| 490 |
+
|
| 491 |
+
<table style="margin: auto">
|
| 492 |
+
<thead>
|
| 493 |
+
<tr>
|
| 494 |
+
<th rowspan="2">Backbone</th>
|
| 495 |
+
<th>Download</th>
|
| 496 |
+
</tr>
|
| 497 |
+
</thead>
|
| 498 |
+
<tbody>
|
| 499 |
+
<tr>
|
| 500 |
+
<td>ViT-L/16 distilled</td>
|
| 501 |
+
<td align="center">
|
| 502 |
+
<a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a>,
|
| 503 |
+
<a href="https://dl.fbaipublicfiles.com/dinov3/thirdparty/bpe_simple_vocab_16e6.txt.gz">vocabulary</a>,
|
| 504 |
+
<a href="https://dl.fbaipublicfiles.com/dinov2/thirdparty/LICENSE">vocabulary license</a>
|
| 505 |
+
</td>
|
| 506 |
+
</tr>
|
| 507 |
+
</tbody>
|
| 508 |
+
</table>
|
| 509 |
+
|
| 510 |
+
The (full) dino.txt model can be loaded via PyTorch Hub:
|
| 511 |
+
|
| 512 |
+
```python
|
| 513 |
+
import torch
|
| 514 |
+
# DINOv3
|
| 515 |
+
dinov3_vitl16_dinotxt_tet1280d20h24l, tokenizer = torch.hub.load(REPO_DIR, 'dinov3_vitl16_dinotxt_tet1280d20h24l', source="local", weights=<DINOTXT/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
|
| 516 |
+
```
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
## Installation
|
| 520 |
+
|
| 521 |
+
The training and evaluation code requires PyTorch >= 2.7.1 as well as a few other 3rd-party packages. Note that the code has only been tested with the specified versions and expects a Linux environment. To set up all the required dependencies for training and evaluation, please follow the instructions below:
|
| 522 |
+
|
| 523 |
+
*[micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html)* **(Recommended)** - Clone the repository and then create and activate a `dinov3` conda environment using the provided environment definition:
|
| 524 |
+
|
| 525 |
+
```shell
|
| 526 |
+
micromamba env create -f conda.yaml
|
| 527 |
+
micromamba activate dinov3
|
| 528 |
+
```
|
| 529 |
+
|
| 530 |
+
## Getting started
|
| 531 |
+
|
| 532 |
+
Several notebooks are provided to help you get started applying DINOv3:
|
| 533 |
+
- [PCA of patch features](notebooks/pca.ipynb): display the PCA of DINOv3 patch features on a foreground object (rainbow visualizations from the paper) [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/pca.ipynb)
|
| 534 |
+
- [Foreground segmentation](notebooks/foreground_segmentation.ipynb): train a linear foreground segmentation model based on DINOv3 features [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/foreground_segmentation.ipynb)
|
| 535 |
+
- [Dense and sparse matching](notebooks/dense_sparse_matching.ipynb): match patches from objects on two different images based on DINOv3 features [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/dense_sparse_matching.ipynb)
|
| 536 |
+
- [Segmentation tracking](notebooks/segmentation_tracking.ipynb): video segmentation tracking using a non-parametric method based on DINOv3 features [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/segmentation_tracking.ipynb)
|
| 537 |
+
|
| 538 |
+
## Data preparation
|
| 539 |
+
|
| 540 |
+
### ImageNet-1k
|
| 541 |
+
|
| 542 |
+
The root directory of the dataset should hold the following contents:
|
| 543 |
+
|
| 544 |
+
- `<ROOT>/test/ILSVRC2012_test_00000001.JPEG`
|
| 545 |
+
- `<ROOT>/test/[..]`
|
| 546 |
+
- `<ROOT>/test/ILSVRC2012_test_00100000.JPEG`
|
| 547 |
+
- `<ROOT>/train/n01440764/n01440764_10026.JPEG`
|
| 548 |
+
- `<ROOT>/train/[...]`
|
| 549 |
+
- `<ROOT>/train/n15075141/n15075141_9993.JPEG`
|
| 550 |
+
- `<ROOT>/val/n01440764/ILSVRC2012_val_00000293.JPEG`
|
| 551 |
+
- `<ROOT>/val/[...]`
|
| 552 |
+
- `<ROOT>/val/n15075141/ILSVRC2012_val_00049174.JPEG`
|
| 553 |
+
- `<ROOT>/labels.txt`
|
| 554 |
+
|
| 555 |
+
The provided dataset implementation expects a few additional metadata files to be present under the extra directory:
|
| 556 |
+
|
| 557 |
+
- `<EXTRA>/class-ids-TRAIN.npy`
|
| 558 |
+
- `<EXTRA>/class-ids-VAL.npy`
|
| 559 |
+
- `<EXTRA>/class-names-TRAIN.npy`
|
| 560 |
+
- `<EXTRA>/class-names-VAL.npy`
|
| 561 |
+
- `<EXTRA>/entries-TEST.npy`
|
| 562 |
+
- `<EXTRA>/entries-TRAIN.npy`
|
| 563 |
+
- `<EXTRA>/entries-VAL.npy`
|
| 564 |
+
|
| 565 |
+
These metadata files can be generated (once) with the following lines of Python code:
|
| 566 |
+
|
| 567 |
+
```python
|
| 568 |
+
from dinov3.data.datasets import ImageNet
|
| 569 |
+
|
| 570 |
+
for split in ImageNet.Split:
|
| 571 |
+
dataset = ImageNet(split=split, root="<ROOT>", extra="<EXTRA>")
|
| 572 |
+
dataset.dump_extra()
|
| 573 |
+
```
|
| 574 |
+
|
| 575 |
+
Note that the root and extra directories do not have to be distinct directories.
|
| 576 |
+
|
| 577 |
+
### ImageNet-22k
|
| 578 |
+
|
| 579 |
+
Please adapt the [dataset class](dinov3/data/datasets/image_net_22k.py) to match your local setup.
|
| 580 |
+
|
| 581 |
+
<br />
|
| 582 |
+
|
| 583 |
+
:warning: To execute the commands provided in the next sections for training and evaluation, the `dinov3` package should be included in the Python module search path, i.e. simply prefix the command to run with `PYTHONPATH=.`.
|
| 584 |
+
|
| 585 |
+
## Training
|
| 586 |
+
|
| 587 |
+
### Fast setup: training DINOv3 ViT-L/16 on ImageNet-1k
|
| 588 |
+
|
| 589 |
+
Run DINOv3 pre-training on 4 H100-80GB nodes (32 GPUs) in a SLURM cluster environment with submitit:
|
| 590 |
+
|
| 591 |
+
```shell
|
| 592 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
|
| 593 |
+
--nodes 4 \
|
| 594 |
+
--config-file dinov3/configs/train/vitl_im1k_lin834.yaml \
|
| 595 |
+
--output-dir <PATH/TO/OUTPUT/DIR> \
|
| 596 |
+
train.dataset_path=ImageNet22k:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
|
| 597 |
+
```
|
| 598 |
+
Training time is approximately 14 hours and the resulting checkpoint should reach 82.0% on k-NN eval and 83.5% on linear eval.
|
| 599 |
+
|
| 600 |
+
The training code saves the weights of the teacher in the eval folder every 12500 iterations for evaluation.
|
| 601 |
+
|
| 602 |
+
### Exact DINOv3 setup: training DINOv3 ViT-7B/16
|
| 603 |
+
|
| 604 |
+
DINOv3 ViT-7B/16 is trained on a private dataset. The training involves 3 stages:
|
| 605 |
+
- Pretraining
|
| 606 |
+
- Gram anchoring
|
| 607 |
+
- High resolution adaptation
|
| 608 |
+
|
| 609 |
+
#### Pretraining
|
| 610 |
+
|
| 611 |
+
Launch DINOv3 ViT-7B/16 pretraining on 32 nodes (256 GPUs) in a SLURM cluster environment with submitit:
|
| 612 |
+
|
| 613 |
+
```shell
|
| 614 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
|
| 615 |
+
--nodes 32 \
|
| 616 |
+
--config-file dinov3/configs/train/dinov3_vit7b16_pretrain.yaml \
|
| 617 |
+
--output-dir <PATH/TO/OUTPUT/DIR> \
|
| 618 |
+
train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
|
| 619 |
+
```
|
| 620 |
+
|
| 621 |
+
#### Gram anchoring
|
| 622 |
+
|
| 623 |
+
```shell
|
| 624 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
|
| 625 |
+
--nodes 32 \
|
| 626 |
+
--config-file dinov3/configs/train/dinov3_vit7b16_gram_anchor.yaml \
|
| 627 |
+
--output-dir <PATH/TO/OUTPUT/DIR> \
|
| 628 |
+
train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
|
| 629 |
+
gram.ckpt=<PATH/TO/GRAM_TEACHER_FROM_PREVIOUS_STEP>
|
| 630 |
+
```
|
| 631 |
+
|
| 632 |
+
#### High-resolution adaptation
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
```shell
|
| 636 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
|
| 637 |
+
--nodes 32 \
|
| 638 |
+
--config-file dinov3/configs/train/dinov3_vit7b16_high_res_adapt.yaml \
|
| 639 |
+
--output-dir <PATH/TO/OUTPUT/DIR> \
|
| 640 |
+
train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
|
| 641 |
+
gram.ckpt=<PATH/TO/TEACHER_FROM_GRAM> \
|
| 642 |
+
student.resume_from_teacher_chkpt=<PATH/TO/TEACHER_FROM_GRAM>
|
| 643 |
+
```
|
| 644 |
+
|
| 645 |
+
## Multi-distillation
|
| 646 |
+
|
| 647 |
+
### Test setup:
|
| 648 |
+
|
| 649 |
+
```shell
|
| 650 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
|
| 651 |
+
--nodes 1 \
|
| 652 |
+
--config-file dinov3/configs/train/multi_distillation_test.yaml \
|
| 653 |
+
--output-dir <PATH/TO/OUTPUT/DIR> \
|
| 654 |
+
--multi-distillation \
|
| 655 |
+
train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
|
| 656 |
+
```
|
| 657 |
+
|
| 658 |
+
## Evaluation
|
| 659 |
+
|
| 660 |
+
The training code regularly saves the teacher weights. To evaluate the model, run one of the following evaluations on a single node:
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
### Logistic regression classification on ImageNet-1k
|
| 664 |
+
|
| 665 |
+
```shell
|
| 666 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/log_regression.py \
|
| 667 |
+
model.config_file=<PATH/TO/OUTPUT/DIR>/config.yaml \
|
| 668 |
+
model.pretrained_weights=<PATH/TO/OUTPUT/DIR>/teacher_checkpoint.pth \
|
| 669 |
+
output_dir=<PATH/TO/OUTPUT/DIR> \
|
| 670 |
+
train.dataset=ImageNet:split=TRAIN:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
|
| 671 |
+
eval.test_dataset=ImageNet:split=VAL:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
|
| 672 |
+
```
|
| 673 |
+
|
| 674 |
+
### k-NN classification on ImageNet-1k
|
| 675 |
+
|
| 676 |
+
```shell
|
| 677 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/knn.py \
|
| 678 |
+
model.config_file=<PATH/TO/OUTPUT/DIR>/config.yaml \
|
| 679 |
+
model.pretrained_weights=<PATH/TO/OUTPUT/DIR>/teacher_checkpoint.pth \
|
| 680 |
+
output_dir=<PATH/TO/OUTPUT/DIR> \
|
| 681 |
+
train.dataset=ImageNet:split=TRAIN:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
|
| 682 |
+
eval.test_dataset=ImageNet:split=VAL:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
|
| 683 |
+
```
|
| 684 |
+
|
| 685 |
+
### Linear classification with data augmentation on ImageNet-1k
|
| 686 |
+
|
| 687 |
+
```shell
|
| 688 |
+
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/linear.py \
|
| 689 |
+
model.config_file=<PATH/TO/OUTPUT/DIR>/config.yaml \
|
| 690 |
+
model.pretrained_weights=<PATH/TO/OUTPUT/DIR>/teacher_checkpoint.pth \
|
| 691 |
+
output_dir=<PATH/TO/OUTPUT/DIR> \
|
| 692 |
+
train.dataset=ImageNet:split=TRAIN:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
|
| 693 |
+
train.val_dataset=ImageNet:split=VAL:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
|
| 694 |
+
```
|
| 695 |
+
|
| 696 |
+
|
| 697 |
+
### Text alignment on DINOv3 using dino.txt
|
| 698 |
+
|
| 699 |
+
Text alignment can be done following the method from `dino.txt` aka [DINOv2 Meets Text](https://arxiv.org/abs/2412.16334).
|
| 700 |
+
|
| 701 |
+
```shell
|
| 702 |
+
# An example config for text alignment is provided in dinov3/eval/text/configs/dinov3_vitl_text.yaml
PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/text/train_dinotxt.py \
|
| 703 |
+
--nodes 4 \
|
| 705 |
+
trainer_config_file="<PATH/TO/DINOv3/TEXT/CONFIG>" \
|
| 706 |
+
output-dir=<PATH/TO/OUTPUT/DIR>
|
| 707 |
+
```
|
| 708 |
+
The command above trains text alignment on 4 nodes with 8 GPUs each (32 GPUs in total).
|
| 709 |
+
Please note that the text alignment model in the DINOv3 paper was trained on a private dataset; the example config in ```dinov3/eval/text/configs/dinov3_vitl_text.yaml``` uses the ```CocoCaptions``` dataset for illustration purposes only.
|
| 710 |
+
Please adapt the provided ```CocoCaptions``` dataset class to your local setup; the dataset can be found [here](https://www.kaggle.com/datasets/nikhil7280/coco-image-caption).
|
| 711 |
+
|
| 712 |
+
## License
|
| 713 |
+
|
| 714 |
+
DINOv3 code and model weights are released under the DINOv3 License. See [LICENSE.md](LICENSE.md) for additional details.
|
| 715 |
+
|
| 716 |
+
## Contributing
|
| 717 |
+
|
| 718 |
+
See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
|
| 719 |
+
|
| 720 |
+
## Citing DINOv3
|
| 721 |
+
|
| 722 |
+
If you find this repository useful, please consider giving a star :star: and citation :t-rex::
|
| 723 |
+
|
| 724 |
+
```
|
| 725 |
+
@misc{simeoni2025dinov3,
|
| 726 |
+
title={{DINOv3}},
|
| 727 |
+
author={Sim{\'e}oni, Oriane and Vo, Huy V. and Seitzer, Maximilian and Baldassarre, Federico and Oquab, Maxime and Jose, Cijo and Khalidov, Vasil and Szafraniec, Marc and Yi, Seungeun and Ramamonjisoa, Micha{\"e}l and Massa, Francisco and Haziza, Daniel and Wehrstedt, Luca and Wang, Jianyuan and Darcet, Timoth{\'e}e and Moutakanni, Th{\'e}o and Sentana, Leonel and Roberts, Claire and Vedaldi, Andrea and Tolan, Jamie and Brandt, John and Couprie, Camille and Mairal, Julien and J{\'e}gou, Herv{\'e} and Labatut, Patrick and Bojanowski, Piotr},
|
| 728 |
+
year={2025},
|
| 729 |
+
eprint={2508.10104},
|
| 730 |
+
archivePrefix={arXiv},
|
| 731 |
+
primaryClass={cs.CV},
|
| 732 |
+
url={https://arxiv.org/abs/2508.10104},
|
| 733 |
+
}
|
| 734 |
+
```
|
depth_anything_v2_metric/depth_anything_v2/dinov3/conda.yaml
ADDED
|
@@ -0,0 +1,23 @@
| 1 |
+
name: dinov3
|
| 2 |
+
channels:
|
| 3 |
+
- defaults
|
| 4 |
+
- conda-forge
|
| 5 |
+
dependencies:
|
| 6 |
+
- python=3.11
|
| 7 |
+
- omegaconf
|
| 8 |
+
- pip
|
| 9 |
+
- pip:
|
| 10 |
+
- ftfy # needed for dino.txt
|
| 11 |
+
- iopath
|
| 12 |
+
- omegaconf
|
| 13 |
+
- pandas
|
| 14 |
+
- regex # needed for dino.txt
|
| 16 |
+
- scikit-learn
|
| 17 |
+
- scikit-learn-intelex
|
| 18 |
+
- submitit
|
| 19 |
+
- termcolor
|
| 20 |
+
- torch
|
| 21 |
+
- torchvision
|
| 22 |
+
- torchmetrics
|
| 23 |
+
|
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This software may be used and distributed in accordance with
|
| 4 |
+
# the terms of the DINOv3 License Agreement.
|
| 5 |
+
|
| 6 |
+
__version__ = "0.0.1"
|
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This software may be used and distributed in accordance with
|
| 4 |
+
# the terms of the DINOv3 License Agreement.
|
| 5 |
+
|
| 6 |
+
from .checkpointer import (
|
| 7 |
+
CheckpointRetentionPolicy,
|
| 8 |
+
cleanup_checkpoint,
|
| 9 |
+
find_all_checkpoints,
|
| 10 |
+
find_latest_checkpoint,
|
| 11 |
+
init_fsdp_model_from_checkpoint,
|
| 12 |
+
init_model_from_checkpoint_for_evals,
|
| 13 |
+
keep_checkpoint_copy,
|
| 14 |
+
keep_last_n_checkpoints,
|
| 15 |
+
load_checkpoint,
|
| 16 |
+
register_dont_save_hooks,
|
| 17 |
+
save_checkpoint,
|
| 18 |
+
)
|
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/checkpointer.py
ADDED
|
@@ -0,0 +1,352 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This software may be used and distributed in accordance with
|
| 4 |
+
# the terms of the DINOv3 License Agreement.
|
| 5 |
+
|
| 6 |
+
"""
|
| 7 |
+
Suggested file structure:
|
| 8 |
+
|
| 9 |
+
output_dir/
|
| 10 |
+
|-- ckpt/
|
| 11 |
+
| |-- 0/
|
| 12 |
+
| |-- 99/
|
| 13 |
+
| |-- 199/
|
| 14 |
+
| |-- 199_keep/
|
| 15 |
+
| |-- 299/
|
| 16 |
+
| `-- ...
|
| 17 |
+
`-- eval/
|
| 18 |
+
`-- 0/
|
| 19 |
+
`-- 99/
|
| 20 |
+
`-- ckpt/
|
| 21 |
+
|
| 22 |
+
Distributed checkpointer docs:
|
| 23 |
+
- https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html
|
| 24 |
+
- https://pytorch.org/docs/stable/distributed.checkpoint.html
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
import logging
|
| 28 |
+
import shutil
|
| 29 |
+
import subprocess
|
| 30 |
+
import tempfile
|
| 31 |
+
from enum import Enum
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
from typing import List, Sequence, Set
|
| 34 |
+
|
| 35 |
+
import torch
|
| 36 |
+
import torch.distributed as dist
|
| 37 |
+
import torch.distributed.checkpoint as dcp
|
| 38 |
+
import torch.distributed.checkpoint.filesystem as dcpfs
|
| 39 |
+
import torch.distributed.checkpoint.state_dict as dcpsd
|
| 40 |
+
from torch.distributed.checkpoint.stateful import Stateful
|
| 41 |
+
|
| 42 |
+
logger = logging.getLogger("dinov3")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class CheckpointRetentionPolicy(Enum):
|
| 46 |
+
ALL = "all" # keep all checkpoints
|
| 47 |
+
BEST = "best"
|
| 48 |
+
LAST = "last"
|
| 49 |
+
LAST_AND_BEST = "last_and_best"
|
| 50 |
+
NONE = "none" # do not keep any checkpoints
|
| 51 |
+
|
| 52 |
+
@property
|
| 53 |
+
def keep_filters(self) -> Set[str]:
|
| 54 |
+
"""Files that match these patterns are not deleted by cleanup"""
|
| 55 |
+
if self == CheckpointRetentionPolicy.LAST:
|
| 56 |
+
return set(["final"])
|
| 57 |
+
if self == CheckpointRetentionPolicy.BEST:
|
| 58 |
+
return set(["best"])
|
| 59 |
+
if self == CheckpointRetentionPolicy.LAST_AND_BEST:
|
| 60 |
+
return set(["final", "best"])
|
| 61 |
+
if self == CheckpointRetentionPolicy.ALL:
|
| 62 |
+
return set()
|
| 63 |
+
return set()
|
| 64 |
+
|
| 65 |
+
@property
|
| 66 |
+
def max_to_keep(self) -> int | None:
|
| 67 |
+
"""
|
| 68 |
+
maximum "periodic" checkpoints to keep concurrently, ie. saved with `step` and not `save`. `None` for keep all
|
| 69 |
+
"""
|
| 70 |
+
if self == CheckpointRetentionPolicy.ALL:
|
| 71 |
+
return None
|
| 72 |
+
return 1
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def save_checkpoint(
|
| 76 |
+
ckpt_dir: str | Path, # output_dir/ckpt/199
|
| 77 |
+
*,
|
| 78 |
+
iteration: int | str,
|
| 79 |
+
model: torch.nn.Module,
|
| 80 |
+
optimizer: torch.optim.Optimizer | None = None,
|
| 81 |
+
overwrite: bool = True,
|
| 82 |
+
process_group: dist.ProcessGroup = None,
|
| 83 |
+
**others: Stateful,
|
| 84 |
+
):
|
| 85 |
+
"""Save a plain/DDP/FSDP/FSDP2 model, its optimizer, an integer iteration and other stateful objects."""
|
| 86 |
+
rank = torch.distributed.get_rank(group=process_group)
|
| 87 |
+
|
| 88 |
+
# Rank 0 checks if the checkpoint directory exists, but all ranks need to know if it exists,
|
| 89 |
+
# so they can raise an error when overwrite is False. If overwrite is True, rank 0 will delete it
|
| 90 |
+
# and other ranks wait for the deletion to finish.
|
| 91 |
+
ckpt_dir = Path(ckpt_dir)
|
| 92 |
+
ckpt_dir_exists = [ckpt_dir.exists() if rank == 0 else None]
|
| 93 |
+
src_rank = 0
|
| 94 |
+
if process_group is not None:
|
| 95 |
+
src_rank = torch.distributed.get_global_rank(group=process_group, group_rank=0)
|
| 96 |
+
torch.distributed.broadcast_object_list(ckpt_dir_exists, src=src_rank, group=process_group)
|
| 97 |
+
ckpt_dir_exists = ckpt_dir_exists[0]
|
| 98 |
+
if ckpt_dir_exists:
|
| 99 |
+
if overwrite:
|
| 100 |
+
if rank == 0:
|
| 101 |
+
if ckpt_dir.is_dir():
|
| 102 |
+
shutil.rmtree(ckpt_dir)
|
| 103 |
+
else:
|
| 104 |
+
ckpt_dir.unlink()
|
| 105 |
+
logger.info(f"Deleted: {ckpt_dir}")
|
| 106 |
+
torch.distributed.barrier(group=process_group)
|
| 107 |
+
else:
|
| 108 |
+
raise RuntimeError(f"Checkpoint already exists: {ckpt_dir}")
|
| 109 |
+
|
| 110 |
+
# Rank 0 creates a temporary directory for the checkpoint and broadcasts the name to all ranks.
|
| 111 |
+
ckpt_dir.parent.mkdir(parents=True, exist_ok=True)
|
| 112 |
+
ckpt_dir_tmp = [tempfile.mkdtemp(dir=ckpt_dir.parent, prefix=ckpt_dir.name) if rank == 0 else None]
|
| 113 |
+
torch.distributed.broadcast_object_list(ckpt_dir_tmp, src=src_rank, group=process_group)
|
| 114 |
+
ckpt_dir_tmp = Path(ckpt_dir_tmp[0])
|
| 115 |
+
|
| 116 |
+
to_save = {"iteration": iteration}
|
| 117 |
+
to_save["model"] = dcpsd.get_model_state_dict(model)
|
| 118 |
+
if optimizer is not None:
|
| 119 |
+
to_save["optimizer"] = dcpsd.get_optimizer_state_dict(model, optimizer)
|
| 120 |
+
to_save.update(others)
|
| 121 |
+
dcp.save(
|
| 122 |
+
to_save,
|
| 123 |
+
storage_writer=dcpfs.FileSystemWriter(ckpt_dir_tmp),
|
| 124 |
+
process_group=process_group,
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# Rank 0 renames the temporary directory to the final checkpoint directory. All ranks wait for the rename.
|
| 128 |
+
if rank == 0:
|
| 129 |
+
ckpt_dir_tmp.rename(ckpt_dir)
|
| 130 |
+
torch.distributed.barrier()
|
| 131 |
+
|
| 132 |
+
logger.info(f"Saved: {ckpt_dir}")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def load_checkpoint(
|
| 136 |
+
ckpt_dir: str | Path, # output_dir/ckpt/199
|
| 137 |
+
*,
|
| 138 |
+
model: torch.nn.Module,
|
| 139 |
+
optimizer: torch.optim.Optimizer | None = None,
|
| 140 |
+
strict_loading: bool = True,
|
| 141 |
+
process_group: dist.ProcessGroup = None,
|
| 142 |
+
**others: Stateful,
|
| 143 |
+
) -> int | None:
|
| 144 |
+
"""
|
| 145 |
+
Load a plain/DDP/FSDP/FSDP2 model, its optimizer, an integer iteration and other stateful objects.
|
| 146 |
+
Can you take a checkpoint saved on N ranks and load it on M ranks? Sure you can!
|
| 147 |
+
Activation checkpointing and torch-compile can also be different between save and load, no problem.
|
| 148 |
+
"""
|
| 149 |
+
ckpt_dir = Path(ckpt_dir)
|
| 150 |
+
to_load = {"iteration": None}
|
| 151 |
+
to_load["model"] = dcpsd.get_model_state_dict(model)
|
| 152 |
+
if optimizer is not None:
|
| 153 |
+
to_load["optimizer"] = dcpsd.get_optimizer_state_dict(model, optimizer)
|
| 154 |
+
to_load.update(others)
|
| 155 |
+
dcp.load(
|
| 156 |
+
to_load,
|
| 157 |
+
storage_reader=dcpfs.FileSystemReader(ckpt_dir),
|
| 158 |
+
planner=dcp.default_planner.DefaultLoadPlanner(allow_partial_load=not strict_loading),
|
| 159 |
+
process_group=process_group,
|
| 160 |
+
)
|
| 161 |
+
iteration = to_load["iteration"]
|
| 162 |
+
dcpsd.set_model_state_dict(model, to_load["model"])
|
| 163 |
+
if optimizer is not None:
|
| 164 |
+
dcpsd.set_optimizer_state_dict(model, optimizer, to_load["optimizer"])
|
| 165 |
+
logger.info(f"Loaded: {ckpt_dir}")
|
| 166 |
+
return iteration
|
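# Illustrative usage (hypothetical names, not executed at import time); both functions use
# collectives, so torch.distributed must be initialized first:
#
#   model = build_model()  # plain/DDP/FSDP/FSDP2 module
#   optimizer = torch.optim.AdamW(model.parameters())
#   save_checkpoint("output_dir/ckpt/199", iteration=199, model=model, optimizer=optimizer)
#   ...
#   start_iteration = load_checkpoint("output_dir/ckpt/199", model=model, optimizer=optimizer)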
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def register_dont_save_hooks(module: torch.nn.Module, dont_save: Sequence[str]):
|
| 170 |
+
"""
|
| 171 |
+
Registers save/load state dict hooks such that the weights in `dont_save` are not persisted in the checkpoint.
|
| 172 |
+
|
| 173 |
+
Typical use case: a classification model composed of a frozen backbone and a trainable head.
|
| 174 |
+
If the frozen backbone is loaded from torch hub, it doesn't make sense to save a copy of it in each checkpoint.
|
| 175 |
+
"""
|
| 176 |
+
|
| 177 |
+
def state_dict_post_hook(module, state_dict, prefix, local_metadata):
|
| 178 |
+
# Remove frozen weights so they won't get saved.
|
| 179 |
+
# If this module is not the top-level module, its weights will have a prefix in the state dict.
|
| 180 |
+
nonlocal _dont_save
|
| 181 |
+
for k in _dont_save:
|
| 182 |
+
del state_dict[prefix + k]
|
| 183 |
+
|
| 184 |
+
def load_state_dict_pre_hook(
|
| 185 |
+
module,
|
| 186 |
+
state_dict,
|
| 187 |
+
prefix,
|
| 188 |
+
local_metadata,
|
| 189 |
+
strict,
|
| 190 |
+
missing_keys,
|
| 191 |
+
unexpected_keys,
|
| 192 |
+
error_msgs,
|
| 193 |
+
):
|
| 194 |
+
# This pre hook exists only to pass the prefix to the post hook when loading the state dict.
|
| 195 |
+
nonlocal _prefix
|
| 196 |
+
assert _prefix is None
|
| 197 |
+
_prefix = prefix
|
| 198 |
+
|
| 199 |
+
def load_state_dict_post_hook(module, incompatible_keys):
|
| 200 |
+
# Remove the frozen weights from the missing keys so they don't raise an error.
|
| 201 |
+
nonlocal _prefix
|
| 202 |
+
assert _prefix is not None
|
| 203 |
+
to_remove = []
|
| 204 |
+
for missing_key in incompatible_keys.missing_keys:
|
| 205 |
+
k = missing_key.removeprefix(_prefix)
|
| 206 |
+
k = k.replace("_checkpoint_wrapped_module.", "") # Added by activation checkpointing
|
| 207 |
+
if k in _dont_save:
|
| 208 |
+
to_remove.append(missing_key)
|
| 209 |
+
for r in to_remove:
|
| 210 |
+
incompatible_keys.missing_keys.remove(r)
|
| 211 |
+
_prefix = None
|
| 212 |
+
|
| 213 |
+
_dont_save = set(name.replace("_checkpoint_wrapped_module.", "") for name in dont_save)
|
| 214 |
+
_prefix = None
|
| 215 |
+
module.register_state_dict_post_hook(state_dict_post_hook)
|
| 216 |
+
module.register_load_state_dict_pre_hook(load_state_dict_pre_hook)
|
| 217 |
+
module.register_load_state_dict_post_hook(load_state_dict_post_hook)
|
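# Illustrative usage (hypothetical names): skip persisting a frozen backbone inside a
# backbone+head classifier so that checkpoints only contain the trainable head.
#
#   classifier = torch.nn.Sequential(frozen_backbone, linear_head)
#   frozen_names = [f"0.{name}" for name, _ in frozen_backbone.named_parameters()]
#   register_dont_save_hooks(classifier, dont_save=frozen_names)
#   state = classifier.state_dict()  # no longer contains the frozen backbone parameters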
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def find_all_checkpoints(ckpt_dir: Path | str) -> list[Path]:
|
| 221 |
+
"""Find all checkpoints in a directory, i.e. subdirs with integer name. Sorted from first to last."""
|
| 222 |
+
ckpt_dir = Path(ckpt_dir)
|
| 223 |
+
if not ckpt_dir.is_dir():
|
| 224 |
+
return []
|
| 225 |
+
checkpoints = [p for p in ckpt_dir.iterdir() if p.is_dir() and _is_int(p.name)]
|
| 226 |
+
checkpoints.sort(key=lambda p: int(p.name))
|
| 227 |
+
return checkpoints
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def find_latest_checkpoint(ckpt_dir: Path | str) -> Path | None:
|
| 231 |
+
"""Find the latest checkpoint in a directory, i.e. the subdir with the highest integer name."""
|
| 232 |
+
checkpoints = find_all_checkpoints(ckpt_dir)
|
| 233 |
+
if len(checkpoints) == 0:
|
| 234 |
+
return None
|
| 235 |
+
return checkpoints[-1]
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def keep_last_n_checkpoints(ckpt_dir: Path | str, n: int | None):
|
| 239 |
+
"""In a directory with integer-named subdirs, keep only the n subdirs with the highest number."""
|
| 240 |
+
if n is None:
|
| 241 |
+
return
|
| 242 |
+
checkpoints = find_all_checkpoints(ckpt_dir)
|
| 243 |
+
for ckpt_dir in checkpoints[:-n]:
|
| 244 |
+
try:
|
| 245 |
+
shutil.rmtree(ckpt_dir)
|
| 246 |
+
logger.info(f"Deleted: {ckpt_dir}")
|
| 247 |
+
except Exception:
|
| 248 |
+
logger.exception(f"Failed to delete: {ckpt_dir}")
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def keep_checkpoint_copy(src: Path | str):
|
| 252 |
+
"""Copy a file/directory next to itself with a _keep suffix. Files are hardlinked."""
|
| 253 |
+
src = Path(src)
|
| 254 |
+
dst = src.parent / f"{src.name}_keep"
|
| 255 |
+
subprocess.check_output(["cp", "--recursive", "--link", src, dst])
|
| 256 |
+
logger.info(f"Copied: {src} -> {dst}")
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _is_int(s: str) -> bool:
|
| 260 |
+
try:
|
| 261 |
+
int(s)
|
| 262 |
+
return True
|
| 263 |
+
except ValueError:
|
| 264 |
+
return False
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# Initialize a FSDP2 model from DCP or PyTorch standard checkpoint
|
| 268 |
+
def init_fsdp_model_from_checkpoint(
|
| 269 |
+
model: torch.nn.Module,
|
| 270 |
+
checkpoint_path: str,
|
| 271 |
+
skip_load_keys: List[str] | None = None,
|
| 272 |
+
keys_not_sharded: List[str] | None = None,
|
| 273 |
+
process_group: dist.ProcessGroup = None,
|
| 274 |
+
):
|
| 275 |
+
if not Path(checkpoint_path).is_dir(): # PyTorch standard checkpoint
|
| 276 |
+
logger.info(f"Loading pretrained weights from {checkpoint_path}")
|
| 277 |
+
chkpt = torch.load(checkpoint_path, map_location="cpu")["teacher"]
|
| 278 |
+
from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
|
| 279 |
+
|
| 280 |
+
if process_group is None:
|
| 281 |
+
world_mesh = init_device_mesh(
|
| 282 |
+
"cuda",
|
| 283 |
+
mesh_shape=(dist.get_world_size(),),
|
| 284 |
+
mesh_dim_names=("dp",),
|
| 285 |
+
)
|
| 286 |
+
else:
|
| 287 |
+
world_mesh = DeviceMesh.from_group(process_group, "cuda")
|
| 288 |
+
chkpt = {
|
| 289 |
+
key: (
|
| 290 |
+
torch.distributed.tensor.distribute_tensor(tensor, world_mesh, src_data_rank=None)
|
| 291 |
+
if not any(key_not_sharded in key for key_not_sharded in keys_not_sharded)
|
| 292 |
+
else tensor
|
| 293 |
+
)
|
| 294 |
+
for key, tensor in chkpt.items()
|
| 295 |
+
}
|
| 296 |
+
model.load_state_dict(
|
| 297 |
+
{
|
| 298 |
+
key: tensor
|
| 299 |
+
for key, tensor in chkpt.items()
|
| 300 |
+
if not any(skip_load_key in key for skip_load_key in skip_load_keys)
|
| 301 |
+
}
|
| 302 |
+
)
|
| 303 |
+
else: # DCP checkpoint
|
| 304 |
+
load_checkpoint(ckpt_dir=checkpoint_path, model=model, process_group=process_group)
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
# Initialize a standard non distributed PyTorch model from PyTorch standard checkpoint for evals
|
| 308 |
+
def init_model_from_checkpoint_for_evals(
|
| 309 |
+
model: torch.nn.Module, pretrained_weights: str | Path, checkpoint_key: str = None
|
| 310 |
+
):
|
| 311 |
+
state_dict = torch.load(pretrained_weights, map_location="cpu")
|
| 312 |
+
if checkpoint_key is not None and checkpoint_key in state_dict:
|
| 313 |
+
logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
|
| 314 |
+
state_dict = state_dict[checkpoint_key]
|
| 315 |
+
# remove `module.` prefix
|
| 316 |
+
state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
|
| 317 |
+
# remove `backbone.` prefix induced by multicrop wrapper
|
| 318 |
+
state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
|
| 319 |
+
msg = model.load_state_dict(state_dict, strict=False)
|
| 320 |
+
logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
def cleanup_checkpoint(ckpt_dir: str, checkpoint_retention_policy: CheckpointRetentionPolicy):
|
| 324 |
+
"""
|
| 325 |
+
ckpt_dir is the directory containing each individual checkpoint directories (either at iteration, best (validation performance) or final)
|
| 326 |
+
|-- ckpt_dir/
|
| 327 |
+
| |-- 0/
|
| 328 |
+
| |--checkpoint.pth or dcp_sharded_checkpoint_dir
|
| 329 |
+
| |-- 99/
|
| 330 |
+
|--checkpoint.pth or dcp_sharded_checkpoint_dir
|
| 331 |
+
| |-- 199/
|
| 332 |
+
|--checkpoint.pth or dcp_sharded_checkpoint_dir
|
| 333 |
+
| |-- best/
|
| 334 |
+
|--checkpoint.pth or dcp_sharded_checkpoint_dir
|
| 335 |
+
| |-- 299/
|
| 336 |
+
|--checkpoint.pth or dcp_sharded_checkpoint_dir
|
| 337 |
+
| |-- final/
|
| 338 |
+
|--checkpoint.pth or dcp_sharded_checkpoint_dir
|
| 339 |
+
"""
|
| 340 |
+
ckpt_dir = Path(ckpt_dir)
|
| 341 |
+
if not ckpt_dir.is_dir():
|
| 342 |
+
return []
|
| 343 |
+
checkpoint_filters = checkpoint_retention_policy.keep_filters
|
| 344 |
+
checkpoints = [p for p in ckpt_dir.iterdir() if p.is_dir()]
|
| 345 |
+
for checkpoint in checkpoints:
|
| 346 |
+
if checkpoint.name in checkpoint_filters:
|
| 347 |
+
continue
|
| 348 |
+
try:
|
| 349 |
+
shutil.rmtree(checkpoint)
|
| 350 |
+
logger.info(f"Deleted: {checkpoint}")
|
| 351 |
+
except Exception:
|
| 352 |
+
logger.exception(f"Failed to delete: {checkpoint}")
|
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This software may be used and distributed in accordance with
|
| 4 |
+
# the terms of the DINOv3 License Agreement.
|
| 5 |
+
|
| 6 |
+
from .config import (
|
| 7 |
+
DinoV3SetupArgs,
|
| 8 |
+
apply_scaling_rules_to_cfg,
|
| 9 |
+
exit_job,
|
| 10 |
+
get_cfg_from_args,
|
| 11 |
+
get_default_config,
|
| 12 |
+
setup_config,
|
| 13 |
+
setup_job,
|
| 14 |
+
setup_multidistillation,
|
| 15 |
+
write_config,
|
| 16 |
+
)
|
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/config.py
ADDED
|
@@ -0,0 +1,222 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This software may be used and distributed in accordance with
|
| 4 |
+
# the terms of the DINOv3 License Agreement.
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import math
|
| 8 |
+
import os
|
| 9 |
+
import pathlib
|
| 10 |
+
import sys
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from datetime import timedelta
|
| 13 |
+
from typing import Any, List, Optional, Sequence, Tuple
|
| 14 |
+
|
| 15 |
+
from omegaconf import DictConfig, OmegaConf
|
| 16 |
+
|
| 17 |
+
import dinov3.distributed as distributed
|
| 18 |
+
from dinov3.logging import cleanup_logging, setup_logging
|
| 19 |
+
from dinov3.utils import fix_random_seeds, get_conda_env, get_sha
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger("dinov3")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
|
| 25 |
+
class DinoV3SetupArgs:
|
| 26 |
+
config_file: str
|
| 27 |
+
pretrained_weights: str | None = None
|
| 28 |
+
shard_unsharded_model: bool = False
|
| 29 |
+
output_dir: str = ""
|
| 30 |
+
opts: List[Any] = field(default_factory=lambda: [])
|
| 31 |
+
|
| 32 |
+
def __post_init__(self):
|
| 33 |
+
# When loaded from benchmark.yaml, self.opts is a frozen omegaconf.ListConfig,
|
| 34 |
+
# which works everywhere except when we want to modify it or when
|
| 35 |
+
# we try to json-serialize it. So we convert it to a regular list here.
|
| 36 |
+
if OmegaConf.is_config(self.opts):
|
| 37 |
+
self.opts = OmegaConf.to_object(self.opts)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def apply_scaling_rules_to_cfg(cfg): # to fix
|
| 41 |
+
assert distributed.is_enabled(), "Setup distributed to get global size !"
|
| 42 |
+
if "schedules" in cfg:
|
| 43 |
+
# For schedules v2, the scaling rules are applied when building the schedules, the config is not modified
|
| 44 |
+
return cfg
|
| 45 |
+
|
| 46 |
+
if cfg.optim.scaling_rule == "linear_wrt_256":
|
| 47 |
+
old_lr = cfg.optim.lr
|
| 48 |
+
cfg.optim.lr *= cfg.train.batch_size_per_gpu * distributed.get_world_size() / 256.0
|
| 49 |
+
logger.info(f"linear scaling learning rate; old: {old_lr}, new: {cfg.optim.lr}")
|
| 50 |
+
elif cfg.optim.scaling_rule == "sqrt_wrt_1024":
|
| 51 |
+
old_lr = cfg.optim.lr
|
| 52 |
+
cfg.optim.lr *= 4 * math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_world_size() / 1024.0)
|
| 53 |
+
logger.info(f"sqrt scaling learning rate; old: {old_lr}, new: {cfg.optim.lr}")
|
| 54 |
+
return cfg
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def write_config(cfg, output_dir, name="config.yaml"):
|
| 58 |
+
logger.info(OmegaConf.to_yaml(cfg))
|
| 59 |
+
output_dir = os.path.abspath(output_dir)
|
| 60 |
+
saved_cfg_path = os.path.join(output_dir, name)
|
| 61 |
+
with open(saved_cfg_path, "w") as f:
|
| 62 |
+
OmegaConf.save(config=cfg, f=f)
|
| 63 |
+
return saved_cfg_path
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_default_config() -> DictConfig:
|
| 67 |
+
p = pathlib.Path(__file__).parent / "ssl_default_config.yaml"
|
| 68 |
+
return OmegaConf.load(p)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def get_cfg_from_args(args: DinoV3SetupArgs, multidistillation=False, strict=True):
|
| 72 |
+
overrides = [*args.opts]
|
| 73 |
+
if args.output_dir is not None:
|
| 74 |
+
overrides.append(f"train.output_dir={os.path.realpath(args.output_dir)}")
|
| 75 |
+
|
| 76 |
+
# Config file
|
| 77 |
+
cfg = OmegaConf.load(args.config_file)
|
| 78 |
+
|
| 79 |
+
# Command line overrides
|
| 80 |
+
opts_cfg = OmegaConf.from_cli(overrides)
|
| 81 |
+
|
| 82 |
+
if multidistillation:
|
| 83 |
+
cfg = OmegaConf.merge(cfg, opts_cfg)
|
| 84 |
+
else:
|
| 85 |
+
# Default config
|
| 86 |
+
default_cfg = get_default_config()
|
| 87 |
+
if strict:
|
| 88 |
+
OmegaConf.set_struct(default_cfg, True)
|
| 89 |
+
cfg = OmegaConf.merge(default_cfg, cfg, opts_cfg)
|
| 90 |
+
return cfg
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def setup_config(args: DinoV3SetupArgs, strict_cfg=True):
|
| 94 |
+
"""
|
| 95 |
+
Create configs and perform basic setups.
|
| 96 |
+
"""
|
| 97 |
+
# Create the cfg with OmegaConf
|
| 98 |
+
cfg = get_cfg_from_args(args, strict=strict_cfg)
|
| 99 |
+
# setup distributed, logging, and random seeds
|
| 100 |
+
logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
|
| 101 |
+
# dump config before modifying so it can be reloaded
|
| 102 |
+
if args.output_dir is not None:
|
| 103 |
+
write_config(cfg, args.output_dir)
|
| 104 |
+
# modify the config inplace by applying scaling rules
|
| 105 |
+
apply_scaling_rules_to_cfg(cfg)
|
| 106 |
+
return cfg
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _enumerate_all_subgroup_ranks(all_subgroup_rank_spans: Sequence[Tuple[int, int]]):
|
| 110 |
+
"""Expands a specification of process subgroups from spans to enumerated ranks.
|
| 111 |
+
|
| 112 |
+
Args:
|
| 113 |
+
all_group_rank_spans: a sequence of rank spans (first rank, last rank),
|
| 114 |
+
one for each process group. Example: ((0, 1), (2, 3), (4, 7)).
|
| 115 |
+
"""
|
| 116 |
+
for first, last in all_subgroup_rank_spans:
|
| 117 |
+
assert first <= last
|
| 118 |
+
return tuple(tuple(range(first, last + 1)) for first, last in all_subgroup_rank_spans)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def setup_multidistillation(args: DinoV3SetupArgs):
|
| 122 |
+
base_output_dir = args.output_dir
|
| 123 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 124 |
+
# get config file for this rank
|
| 125 |
+
base_cfg = OmegaConf.load(args.config_file)
|
| 126 |
+
assert base_cfg.multidistillation.enabled
|
| 127 |
+
|
| 128 |
+
global_batch_size = base_cfg.multidistillation.global_batch_size
|
| 129 |
+
|
| 130 |
+
distributed.enable(overwrite=True)
|
| 131 |
+
seed = getattr(args, "seed", 0)
|
| 132 |
+
rank = distributed.get_rank()
|
| 133 |
+
|
| 134 |
+
# build process subgroups
|
| 135 |
+
all_subgroup_rank_spans = tuple(
|
| 136 |
+
(student.ranks_range[0], student.ranks_range[1] - 1) for student in base_cfg.multidistillation.students
|
| 137 |
+
)
|
| 138 |
+
all_subgroup_ranks = _enumerate_all_subgroup_ranks(all_subgroup_rank_spans)
|
| 139 |
+
distributed.new_subgroups(all_subgroup_ranks)
|
| 140 |
+
|
| 141 |
+
found = False
|
| 142 |
+
for student in base_cfg.multidistillation.students:
|
| 143 |
+
if rank in range(*student.ranks_range):
|
| 144 |
+
found = True
|
| 145 |
+
break
|
| 146 |
+
assert found, "rank of worker not in defined range"
|
| 147 |
+
|
| 148 |
+
name = student.name
|
| 149 |
+
config_path = student.config_path
|
| 150 |
+
n_gpus = student.ranks_range[1] - student.ranks_range[0]
|
| 151 |
+
assert global_batch_size % n_gpus == 0
|
| 152 |
+
total_n_gpus = distributed.get_world_size()
|
| 153 |
+
|
| 154 |
+
args.output_dir = os.path.join(base_output_dir, name)
|
| 155 |
+
args.opts += [f"train.output_dir={args.output_dir}"]
|
| 156 |
+
args.opts += [f"train.batch_size_per_gpu={global_batch_size // total_n_gpus}"]
|
| 157 |
+
args.config_file = os.path.abspath(config_path)
|
| 158 |
+
default_cfg = get_default_config()
|
| 159 |
+
cfg = OmegaConf.load(args.config_file)
|
| 160 |
+
cfg = OmegaConf.merge(default_cfg, cfg, base_cfg, OmegaConf.from_cli(args.opts))
|
| 161 |
+
|
| 162 |
+
global logger
|
| 163 |
+
setup_logging(output=args.output_dir, level=logging.INFO)
|
| 164 |
+
|
| 165 |
+
fix_random_seeds(seed + rank)
|
| 166 |
+
|
| 167 |
+
write_config(cfg, args.output_dir)
|
| 168 |
+
apply_scaling_rules_to_cfg(cfg)
|
| 169 |
+
|
| 170 |
+
return cfg
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def setup_job(
|
| 174 |
+
output_dir: Optional[str] = None,
|
| 175 |
+
distributed_enabled: bool = True,
|
| 176 |
+
logging_enabled: bool = True,
|
| 177 |
+
seed: Optional[int] = 0,
|
| 178 |
+
restrict_print_to_main_process: bool = True,
|
| 179 |
+
distributed_timeout: timedelta | None = None,
|
| 180 |
+
):
|
| 181 |
+
"""
|
| 182 |
+
Setup methods that should be done in every fairvit job
|
| 183 |
+
Initializes logging, distributed, random seeds and other utilities.
|
| 184 |
+
"""
|
| 185 |
+
if output_dir is not None:
|
| 186 |
+
output_dir = os.path.realpath(output_dir)
|
| 187 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 188 |
+
|
| 189 |
+
if logging_enabled:
|
| 190 |
+
setup_logging(
|
| 191 |
+
output=output_dir,
|
| 192 |
+
level=logging.INFO,
|
| 193 |
+
log_to_stdout_only_in_main_process=restrict_print_to_main_process,
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
if distributed_enabled:
|
| 197 |
+
distributed.enable(
|
| 198 |
+
overwrite=True,
|
| 199 |
+
nccl_async_error_handling=True,
|
| 200 |
+
restrict_print_to_main_process=restrict_print_to_main_process,
|
| 201 |
+
timeout=distributed_timeout,
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
if seed is not None:
|
| 205 |
+
rank = distributed.get_rank()
|
| 206 |
+
fix_random_seeds(seed + rank)
|
| 207 |
+
|
| 208 |
+
logger = logging.getLogger("dinov3")
|
| 209 |
+
logger.info("git:\n {}\n".format(get_sha()))
|
| 210 |
+
|
| 211 |
+
# Log some python info
|
| 212 |
+
conda_env_name, conda_env_path = get_conda_env()
|
| 213 |
+
logger.info(f"conda env name: {conda_env_name}")
|
| 214 |
+
logger.info(f"conda env path: {conda_env_path}")
|
| 215 |
+
logger.info(f"python path: {sys.path}")
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def exit_job(distributed_enabled: bool = True, logging_enabled: bool = True):
|
| 219 |
+
if distributed_enabled:
|
| 220 |
+
distributed.disable()
|
| 221 |
+
if logging_enabled:
|
| 222 |
+
cleanup_logging()
|
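A minimal sketch of the learning-rate scaling that apply_scaling_rules_to_cfg applies for the sqrt_wrt_1024 rule, written as standalone Python so the arithmetic is visible; the world size is passed in explicitly instead of being read from dinov3.distributed, and the example numbers are illustrative only:

import math

def sqrt_wrt_1024_lr(base_lr: float, batch_size_per_gpu: int, world_size: int) -> float:
    # Same formula as the sqrt_wrt_1024 branch above: scale by 4 * sqrt(global_batch / 1024).
    return base_lr * 4 * math.sqrt(batch_size_per_gpu * world_size / 1024.0)

# Example: the default optim.lr of 0.001 with 64 images per GPU on 16 GPUs
# (global batch 1024) becomes 4 * 0.001 = 0.004. Configs that define a
# top-level "schedules" section skip this rule entirely, per the early return above.
print(sqrt_wrt_1024_lr(0.001, 64, 16))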
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/ssl_default_config.yaml
ADDED
@@ -0,0 +1,205 @@
MODEL:
  META_ARCHITECTURE: SSLMetaArch
  DEVICE: cuda
  WEIGHTS: ''
  DTYPE: float32
compute_precision:
  param_dtype: bf16
  reduce_dtype: fp32
  sharding_strategy: SHARD_GRAD_OP
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true # Whether to ignore A-A and B-B global pairs, default as in DINOv2, ignored by SSLMetaArch
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 2048
  koleo_loss_weight: 0.1
  koleo_loss_distributed: false
  koleo_topk: 1
  koleo_distributed_replicas: 0
  koleo_distributed_loss_group_size: null # Size of the nearest neighbor set for distributed Koleo. If None, uses global batch size.
  koleo_distributed_loss_group_data: true # group data from adjacent ranks to make sure koleo is applied on the same data distribution
  force_weight_norm: false
  reweight_dino_local_loss: false # If true, reweighting of DINO loss
  local_loss_weight_schedule: # Schedule for local loss weight, enabled if reweight_dino_local_loss is true
    start: 0.5
    peak: 0.5
    end: 0.5
    warmup_epochs: 0
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  mask_ratio_min_max:
  - 0.1
  - 0.5
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: False
  separate_head: true
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 2048
gram:
  use_loss: false # (bool) if true gram is used, else not
  compute_stats: false # (bool): if true compute auxilliary stats
  loss_weight: 1.0 # (float): weight of the loss
  ema_teacher: false # (bool): using the EMA teacher as GRAM teacher
  ckpt: null #(str): Checkpoint to the teacher
  it_load_ema_teacher: -1 # (int): iteration at which the ema teacher is loaded into the gram teacher
  rep_update: true # (bool): if true GRAM teacher updated every gram.update_frequency after iter gram.it_first_update steps
  update_frequency: 50000 # (int): update frequency
  it_first_update: 0 # (int): iteration of the first update
  max_updates: null # (int): maximum number of updates to gram teacher. If None, it is unlimited
  normalized: true # (bool): normalization of the features
  img_level: false # (bool): if true GRAM computation at the image else, otherwise at the local batch level
  remove_neg: false # (bool): if true remove the negative similarities before applying the loss
  remove_only_teacher_neg: false # (bool): remove negative similarities of the teacher
  tokens_used: all # (str): In [all, masked, unmasked]
  global_teacher_resize_method: bicubic # Method for resizing the outputs of the gram teacher
  global_teacher_resize_antialias: false # Whether to use antialiasing when resizing the outputs of the gram teacher
  loss_weight_schedule: null # (dict): If not None, use a schedule for the loss weight instead of `loss_weight`
train:
  batch_size_per_gpu: 64
  dataset_path: ImageNet:split=TRAIN
  data_config: null
  output_dir: .
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  OFFICIAL_EPOCH_LENGTH: 1250
  monitor_gradient_norm: false
  chunk_schedule: []
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: "sinkhorn_knopp" # or "sinkhorn_knopp"
  checkpointing: false
  checkpointing_full: false # aggressive checkpointing
  compile: true
  cudagraphs: false
  sharded_eval_checkpoint: false
  cache_dataset: false
student:
  arch: vit_large
  patch_size: 16
  drop_path_rate: 0.3
  layerscale: 1.0e-05
  pretrained_weights: ''
  ffn_layer: "mlp"
  ffn_ratio: 4.0
  resume_from_teacher_chkpt: ""
  qkv_bias: true
  proj_bias: true
  ffn_bias: true
  norm_layer: "layernorm"
  n_storage_tokens: 0
  mask_k_bias: false
  untie_cls_and_patch_norms: false # If true, use separate norms for CLS/reg and patch/mask tokens
  untie_global_and_local_cls_norm: false # If true, use separate norms for local and global crop CLS token during training
  in_chans: 3
  pos_embed_type: rope
  pos_embed_rope_base: 100.0
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate # min, max, separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: null
  pos_embed_rope_dtype: bf16
  fp8_enabled: False # Convert Linear layers to operate in fp8 precision
  fp8_filter: "blocks" # Regex that must appear in module path; empty means everything
teacher:
  momentum_teacher: 0.992
  final_momentum_teacher: 1
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  warmup_teacher_temp_epochs: 30
  in_chans: 3
distillation: # teacher
  enabled: false
  full_cfg_path: ""
  checkpoint_path: ""
multidistillation:
  enabled: false
hrft: # non-hrft'd student
  enabled: false
  checkpoint_path: "" # teacher_checkpoint path
optim:
  epochs: 100
  optimizer: adamw
  weight_decay: 0.04
  weight_decay_end: 0.4
  lr: 0.001
  warmup_epochs: 10
  min_lr: 1.0e-06
  schedule_trunc_extra: 0.0 # Compute the schedule for (1 + schedule_trunc_extra) steps and truncate, .25 is a good choice
  clip_grad: 3.0
  freeze_last_layer_epochs: 1
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.9
  multi_tensor_optim: true
  dump_fsdp_weights_path: ""
  adamw_beta1: 0.9
  adamw_beta2: 0.999
crops:
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  local_crops_scale:
  - 0.05
  - 0.32
  global_crops_size: 224
  local_crops_size: 96
  global_local_crop_pairs_ratios: 1.0
  gram_teacher_crops_size: null # If not None, return crops for gram teacher
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: true
  gram_teacher_no_distortions: false # If True, no distortions are applied to gram teacher crops
  rgb_mean:
  - 0.485
  - 0.456
  - 0.406
  rgb_std:
  - 0.229
  - 0.224
  - 0.225
evaluation:
  eval_period_iterations: 12500
  low_freq_every: 5
  config_files: # Must be in fairvit/eval/configs
    high_freq: benchmark_high_frequency.yaml # More often
    low_freq: benchmark_low_frequency.yaml # Less often
checkpointing:
  period: 3750
  max_to_keep: 3
  keep_every: 99999999999999999 # Save a checkpoint every N iterations, regardless of max_to_keep and period

# Example of constant schedules with schedules v2
# # schedules:
# #   lr:
# #     start: 0.0
# #     peak: 1e-3
# #     end: 1e-6
# #     warmup_epochs: 10
# #     freeze_last_layer_epochs: 1
# #   weight_decay:
# #     start: 0.04
# #     peak: 0.04
# #     end: 0.04
# #     warmup_epochs: 0
# #   momentum:
# #     start: 0.992
# #     peak: 0.992
# #     end: 0.992
# #     warmup_epochs: 0
# #   teacher_temp:
# #     start: 0.04
# #     peak: 0.07
# #     end: 0.07
# #     warmup_epochs: 30
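As a rough illustration (not part of this commit), this is how the defaults above get combined with a train config and dot-list overrides, mirroring get_cfg_from_args in config.py; the file path and override values here are placeholders:

from omegaconf import OmegaConf

default_cfg = OmegaConf.load("ssl_default_config.yaml")  # the file above
# Tiny stand-in for a train config; real runs load one of the YAMLs under configs/train/.
train_cfg = OmegaConf.create({"student": {"arch": "vit_7b"}, "optim": {"epochs": 10}})
overrides = OmegaConf.from_dotlist(["train.output_dir=/tmp/run"])

# Struct mode rejects keys that do not exist in the defaults, matching the
# strict=True path of get_cfg_from_args.
OmegaConf.set_struct(default_cfg, True)
cfg = OmegaConf.merge(default_cfg, train_cfg, overrides)
print(cfg.student.arch, cfg.optim.epochs, cfg.train.output_dir)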
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_gram_anchor.yaml
ADDED
@@ -0,0 +1,203 @@
MODEL:
  META_ARCHITECTURE: SSLMetaArch
  DEVICE: cuda
  WEIGHTS: ''
  DTYPE: float32
compute_precision:
  param_dtype: bf16
  reduce_dtype: fp32
  sharding_strategy: SHARD_GRAD_OP
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true
  head_n_prototypes: 262144
  head_bottleneck_dim: 512
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 8192
  koleo_loss_weight: 0.1
  koleo_loss_distributed: false
  koleo_topk: 1
  koleo_distributed_replicas: 0
  koleo_distributed_loss_group_size: null
  koleo_distributed_loss_group_data: true
  force_weight_norm: false
  reweight_dino_local_loss: true
  local_loss_weight_schedule:
    start: 1
    peak: 1
    end: 0.5
    warmup_epochs: 1000
    cosine_epochs: 1
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  mask_ratio_min_max:
  - 0.1
  - 0.5
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: false
  separate_head: true
  head_n_prototypes: 98304
  head_bottleneck_dim: 384
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 4096
gram:
  use_loss: true
  compute_stats: false
  loss_weight: 1.0
  ema_teacher: false
  ckpt: ignore
  it_load_ema_teacher: -1
  rep_update: true
  update_frequency: 10000
  it_first_update: 1010000
  max_updates: 3
  normalized: true
  img_level: true
  remove_neg: false
  remove_only_teacher_neg: false
  tokens_used: all
  global_teacher_resize_method: bicubic
  global_teacher_resize_antialias: false
  loss_weight_schedule:
    start: 0
    peak: 0
    end: 2.0
    warmup_epochs: 1000
    cosine_epochs: 1
train:
  batch_size_per_gpu: 16
  dataset_path: null
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  OFFICIAL_EPOCH_LENGTH: 1000
  monitor_gradient_norm: false
  chunk_schedule: []
  cache_dataset: true
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: sinkhorn_knopp
  checkpointing: true
  checkpointing_full: true
  compile: true
  cudagraphs: false
  cell_augmentation: false
  cell_augmentation_type: hpa
  sharded_eval_checkpoint: true
student:
  arch: vit_7b
  patch_size: 16
  drop_path_rate: 0.4
  layerscale: 1.0e-05
  patch_drop: 0.0
  pretrained_weights: ''
  ffn_layer: swiglu64
  ffn_ratio: 3
  resume_from_teacher_chkpt: ''
  qkv_bias: false
  proj_bias: true
  ffn_bias: true
  norm_layer: layernormbf16
  n_storage_tokens: 4
  untie_cls_and_patch_norms: false
  untie_global_and_local_cls_norm: true
  mask_k_bias: true
  in_chans: 3
  pos_embed_type: rope
  pos_embed_rope_base: 100
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: 2
  pos_embed_rope_dtype: fp32
  fp8_enabled: true
  fp8_filter: blocks
teacher:
  momentum_teacher: null
  final_momentum_teacher: null
  warmup_teacher_temp: null
  teacher_temp: null
  warmup_teacher_temp_epochs: null
  in_chans: 3
distillation:
  enabled: false
  full_cfg_path: ''
  checkpoint_path: ''
multidistillation:
  enabled: false
hrft:
  enabled: false
  checkpoint_path: ''
optim:
  epochs: 1200
  optimizer: adamw
  weight_decay: null
  weight_decay_end: null
  lr: null
  warmup_epochs: null
  min_lr: null
  schedule_trunc_extra: null
  clip_grad: 30.0
  freeze_last_layer_epochs: null
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.98
  multi_tensor_optim: true
  dump_fsdp_weights_path: ''
  adamw_beta1: 0.9
  adamw_beta2: 0.99
crops:
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  local_crops_scale:
  - 0.05
  - 0.32
  global_crops_size: 256
  local_crops_size: 112
  gram_teacher_crops_size: 512
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: false
  gram_teacher_no_distortions: true
  rgb_mean:
  - 0.485
  - 0.456
  - 0.406
  rgb_std:
  - 0.229
  - 0.224
  - 0.225
checkpointing:
  period: 1000
  max_to_keep: 3
  keep_every: 50000
schedules:
  lr:
    start: 0
    peak: 3.0e-05
    end: 3.0e-05
    warmup_epochs: 100
    freeze_last_layer_epochs: 5
  weight_decay:
    start: 0.04
    peak: 0.04
    end: 0.04
    warmup_epochs: 0
  teacher_temp:
    start: 0.04
    peak: 0.07
    end: 0.07
    warmup_epochs: 100
  momentum:
    start: 0.999
    peak: 0.999
    end: 0.999
    warmup_epochs: 0
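On a plain reading of the gram block above (rep_update: true, it_first_update: 1010000, update_frequency: 10000, max_updates: 3), the gram teacher is refreshed three times late in training; the exact trigger logic lives in the training loop, which is not part of this diff, so the helper below is only an assumed interpretation of those fields:

def gram_update_iterations(it_first_update: int, update_frequency: int, max_updates: int) -> list[int]:
    # First refresh at it_first_update, then one every update_frequency iterations,
    # capped at max_updates refreshes in total (assumed semantics, not taken from this diff).
    return [it_first_update + k * update_frequency for k in range(max_updates)]

print(gram_update_iterations(1_010_000, 10_000, 3))  # [1010000, 1020000, 1030000]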
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_high_res_adapt.yaml
ADDED
@@ -0,0 +1,224 @@
MODEL:
  META_ARCHITECTURE: SSLMetaArch
  DEVICE: cuda
  WEIGHTS: ''
  DTYPE: float32
compute_precision:
  param_dtype: bf16
  reduce_dtype: fp32
  sharding_strategy: SHARD_GRAD_OP
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true
  head_n_prototypes: 262144
  head_bottleneck_dim: 512
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 8192
  koleo_loss_weight: 0.1
  koleo_loss_distributed: true
  koleo_topk: 1
  koleo_distributed_replicas: 0
  koleo_distributed_loss_group_size: 16
  force_weight_norm: false
  reweight_dino_local_loss: true
  local_loss_weight_schedule:
    start: 0.5
    peak: 0.5
    end: 0.5
    warmup_epochs: 0
    cosine_epochs: 0
  koleo_distributed_loss_group_data: true
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  mask_ratio_min_max:
  - 0.1
  - 0.5
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: false
  separate_head: true
  head_n_prototypes: 98304
  head_bottleneck_dim: 384
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 4096
gram:
  use_loss: true
  compute_stats: false
  loss_weight: 1.0
  ema_teacher: false
  it_load_ema_teacher: -1
  rep_update: false
  update_frequency: 10000
  it_first_update: 1010000
  max_updates: 3
  normalized: true
  img_level: true
  remove_neg: false
  remove_only_teacher_neg: false
  tokens_used: all
  global_teacher_resize_method: bicubic
  global_teacher_resize_antialias: false
  loss_weight_schedule:
    start: 1.5
    peak: 1.5
    end: 1.5
    warmup_epochs: 0
    cosine_epochs: 0
train:
  batch_size_per_gpu: 8
  dataset_path: null
  saveckp_freq: 20
  seed: 0
  num_workers: 2
  OFFICIAL_EPOCH_LENGTH: 1000
  monitor_gradient_norm: false
  chunk_schedule: []
  cache_dataset: true
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: sinkhorn_knopp
  checkpointing: true
  checkpointing_full: true
  compile: true
  cudagraphs: false
  cell_augmentation: false
  cell_augmentation_type: hpa
  sharded_eval_checkpoint: true
student:
  arch: vit_7b
  patch_size: 16
  drop_path_rate: 0.4
  layerscale: 1.0e-05
  patch_drop: 0.0
  pretrained_weights: ''
  ffn_layer: swiglu64
  ffn_ratio: 3
  resume_from_teacher_chkpt: ''
  qkv_bias: false
  proj_bias: true
  ffn_bias: true
  norm_layer: layernormbf16
  n_storage_tokens: 4
  untie_cls_and_patch_norms: false
  untie_global_and_local_cls_norm: true
  mask_k_bias: true
  in_chans: 3
  pos_embed_type: rope
  pos_embed_rope_base: 100
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: 2
  pos_embed_rope_dtype: fp32
  fp8_enabled: true
  fp8_filter: blocks
teacher:
  momentum_teacher: null
  final_momentum_teacher: null
  warmup_teacher_temp: null
  teacher_temp: null
  warmup_teacher_temp_epochs: null
  in_chans: 3
distillation:
  enabled: false
  full_cfg_path: ''
  checkpoint_path: ''
multidistillation:
  enabled: false
hrft:
  enabled: false
  checkpoint_path: ''
optim:
  epochs: 30
  optimizer: adamw
  weight_decay: null
  weight_decay_end: null
  lr: null
  warmup_epochs: null
  min_lr: null
  schedule_trunc_extra: null
  clip_grad: 30.0
  freeze_last_layer_epochs: null
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.98
  multi_tensor_optim: true
  dump_fsdp_weights_path: ''
  adamw_beta1: 0.9
  adamw_beta2: 0.99
crops:
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  local_crops_scale:
  - 0.05
  - 0.32
  global_crops_size:
  - 512
  - 768
  - 768
  - 768
  - 768
  local_crops_size:
  - 112
  - 112
  - 168
  - 224
  - 336
  global_local_crop_pairs_ratios:
  - 0.3
  - 0.3
  - 0.3
  - 0.05
  - 0.05
  gram_teacher_crops_size:
  - 768
  - 1152
  - 1152
  - 1152
  - 1152
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: false
  gram_teacher_no_distortions: true
  rgb_mean:
  - 0.485
  - 0.456
  - 0.406
  rgb_std:
  - 0.229
  - 0.224
  - 0.225
checkpointing:
  period: 250
  max_to_keep: 3
  keep_every: 50000
schedules:
  lr:
    start: 0
    peak: 0
    end: 1.25e-05
    warmup_epochs: 0
    freeze_last_layer_epochs: 0
    cosine_epochs: 10
  weight_decay:
    start: 0.04
    peak: 0.04
    end: 0.04
    warmup_epochs: 0
  teacher_temp:
    start: 0.07
    peak: 0.07
    end: 0.07
    warmup_epochs: 0
  momentum:
    start: 0.999
    peak: 0.999
    end: 0.999
    warmup_epochs: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_pretrain.yaml
ADDED
@@ -0,0 +1,172 @@
MODEL:
  META_ARCHITECTURE: SSLMetaArch
  DEVICE: cuda
  WEIGHTS: ''
  DTYPE: float32
compute_precision:
  param_dtype: bf16
  reduce_dtype: fp32
  sharding_strategy: SHARD_GRAD_OP
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true
  head_n_prototypes: 262144
  head_bottleneck_dim: 512
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 8192
  koleo_loss_weight: 0.1
  koleo_loss_distributed: false
  koleo_topk: 1
  koleo_distributed_replicas: 0
  koleo_distributed_loss_group_size: null
  force_weight_norm: false
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  mask_ratio_min_max:
  - 0.1
  - 0.5
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: false
  separate_head: true
  head_n_prototypes: 98304
  head_bottleneck_dim: 384
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 4096
gram:
  use_loss: false
  compute_stats: false
train:
  batch_size_per_gpu: 16
  dataset_path: null
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  OFFICIAL_EPOCH_LENGTH: 1000
  monitor_gradient_norm: false
  chunk_schedule: []
  cache_dataset: true
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: sinkhorn_knopp
  checkpointing: true
  checkpointing_full: false
  compile: true
  cudagraphs: false
  cell_augmentation: false
  cell_augmentation_type: hpa
  sharded_eval_checkpoint: true
student:
  arch: vit_7b
  patch_size: 16
  drop_path_rate: 0.4
  layerscale: 1.0e-05
  patch_drop: 0.0
  pretrained_weights: ''
  ffn_layer: swiglu64
  ffn_ratio: 3
  resume_from_teacher_chkpt: ''
  qkv_bias: false
  proj_bias: true
  ffn_bias: true
  norm_layer: layernormbf16
  n_storage_tokens: 4
  untie_cls_and_patch_norms: false
  untie_global_and_local_cls_norm: true
  mask_k_bias: true
  in_chans: 3
  pos_embed_type: rope
  pos_embed_rope_base: 100
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: 2
  pos_embed_rope_dtype: fp32
  fp8_enabled: true
  fp8_filter: blocks
teacher:
  momentum_teacher: null
  final_momentum_teacher: null
  warmup_teacher_temp: null
  teacher_temp: null
  warmup_teacher_temp_epochs: null
  in_chans: 3
distillation:
  enabled: false
  full_cfg_path: ''
  checkpoint_path: ''
multidistillation:
  enabled: false
hrft:
  enabled: false
  checkpoint_path: ''
optim:
  epochs: 1000
  optimizer: adamw
  weight_decay: null
  weight_decay_end: null
  lr: null
  warmup_epochs: null
  min_lr: null
  schedule_trunc_extra: null
  clip_grad: 30.0
  freeze_last_layer_epochs: null
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.98
  multi_tensor_optim: true
  dump_fsdp_weights_path: ''
  adamw_beta1: 0.9
  adamw_beta2: 0.99
crops:
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  local_crops_scale:
  - 0.05
  - 0.32
  global_crops_size: 256
  local_crops_size: 112
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: false
  rgb_mean:
  - 0.485
  - 0.456
  - 0.406
  rgb_std:
  - 0.229
  - 0.224
  - 0.225
checkpointing:
  period: 1000
  max_to_keep: 3
  keep_every: 50000
schedules:
  lr:
    start: 0
    peak: 5.0e-05
    end: 5.0e-05
    warmup_epochs: 100
    freeze_last_layer_epochs: 5
  weight_decay:
    start: 0.04
    peak: 0.04
    end: 0.04
    warmup_epochs: 0
  teacher_temp:
    start: 0.04
    peak: 0.07
    end: 0.07
    warmup_epochs: 100
  momentum:
    start: 0.994
    peak: 0.994
    end: 0.994
    warmup_epochs: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vitl16_lvd1689m_distilled.yaml
ADDED
@@ -0,0 +1,251 @@
MODEL:
  META_ARCHITECTURE: MultiDistillationMetaArch
  DEVICE: cuda
  WEIGHTS: ''
  DTYPE: float32
compute_precision:
  param_dtype: bf16
  reduce_dtype: fp32
  sharding_strategy: SHARD_GRAD_OP
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true
  head_n_prototypes: 262144
  head_bottleneck_dim: 512
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 8192
  koleo_loss_weight: 0.1
  koleo_loss_distributed: false
  koleo_topk: 1
  koleo_distributed_replicas: 0
  koleo_distributed_loss_group_size: null
  koleo_distributed_loss_group_data: true
  force_weight_norm: false
  reweight_dino_local_loss: false
  local_loss_weight_schedule:
    start: 0.5
    peak: 0.5
    end: 0.5
    warmup_epochs: 0
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  mask_ratio_min_max:
  - 0.1
  - 0.5
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: false
  separate_head: true
  head_n_prototypes: 98304
  head_bottleneck_dim: 384
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 4096
coding_rate_loss:
  use_cls_loss: false
  cls_loss_weight: 0.2
  use_masked_patches_loss: false
  masked_patches_loss_weight: 0.1
  epsilon: 8
gram:
  use_loss: false
  compute_stats: false
  loss_weight: 1.0
  ema_teacher: false
  ckpt: null
  it_load_ema_teacher: -1
  rep_update: true
  update_frequency: 50000
  it_first_update: 0
  max_updates: null
  normalized: true
  img_level: false
  remove_neg: false
  remove_only_teacher_neg: false
  tokens_used: all
  global_teacher_resize_method: bicubic
  global_teacher_resize_antialias: false
  loss_weight_schedule: null
train:
  batch_size_per_gpu: 3
  dataset_path: <TRAIN/DATASET>
  output_dir: <OUTPUT/DIR>
  saveckp_freq: 20
  seed: 0
  num_workers: 2
  OFFICIAL_EPOCH_LENGTH: 1250
  monitor_gradient_norm: false
  chunk_schedule: []
  cache_dataset: true
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: sinkhorn_knopp
  checkpointing: true
  checkpointing_full: true
  compile: true
  cudagraphs: false
  cell_augmentation: false
  cell_augmentation_type: hpa
  sharded_eval_checkpoint: false
student:
  arch: vit_large
  patch_size: 16
  drop_path_rate: 0.0
  layerscale: 1.0e-05
  drop_path_uniform: true
  drop_path_shape: uniform
  patch_drop: 0.0
  pretrained_weights: ''
  sin_cos_embeddings: false
  fourier_embeddings: false
  fourier_encoding_dim: 64
  multiple_pos_embeddings: false
  cls_pos_embedding: false
  reg_pos_embedding: false
  ffn_layer: mlp
  ffn_ratio: 4.0
  resume_from_teacher_chkpt: <PATH/TO/HRFT/TEACHER>
  block_chunks: 0
  qkv_bias: true
  proj_bias: true
  ffn_bias: true
  norm_layer: layernormbf16
  n_storage_tokens: 4
  mask_attention: false
  mask_register_attention: false
  untie_cls_and_patch_norms: false
  untie_global_and_local_cls_norm: false
  interpolate_offset: 0.0
  interpolate_antialias: true
  mask_k_bias: true
  init_std_cls: 0.02
  init_std_reg: 0.02
  rescale_weights_by_layer_id: false
  in_chans: 3
  pos_embed_grid_size: 48
  pos_embed_type: ropenew
  pos_embed_rope_gamma: 1.0
  pos_embed_rope_init_multi_frequencies: false
  pos_embed_rope_base: 100
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: 2
  pos_embed_rope_dtype: bf16
  sparse24_ranges: []
  sparse24_filter:
  - mlp
  sparse24_default: false
  fp8_enabled: false
  fp8_filter: blocks
teacher:
  momentum_teacher: 0.994
  final_momentum_teacher: 1
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  warmup_teacher_temp_epochs: 120
  in_chans: 3
distillation:
  enabled: true
  full_cfg_path: <PATH/TO/TEACHER/CONFIG/config.yaml>
  checkpoint_path: <PATH/TO/TEACHER/checkpoint.pth>
multidistillation:
  enabled: true
  global_batch_size: 1920
  students:
  - name: vits_mlp4_4
    config_path: <PATH/TO/STUDENT/CONFIG/vits_mlp4_4.yaml>
    ranks_range:
    - 0
    - 48
  - name: vitsp_swiglu6_1
    config_path: <PATH/TO/STUDENT/CONFIG/vitsp_swiglu6_1.yaml>
    ranks_range:
    - 48
    - 96
  - name: vitb_mlp4_3
    config_path: <PATH/TO/STUDENT/CONFIG/vitb_mlp4_3.yaml>
    ranks_range:
    - 96
    - 176
  - name: vitl_mlp4_1
    config_path: <PATH/TO/STUDENT/CONFIG/vitl_mlp4_1.yaml>
    ranks_range:
    - 176
    - 296
hrft:
  enabled: false
  checkpoint_path: ''
optim:
  epochs: 20
  optimizer: adamw
  weight_decay: 0.04
  weight_decay_end: 0.2
  lr: 0.0002
  warmup_epochs: 0
  min_lr: 1.0e-06
  schedule_trunc_extra: 0.0
  clip_grad: 3.0
  freeze_last_layer_epochs: 0
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.99
  multi_tensor_optim: true
  dump_fsdp_weights_path: ''
  adamw_beta1: 0.9
  adamw_beta2: 0.999
crops:
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  local_crops_scale:
  - 0.05
  - 0.32
  global_crops_size: 256
  local_crops_size: 112
  global_local_crop_pairs_ratios: 1.0
  gram_teacher_crops_size: 256
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: false
  gram_teacher_no_distortions: false
  rgb_mean:
  - 0.485
  - 0.456
  - 0.406
  rgb_std:
  - 0.229
  - 0.224
  - 0.225
checkpointing:
  period: 3750
  max_to_keep: 3
  keep_every: 99999999999999999
schedules:
  weight_decay:
    start: 0.04
    peak: 0.04
    end: 0.04
    warmup_epochs: 0
  teacher_temp:
    start: 0.04
    peak: 0.07
    end: 0.07
    warmup_epochs: 0
  lr:
    start: 0
    peak: 0
    end: 5.0e-05
    warmup_epochs: 0
    freeze_last_layer_epochs: 0
    cosine_epochs: 10
  momentum:
    start: 0.994
    peak: 0.994
    end: 1.0
    warmup_epochs: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multi_distillation_test.yaml
ADDED
@@ -0,0 +1,27 @@
MODEL:
  META_ARCHITECTURE: MultiDistillationMetaArch
multidistillation:
  enabled: true
  global_batch_size: 256
  students:
  - name: vits
    config_path: dinov3/configs/train/multidist_tests/vits_p16.yaml
    ranks_range:
    - 0
    - 4
  - name: vitb
    config_path: dinov3/configs/train/multidist_tests/vitb_p16.yaml
    ranks_range:
    - 4
    - 8
distillation: # teacher
  enabled: true
  full_cfg_path: dinov3/configs/train/vitl_im1k_lin834.yaml
  checkpoint_path: ignore
train:
  dataset_path: ImageNet:split=TRAIN
  cache_dataset: false
  centering: "sinkhorn_knopp"
  compile: true
ibot:
  separate_head: true
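The ranks_range entries above describe half-open rank spans, one per student; setup_multidistillation in config.py turns them into process subgroups and splits the global batch over the full world size. A small self-contained sketch of that bookkeeping, using the numbers from this test config:

def expand_ranks(students):
    # Mirrors _enumerate_all_subgroup_ranks (via the spans built in setup_multidistillation):
    # each half-open [first, last) ranks_range becomes an explicit tuple of ranks.
    return tuple(tuple(range(first, last)) for first, last in (s["ranks_range"] for s in students))

students = [
    {"name": "vits", "ranks_range": (0, 4)},
    {"name": "vitb", "ranks_range": (4, 8)},
]
print(expand_ranks(students))  # ((0, 1, 2, 3), (4, 5, 6, 7))

global_batch_size, world_size = 256, 8
print(global_batch_size // world_size)  # 32 -> the value written into train.batch_size_per_gpu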
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vitb_p16.yaml
ADDED
@@ -0,0 +1,7 @@
# this corresponds to the default config
train:
  dataset_path: ImageNet:split=TRAIN
  checkpointing: true
student:
  drop_path_rate: 0.1
  arch: vit_base
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vits_p16.yaml
ADDED
@@ -0,0 +1,6 @@
# this corresponds to the default config
train:
  dataset_path: ImageNet:split=TRAIN
student:
  drop_path_rate: 0.1
  arch: vit_small
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/vitl_im1k_lin834.yaml
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tested on RSC: /checkpoint/dino/qas/rope/vitl16_im1k/
|
| 2 |
+
# gives 82.2 im1k-knn, 83.3 im1k-linear
|
| 3 |
+
# runs with a total batch size of 2048 (64/gpu, 4 nodes here)
|
| 4 |
+
# runs at 0.57s/iter
|
| 5 |
+
MODEL:
|
| 6 |
+
META_ARCHITECTURE: SSLMetaArch
|
| 7 |
+
DEVICE: cuda
|
| 8 |
+
WEIGHTS: ''
|
| 9 |
+
DTYPE: float32
|
| 10 |
+
compute_precision:
|
| 11 |
+
param_dtype: bf16
|
| 12 |
+
reduce_dtype: fp32
|
| 13 |
+
sharding_strategy: SHARD_GRAD_OP
|
| 14 |
+
dino:
|
| 15 |
+
loss_weight: 1.0
|
| 16 |
+
global_ignore_diagonal: true
|
| 17 |
+
head_n_prototypes: 65536
|
| 18 |
+
head_bottleneck_dim: 256
|
| 19 |
+
head_norm_last_layer: false
|
| 20 |
+
head_nlayers: 3
|
| 21 |
+
head_hidden_dim: 2048
|
| 22 |
+
koleo_loss_weight: 0.1
|
| 23 |
+
koleo_loss_distributed: false
|
| 24 |
+
koleo_topk: 1
|
| 25 |
+
koleo_distributed_replicas: 0
|
| 26 |
+
force_weight_norm: false
|
| 27 |
+
ibot:
|
| 28 |
+
loss_weight: 1.0
|
| 29 |
+
mask_sample_probability: 0.5
|
| 30 |
+
mask_ratio_min_max:
|
| 31 |
+
- 0.1
|
| 32 |
+
- 0.5
|
| 33 |
+
mask_random_circular_shift: false
|
| 34 |
+
+  force_masking_even_with_zero_weight: false
+  separate_head: true
+  head_n_prototypes: 65536
+  head_bottleneck_dim: 256
+  head_norm_last_layer: false
+  head_nlayers: 3
+  head_hidden_dim: 2048
+train:
+  batch_size_per_gpu: 64
+  dataset_path: ImageNet:split=TRAIN
+  output_dir: /checkpoint/dino/qas/rope/vitl16_im1k
+  saveckp_freq: 20
+  seed: 0
+  num_workers: 10
+  OFFICIAL_EPOCH_LENGTH: 1250
+  monitor_gradient_norm: false
+  chunk_schedule: []
+  cache_dataset: true
+  use_teacher_head: true
+  learn_from_teacher_tokens: false
+  centering: sinkhorn_knopp
+  checkpointing: false
+  compile: true
+  cudagraphs: false
+  cell_augmentation: false
+  cell_augmentation_type: hpa
+student:
+  arch: vit_large
+  patch_size: 16
+  drop_path_rate: 0.3
+  layerscale: 1.0e-05
+  patch_drop: 0.0
+  pretrained_weights: ''
+  ffn_layer: mlp
+  ffn_ratio: 4.0
+  resume_from_teacher_chkpt: ''
+  qkv_bias: true
+  proj_bias: true
+  ffn_bias: true
+  norm_layer: layernorm
+  n_storage_tokens: 0
+  mask_k_bias: false
+  in_chans: 3
+  pos_embed_type: rope
+  pos_embed_rope_base: 100.0
+  pos_embed_rope_min_period: null
+  pos_embed_rope_max_period: null
+  pos_embed_rope_normalize_coords: separate # min, max, separate
+  pos_embed_rope_shift_coords: null
+  pos_embed_rope_jitter_coords: null
+  pos_embed_rope_rescale_coords: null
+  pos_embed_rope_dtype: bf16
+  fp8_enabled: False # Convert Linear layers to operate in fp8 precision
+  fp8_filter: "blocks" # Regex that must appear in module path; empty means everything
+teacher:
+  momentum_teacher: 0.992
+  final_momentum_teacher: 1
+  warmup_teacher_temp: 0.04
+  teacher_temp: 0.07
+  warmup_teacher_temp_epochs: 30
+  in_chans: 3
+distillation:
+  enabled: false
+  full_cfg_path: ''
+  checkpoint_path: ''
+multidistillation:
+  enabled: false
+hrft:
+  enabled: false
+  checkpoint_path: ''
+optim:
+  epochs: 100
+  optimizer: adamw
+  weight_decay: 0.04
+  weight_decay_end: 0.4
+  lr: 0.001
+  warmup_epochs: 10
+  min_lr: 1.0e-06
+  clip_grad: 3.0
+  freeze_last_layer_epochs: 1
+  scaling_rule: sqrt_wrt_1024
+  patch_embed_lr_mult: 0.2
+  dino_head_wd_multiplier: 1.0
+  layerwise_decay: 0.9
+  multi_tensor_optim: true
+  dump_fsdp_weights_path: ''
+  adamw_beta1: 0.9
+  adamw_beta2: 0.999
+crops:
+  global_crops_scale:
+  - 0.32
+  - 1.0
+  local_crops_number: 8
+  local_crops_scale:
+  - 0.05
+  - 0.32
+  global_crops_size: 224
+  local_crops_size: 96
+  localcrops_subset_of_globalcrops: false
+  share_color_jitter: false
+  horizontal_flips: true
+evaluation:
+  eval_period_iterations: 12500
+  low_freq_every: 5
+  config_files:
+    high_freq: benchmark_high_frequency.yaml
+    low_freq: benchmark_low_frequency.yaml
+checkpointing:
+  period: 3750
+  max_to_keep: 3
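For reference, a resolved config like the one above can be loaded and overridden programmatically. This is a minimal sketch using OmegaConf; the file name is hypothetical and the repo's own loader under dinov3/configs may differ.

# Minimal sketch, assuming OmegaConf is installed and the YAML above is saved as
# vitl16_im1k_pretrain.yaml (hypothetical file name).
from omegaconf import OmegaConf

cfg = OmegaConf.load("vitl16_im1k_pretrain.yaml")
cfg = OmegaConf.merge(cfg, OmegaConf.from_cli())  # allow key=value overrides from the CLI
print(cfg.student.arch, cfg.optim.lr, cfg.crops.global_crops_size)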
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/__init__.py ADDED
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+
+from .adapters import DatasetWithEnumeratedTargets
+from .augmentations import DataAugmentationDINO
+from .collate import collate_data_and_cast
+from .loaders import SamplerType, make_data_loader, make_dataset
+from .meta_loaders import CombinedDataLoader
+from .masking import MaskingGenerator
+from .transforms import make_classification_eval_transform, make_classification_train_transform
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/adapters.py ADDED
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+
+from typing import Any, Optional, Tuple
+
+from torch.utils.data import Dataset
+
+
+def extend_samples_with_index(dataset_class):
+    class DatasetWithIndex(dataset_class):
+        def __init__(self, **kwargs) -> None:
+            root = dataset_class.get_root()
+            super().__init__(root=root, **kwargs)
+
+        def __getitem__(self, index: int):
+            image, target = super().__getitem__(index)
+            return image, target, index
+
+    return DatasetWithIndex
+
+
+class DatasetWithEnumeratedTargets(Dataset):
+    """
+    If pad_dataset is set, pads based on torch's DistributedSampler implementation, which
+    with drop_last=False pads the last batch to be a multiple of the world size.
+    https://github.com/pytorch/pytorch/blob/main/torch/utils/data/distributed.py#L91
+    """
+
+    def __init__(self, dataset: Dataset, pad_dataset: bool = False, num_replicas: Optional[int] = None):
+        self._dataset = dataset
+        self._size = len(self._dataset)
+        self._padded_size = self._size
+        self._pad_dataset = pad_dataset
+        if self._pad_dataset:
+            assert num_replicas is not None, "num_replicas should be set if pad_dataset is True"
+            self._padded_size = num_replicas * ((len(dataset) + num_replicas - 1) // num_replicas)
+
+    def get_image_relpath(self, index: int) -> str:
+        assert self._pad_dataset or index < self._size
+        return self._dataset.get_image_relpath(index % self._size)
+
+    def get_image_data(self, index: int) -> bytes:
+        assert self._pad_dataset or index < self._size
+        return self._dataset.get_image_data(index % self._size)
+
+    def get_target(self, index: int) -> Tuple[Any, int]:
+        target = self._dataset.get_target(index % self._size)
+        if index >= self._size:
+            assert self._pad_dataset
+            return (-1, target)
+        return (index, target)
+
+    def get_sample_decoder(self, index: int) -> Any:
+        assert self._pad_dataset or index < self._size
+        return self._dataset.get_sample_decoder(index % self._size)
+
+    def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]:
+        image, target = self._dataset[index % self._size]
+        if index >= self._size:
+            assert self._pad_dataset
+            return image, (-1, target)
+        target = index if target is None else target
+        return image, (index, target)
+
+    def __len__(self) -> int:
+        return self._padded_size
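A quick sanity check of the padding behaviour described in the DatasetWithEnumeratedTargets docstring; the toy dataset below is a hypothetical stand-in for a real image dataset, and the import path assumes the re-export in the __init__.py above.

# Minimal sketch, assuming dinov3 is importable; ToyDataset is hypothetical.
from torch.utils.data import Dataset
from dinov3.data import DatasetWithEnumeratedTargets

class ToyDataset(Dataset):  # stand-in for a real image dataset
    def __len__(self):
        return 10
    def __getitem__(self, idx):
        return f"image_{idx}", idx % 3  # (image, target)

wrapped = DatasetWithEnumeratedTargets(ToyDataset(), pad_dataset=True, num_replicas=4)
print(len(wrapped))  # 12: padded up to a multiple of num_replicas, mirroring DistributedSampler
print(wrapped[11])   # ('image_1', (-1, 1)): padded samples are marked with index -1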
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/augmentations.py ADDED
@@ -0,0 +1,227 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+
+import logging
+
+import numpy as np
+import torch
+from torch import nn
+from torchvision.transforms import v2
+
+from dinov3.data.transforms import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, GaussianBlur, make_normalize_transform
+
+logger = logging.getLogger("dinov3")
+
+
+class DataAugmentationDINO(object):
+    def __init__(
+        self,
+        global_crops_scale,
+        local_crops_scale,
+        local_crops_number,
+        global_crops_size=224,
+        local_crops_size=96,
+        gram_teacher_crops_size=None,
+        gram_teacher_no_distortions=False,
+        teacher_no_color_jitter=False,
+        local_crops_subset_of_global_crops=False,
+        patch_size=16,
+        share_color_jitter=False,
+        horizontal_flips=True,
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+    ):
+        self.global_crops_scale = global_crops_scale
+        self.local_crops_scale = local_crops_scale
+        self.local_crops_number = local_crops_number
+        self.global_crops_size = global_crops_size
+        self.local_crops_size = local_crops_size
+        self.gram_teacher_crops_size = gram_teacher_crops_size
+        self.gram_teacher_no_distortions = gram_teacher_no_distortions
+        self.teacher_no_color_jitter = teacher_no_color_jitter
+        self.local_crops_subset_of_global_crops = local_crops_subset_of_global_crops
+        self.patch_size = patch_size
+        self.share_color_jitter = share_color_jitter
+        self.mean = mean
+        self.std = std
+
+        logger.info("###################################")
+        logger.info("Using data augmentation parameters:")
+        logger.info(f"global_crops_scale: {global_crops_scale}")
+        logger.info(f"local_crops_scale: {local_crops_scale}")
+        logger.info(f"local_crops_number: {local_crops_number}")
+        logger.info(f"global_crops_size: {global_crops_size}")
+        logger.info(f"local_crops_size: {local_crops_size}")
+        logger.info(f"gram_crops_size: {gram_teacher_crops_size}")
+        logger.info(f"gram_teacher_no_distortions: {gram_teacher_no_distortions}")
+        logger.info(f"teacher_no_color_jitter: {teacher_no_color_jitter}")
+        logger.info(f"local_crops_subset_of_global_crops: {local_crops_subset_of_global_crops}")
+        logger.info(f"patch_size if local_crops_subset_of_global_crops: {patch_size}")
+        logger.info(f"share_color_jitter: {share_color_jitter}")
+        logger.info(f"horizontal flips: {horizontal_flips}")
+        logger.info("###################################")
+
+        # Global crops and gram teacher crops can have different sizes. We first take a crop of the maximum size
+        # and then resize it to the desired size for global and gram teacher crops.
+        global_crop_max_size = max(global_crops_size, gram_teacher_crops_size if gram_teacher_crops_size else 0)
+
+        # random resized crop and flip
+        self.geometric_augmentation_global = v2.Compose(
+            [
+                v2.RandomResizedCrop(
+                    global_crop_max_size,
+                    scale=global_crops_scale,
+                    interpolation=v2.InterpolationMode.BICUBIC,
+                ),
+                v2.RandomHorizontalFlip(p=0.5 if horizontal_flips else 0.0),
+            ]
+        )
+
+        resize_global = nn.Identity()  # Resize transform applied to global crops after random crop
+        self.resize_global_post_transf = (
+            nn.Identity()
+        )  # Resize transform applied to global crops after all other transforms
+        self.resize_gram_teacher = None  # Resize transform applied to crops for gram teacher
+        if gram_teacher_crops_size is not None:
+            # All resize transforms will do nothing if the crop size is already the desired size.
+            if gram_teacher_no_distortions:
+                # When there are no distortions for the gram teacher crop, we can resize before the distortions.
+                # This is the preferred order, because it keeps the image size for the augmentations consistent,
+                # which matters e.g. for GaussianBlur.
+                resize_global = v2.Resize(
+                    global_crops_size,
+                    interpolation=v2.InterpolationMode.BICUBIC,
+                )
+            else:
+                # When there are distortions for the gram teacher crop, we need to resize after the distortions,
+                # because the distortions are shared between global and gram teacher crops.
+                self.resize_global_post_transf = v2.Resize(
+                    global_crops_size,
+                    interpolation=v2.InterpolationMode.BICUBIC,
+                )
+
+            self.resize_gram_teacher = v2.Resize(
+                gram_teacher_crops_size,
+                interpolation=v2.InterpolationMode.BICUBIC,
+            )
+
+        self.geometric_augmentation_local = v2.Compose(
+            [
+                v2.RandomResizedCrop(
+                    local_crops_size,
+                    scale=local_crops_scale,
+                    interpolation=v2.InterpolationMode.BICUBIC,
+                ),
+                v2.RandomHorizontalFlip(p=0.5 if horizontal_flips else 0.0),
+            ]
+        )
+
+        # color distortions / blurring
+        color_jittering = v2.Compose(
+            [
+                v2.RandomApply(
+                    [v2.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
+                    p=0.8,
+                ),
+                v2.RandomGrayscale(p=0.2),
+            ]
+        )
+
+        global_transfo1_extra = GaussianBlur(p=1.0)
+
+        global_transfo2_extra = v2.Compose(
+            [
+                GaussianBlur(p=0.1),
+                v2.RandomSolarize(threshold=128, p=0.2),
+            ]
+        )
+
+        local_transfo_extra = GaussianBlur(p=0.5)
+
+        # normalization
+        self.normalize = v2.Compose(
+            [
+                v2.ToImage(),
+                v2.ToDtype(torch.float32, scale=True),
+                make_normalize_transform(mean=mean, std=std),
+            ]
+        )
+
+        if self.share_color_jitter:
+            self.color_jittering = color_jittering
+            self.global_transfo1 = v2.Compose([resize_global, global_transfo1_extra, self.normalize])
+            self.global_transfo2 = v2.Compose([resize_global, global_transfo2_extra, self.normalize])
+            self.local_transfo = v2.Compose([local_transfo_extra, self.normalize])
+        else:
+            self.global_transfo1 = v2.Compose(
+                [resize_global, color_jittering, global_transfo1_extra, self.normalize]
+            )
+            self.global_transfo2 = v2.Compose(
+                [resize_global, color_jittering, global_transfo2_extra, self.normalize]
+            )
+            self.local_transfo = v2.Compose([color_jittering, local_transfo_extra, self.normalize])
+
+    def __call__(self, image):
+        output = {}
+        output["weak_flag"] = True  # some residual from mugs
+
+        if self.share_color_jitter:
+            image = self.color_jittering(image)
+
+        # global crops:
+        im1_base = self.geometric_augmentation_global(image)
+        global_crop_1_transf = self.global_transfo1(im1_base)
+        global_crop_1 = self.resize_global_post_transf(global_crop_1_transf)
+
+        im2_base = self.geometric_augmentation_global(image)
+        global_crop_2_transf = self.global_transfo2(im2_base)
+        global_crop_2 = self.resize_global_post_transf(global_crop_2_transf)
+
+        output["global_crops"] = [global_crop_1, global_crop_2]
+
+        # global crops for teacher:
+        if self.teacher_no_color_jitter:
+            output["global_crops_teacher"] = [
+                self.normalize(im1_base),
+                self.normalize(im2_base),
+            ]
+        else:
+            output["global_crops_teacher"] = [global_crop_1, global_crop_2]
+
+        if self.gram_teacher_crops_size is not None:
+            # crops for gram teacher:
+            if self.gram_teacher_no_distortions:
+                gram_crop_1 = self.normalize(self.resize_gram_teacher(im1_base))
+                gram_crop_2 = self.normalize(self.resize_gram_teacher(im2_base))
+            else:
+                gram_crop_1 = self.resize_gram_teacher(global_crop_1_transf)
+                gram_crop_2 = self.resize_gram_teacher(global_crop_2_transf)
+            output["gram_teacher_crops"] = [gram_crop_1, gram_crop_2]
+
+        # local crops:
+        if self.local_crops_subset_of_global_crops:
+            _local_crops = [self.local_transfo(im1_base) for _ in range(self.local_crops_number // 2)] + [
+                self.local_transfo(im2_base) for _ in range(self.local_crops_number // 2)
+            ]
+
+            local_crops = []
+            offsets = []
+            gs = self.global_crops_size
+            ls = self.local_crops_size
+            for img in _local_crops:
+                rx, ry = np.random.randint(0, (gs - ls) // self.patch_size, 2) * self.patch_size
+                local_crops.append(img[:, rx : rx + ls, ry : ry + ls])
+                offsets.append((rx, ry))
+
+            output["local_crops"] = local_crops
+            output["offsets"] = offsets
+        else:
+            local_crops = [
+                self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number)
+            ]
+            output["local_crops"] = local_crops
+            output["offsets"] = ()
+
+        return output
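For orientation, a hedged usage sketch of the augmentation pipeline above; the crop scales and sizes mirror the crops: section of the config, the blank image is a stand-in for a real training image, and the import path assumes the re-export in __init__.py.

# Minimal sketch, assuming dinov3 and torchvision (v2 transforms) are installed.
from PIL import Image
from dinov3.data import DataAugmentationDINO

augment = DataAugmentationDINO(
    global_crops_scale=(0.32, 1.0),
    local_crops_scale=(0.05, 0.32),
    local_crops_number=8,
    global_crops_size=224,
    local_crops_size=96,
)
out = augment(Image.new("RGB", (512, 512)))               # stand-in for a real image
print(len(out["global_crops"]), len(out["local_crops"]))  # 2 global crops, 8 local crops
print(tuple(out["global_crops"][0].shape))                # (3, 224, 224) after normalization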
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/collate.py ADDED
@@ -0,0 +1,125 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+
+import random
+
+import torch
+
+
+def collate_data_and_cast(
+    samples_list,
+    mask_ratio_tuple,
+    mask_probability,
+    dtype,
+    n_tokens=None,
+    mask_generator=None,
+    random_circular_shift=False,
+    local_batch_size=None,
+):
+    n_global_crops = len(samples_list[0][0]["global_crops"])
+    n_local_crops = len(samples_list[0][0]["local_crops"])
+
+    collated_global_crops = torch.stack(
+        [s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]
+    )  # [n_global_crops, B, ...]
+    collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list])
+    if "gram_teacher_crops" in samples_list[0][0]:
+        collated_gram_teacher_crops = torch.stack(
+            [s[0]["gram_teacher_crops"][i] for i in range(n_global_crops) for s in samples_list]
+        )  # [n_global_crops, B, ...]
+    else:
+        collated_gram_teacher_crops = None
+
+    if local_batch_size is not None:
+        # multi-distillation case: the number of masks differs, because the number of samples masked
+        # is different from the number of samples initially passed into the teacher
+        B = n_global_crops * local_batch_size
+    else:
+        B = len(collated_global_crops)
+    N = n_tokens
+    n_samples_masked = int(B * mask_probability)
+    probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1)
+    upperbound = 0
+    masks_list = []
+    for i in range(0, n_samples_masked):
+        prob_max = probs[i + 1]
+        mask = torch.BoolTensor(mask_generator(int(N * prob_max)))
+        if random_circular_shift:  # apply a random circular shift to the mask
+            shift_x, shift_y = (
+                random.randint(0, mask.shape[0] - 1),
+                random.randint(0, mask.shape[1] - 1),
+            )
+            mask = torch.roll(mask, (shift_x, shift_y), (0, 1))
+        masks_list.append(mask)
+        upperbound += int(N * prob_max)
+    for _ in range(n_samples_masked, B):
+        masks_list.append(torch.BoolTensor(mask_generator(0)))
+
+    random.shuffle(masks_list)
+
+    collated_masks = torch.stack(masks_list).flatten(1)
+    mask_indices_list = collated_masks.flatten().nonzero().flatten()
+
+    masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks]
+
+    out = {
+        "collated_global_crops": collated_global_crops.to(dtype),
+        "collated_local_crops": collated_local_crops.to(dtype),
+        "collated_masks": collated_masks,
+        "mask_indices_list": mask_indices_list,
+        "masks_weight": masks_weight,
+        "upperbound": upperbound,
+        "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long),
+    }
+    if collated_gram_teacher_crops is not None:
+        out["collated_gram_teacher_crops"] = collated_gram_teacher_crops.to(dtype)
+    return out
+
+
+# def get_batch_subset(collated_data_batch, target_bs):
+def get_batch_subset(collated_data_batch, divide_by):
+    old_bs = collated_data_batch["collated_global_crops"].shape[0] // 2
+    target_bs = (old_bs + divide_by - 1) // divide_by
+    collated_global_crops = (
+        collated_data_batch["collated_global_crops"].unflatten(0, (2, old_bs)).narrow(1, 0, target_bs).flatten(0, 1)
+    )
+    collated_local_crops = (
+        collated_data_batch["collated_local_crops"].unflatten(0, (-1, old_bs)).narrow(1, 0, target_bs).flatten(0, 1)
+    )
+
+    masks_old_bs = collated_data_batch["collated_masks"].shape[0] // 2
+    masks_target_bs = masks_old_bs // divide_by
+    collated_masks = (
+        collated_data_batch["collated_masks"]
+        .unflatten(0, (2, masks_old_bs))
+        .narrow(1, 0, masks_target_bs)
+        .flatten(0, 1)
+    )
+    mask_indices_list = collated_masks.flatten().nonzero().flatten()
+
+    while mask_indices_list.shape[0] == 0:
+        _unbind = list(collated_data_batch["collated_masks"].unbind(0))
+        random.shuffle(_unbind)
+        _bind = torch.stack(_unbind, dim=0)
+        collated_masks = _bind.unflatten(0, (2, masks_old_bs)).narrow(1, 0, masks_target_bs).flatten(0, 1)
+        mask_indices_list = collated_masks.flatten().nonzero().flatten()
+
+    masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks]
+    upperbound = collated_data_batch["upperbound"]
+
+    new_batch = {
+        "collated_global_crops": collated_global_crops,
+        "collated_local_crops": collated_local_crops,
+        "collated_masks": collated_masks,
+        "mask_indices_list": mask_indices_list,
+        "masks_weight": masks_weight,
+        "upperbound": upperbound,
+        "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long),
+    }
+
+    if "global_batch_size" in collated_data_batch.keys():
+        new_batch["global_batch_size"] = collated_data_batch["global_batch_size"] // divide_by
+
+    return new_batch
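A hedged sketch of how a collate function like this is typically wired into a DataLoader via functools.partial; the MaskingGenerator constructor arguments below follow the DINOv2 convention and are an assumption here, and the masking ratios are illustrative values, not ones taken from this config.

# Minimal sketch, assuming MaskingGenerator accepts input_size/max_num_patches as in DINOv2.
from functools import partial
import torch
from dinov3.data import MaskingGenerator, collate_data_and_cast

img_size, patch_size = 224, 16
n_tokens = (img_size // patch_size) ** 2  # 196 patch tokens per global crop
collate_fn = partial(
    collate_data_and_cast,
    mask_ratio_tuple=(0.1, 0.5),          # per-sample masking ratio range (illustrative)
    mask_probability=0.5,                 # fraction of samples that receive a mask (illustrative)
    dtype=torch.half,
    n_tokens=n_tokens,
    mask_generator=MaskingGenerator(
        input_size=(img_size // patch_size, img_size // patch_size),
        max_num_patches=int(0.5 * n_tokens),
    ),
)
# collate_fn would then be passed to torch.utils.data.DataLoader(..., collate_fn=collate_fn).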