Insta360-Research committed
Commit e528a23 · 1 Parent(s): c9e5c19

add gradio app and dependencies

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. app.py +102 -0
  2. config/infer.yaml +19 -0
  3. depth_anything_utils.py +249 -0
  4. depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc +0 -0
  5. depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov3_adpther.cpython-310.pyc +0 -0
  6. depth_anything_v2_metric/depth_anything_v2/__pycache__/dpt.cpython-310.pyc +0 -0
  7. depth_anything_v2_metric/depth_anything_v2/dinov2.py +415 -0
  8. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__init__.py +11 -0
  9. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc +0 -0
  10. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc +0 -0
  11. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc +0 -0
  12. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc +0 -0
  13. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc +0 -0
  14. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc +0 -0
  15. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc +0 -0
  16. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc +0 -0
  17. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/attention.py +83 -0
  18. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/block.py +252 -0
  19. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
  20. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/layer_scale.py +28 -0
  21. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/mlp.py +41 -0
  22. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/patch_embed.py +89 -0
  23. depth_anything_v2_metric/depth_anything_v2/dinov2_layers/swiglu_ffn.py +63 -0
  24. depth_anything_v2_metric/depth_anything_v2/dinov3/.docstr.yaml +6 -0
  25. depth_anything_v2_metric/depth_anything_v2/dinov3/.github/workflows/lint.yaml +47 -0
  26. depth_anything_v2_metric/depth_anything_v2/dinov3/.gitignore +18 -0
  27. depth_anything_v2_metric/depth_anything_v2/dinov3/CODE_OF_CONDUCT.md +80 -0
  28. depth_anything_v2_metric/depth_anything_v2/dinov3/CONTRIBUTING.md +31 -0
  29. depth_anything_v2_metric/depth_anything_v2/dinov3/LICENSE.md +66 -0
  30. depth_anything_v2_metric/depth_anything_v2/dinov3/MODEL_CARD.md +432 -0
  31. depth_anything_v2_metric/depth_anything_v2/dinov3/README.md +734 -0
  32. depth_anything_v2_metric/depth_anything_v2/dinov3/conda.yaml +23 -0
  33. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/__init__.py +6 -0
  34. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/__init__.py +18 -0
  35. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/checkpointer.py +352 -0
  36. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/__init__.py +16 -0
  37. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/config.py +222 -0
  38. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/ssl_default_config.yaml +205 -0
  39. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_gram_anchor.yaml +203 -0
  40. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_high_res_adapt.yaml +224 -0
  41. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_pretrain.yaml +172 -0
  42. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vitl16_lvd1689m_distilled.yaml +251 -0
  43. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multi_distillation_test.yaml +27 -0
  44. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vitb_p16.yaml +7 -0
  45. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vits_p16.yaml +6 -0
  46. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/vitl_im1k_lin834.yaml +143 -0
  47. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/__init__.py +12 -0
  48. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/adapters.py +68 -0
  49. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/augmentations.py +227 -0
  50. depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/collate.py +125 -0
app.py ADDED
@@ -0,0 +1,102 @@
+ from __future__ import absolute_import, division, print_function
+
+ import os, sys
+ import cv2
+ import yaml
+ import torch
+ import numpy as np
+ import torch.nn as nn
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+
+ # ========== Make the project importable inside the Space ==========
+ PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))  # app.py sits at the repo root
+ sys.path.append(PROJECT_ROOT)
+
+ from networks.models import make
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # ====== HF weights repository (already uploaded) ======
+ WEIGHTS_REPO = "Insta360-Research/DAP-weights"
+ WEIGHTS_FILE = "model.pth"
+
+ # ========== Visualization ==========
+ def colorize_depth(depth, colormap=cv2.COLORMAP_JET):
+     depth = depth.astype(np.float32)
+     depth_norm = (depth - depth.min()) / (depth.max() - depth.min() + 1e-6)
+     depth_u8 = (depth_norm * 255).astype(np.uint8)
+     return cv2.applyColorMap(depth_u8, colormap)  # BGR
+
+ # ========== Load the model (only once, at startup) ==========
+ def load_model(config_path: str):
+     with open(config_path, "r") as f:
+         config = yaml.load(f, Loader=yaml.FullLoader)
+
+     print(f"Downloading weights from HF: {WEIGHTS_REPO}/{WEIGHTS_FILE}")
+     model_path = hf_hub_download(repo_id=WEIGHTS_REPO, filename=WEIGHTS_FILE)
+     print(f"✅ Weights downloaded to: {model_path}")
+
+     state = torch.load(model_path, map_location=device)
+
+     model = make(config["model"])
+     if any(k.startswith("module") for k in state.keys()):
+         model = nn.DataParallel(model)
+
+     model = model.to(device)
+
+     model_state = model.state_dict()
+     model.load_state_dict({k: v for k, v in state.items() if k in model_state}, strict=False)
+     model.eval()
+     print("✅ Model loaded.")
+     return model
+
+ # Change this to the config path inside your repo
+ CONFIG_PATH = "config/infer.yaml"
+ model = load_model(CONFIG_PATH)
+
+ # ========== Single-image inference ==========
+ @torch.no_grad()
+ def predict(img_rgb: np.ndarray):
+     """
+     img_rgb: H x W x 3 (RGB), uint8
+     return: depth_color_rgb, depth_gray
+     """
+     if img_rgb is None:
+         return None, None
+
+     img = img_rgb.astype(np.float32) / 255.0
+     tensor = torch.from_numpy(img.transpose(2, 0, 1)).unsqueeze(0).to(device)
+
+     outputs = model(tensor)
+
+     if isinstance(outputs, dict) and "pred_depth" in outputs:
+         # original mask logic
+         if "pred_mask" in outputs:
+             outputs["pred_mask"] = 1 - outputs["pred_mask"]
+             outputs["pred_mask"] = (outputs["pred_mask"] > 0.5)
+             outputs["pred_depth"][~outputs["pred_mask"]] = 1
+         pred = outputs["pred_depth"][0].detach().cpu().squeeze().numpy()
+     else:
+         pred = outputs[0].detach().cpu().squeeze().numpy()
+
+     pred_clip = np.clip(pred, 0.001, 1.0)
+     depth_gray = (pred_clip * 255).astype(np.uint8)
+
+     depth_color_bgr = colorize_depth(pred_clip, cv2.COLORMAP_JET)
+     depth_color_rgb = cv2.cvtColor(depth_color_bgr, cv2.COLOR_BGR2RGB)
+
+     return depth_color_rgb, depth_gray
+
+ demo = gr.Interface(
+     fn=predict,
+     inputs=gr.Image(type="numpy", label="Input Image"),
+     outputs=[
+         gr.Image(type="numpy", label="Depth (Color)"),
+         gr.Image(type="numpy", label="Depth (Gray)"),
+     ],
+     title="DAP Depth Prediction Demo",
+     description="Upload an image and get depth prediction."
+ )
+
+ demo.launch()
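The compatibility branch in load_model wraps the network in nn.DataParallel whenever the checkpoint keys carry a "module." prefix. A common alternative is to strip the prefix instead and load into the bare model; the sketch below assumes a standard PyTorch state dict and is not what app.py does:

```python
def strip_module_prefix(state):
    """Drop the 'module.' prefix that nn.DataParallel adds to parameter keys."""
    return {k[len("module."):] if k.startswith("module.") else k: v
            for k, v in state.items()}

# Hypothetical usage with the objects defined in app.py:
# state = torch.load(model_path, map_location=device)
# model.load_state_dict(strip_module_prefix(state), strict=False)
```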
config/infer.yaml ADDED
@@ -0,0 +1,19 @@
+ model:
+   name: dap
+   args:
+     midas_model_type: vitl
+     fine_tune_type: hypersim
+     min_depth: 0.01
+     max_depth: 1.0
+     train_decoder: True
+
+ median_align: False
+ load_weights_dir: checkpoints
+ input:
+   height: 512
+   width: 1024
+ inference:
+   batch_size: 1
+   num_workers: 1
+   save_colormap: True
+   colormap_type: jet
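app.py passes the model block of this config straight to networks.models.make(config["model"]), whose implementation is not part of this commit. Purely as an illustration of how such a factory is usually wired, here is a hypothetical registry-based sketch (all names are illustrative, not the repository's actual code):

```python
# Hypothetical registry-based factory; the real networks.models.make is not in this diff.
_MODELS = {}

def register(name):
    def decorator(cls):
        _MODELS[name] = cls
        return cls
    return decorator

def make(model_spec):
    # model_spec is the YAML block above, e.g.
    # {"name": "dap", "args": {"midas_model_type": "vitl", ...}}
    return _MODELS[model_spec["name"]](**model_spec.get("args", {}))
```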
depth_anything_utils.py ADDED
@@ -0,0 +1,249 @@
1
+ import os
2
+ import random
3
+ from PIL import Image, ImageOps, ImageFilter
4
+ import torch
5
+ from torchvision import transforms
6
+ import torch.nn.functional as F
7
+
8
+ import numpy as np
9
+ import cv2
10
+ import math
11
+
12
+
13
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
14
+ """Rezise the sample to ensure the given size. Keeps aspect ratio.
15
+
16
+ Args:
17
+ sample (dict): sample
18
+ size (tuple): image size
19
+
20
+ Returns:
21
+ tuple: new size
22
+ """
23
+ shape = list(sample["disparity"].shape)
24
+
25
+ if shape[0] >= size[0] and shape[1] >= size[1]:
26
+ return sample
27
+
28
+ scale = [0, 0]
29
+ scale[0] = size[0] / shape[0]
30
+ scale[1] = size[1] / shape[1]
31
+
32
+ scale = max(scale)
33
+
34
+ shape[0] = math.ceil(scale * shape[0])
35
+ shape[1] = math.ceil(scale * shape[1])
36
+
37
+ # resize
38
+ sample["image"] = cv2.resize(
39
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
40
+ )
41
+
42
+ sample["disparity"] = cv2.resize(
43
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
44
+ )
45
+ sample["mask"] = cv2.resize(
46
+ sample["mask"].astype(np.float32),
47
+ tuple(shape[::-1]),
48
+ interpolation=cv2.INTER_NEAREST,
49
+ )
50
+ sample["mask"] = sample["mask"].astype(bool)
51
+
52
+ return tuple(shape)
53
+
54
+
55
+ class Resize(object):
56
+ """Resize sample to given size (width, height).
57
+ """
58
+
59
+ def __init__(
60
+ self,
61
+ width,
62
+ height,
63
+ resize_target=True,
64
+ keep_aspect_ratio=False,
65
+ ensure_multiple_of=1,
66
+ resize_method="lower_bound",
67
+ image_interpolation_method=cv2.INTER_AREA,
68
+ ):
69
+ """Init.
70
+
71
+ Args:
72
+ width (int): desired output width
73
+ height (int): desired output height
74
+ resize_target (bool, optional):
75
+ True: Resize the full sample (image, mask, target).
76
+ False: Resize image only.
77
+ Defaults to True.
78
+ keep_aspect_ratio (bool, optional):
79
+ True: Keep the aspect ratio of the input sample.
80
+ Output sample might not have the given width and height, and
81
+ resize behaviour depends on the parameter 'resize_method'.
82
+ Defaults to False.
83
+ ensure_multiple_of (int, optional):
84
+ Output width and height is constrained to be multiple of this parameter.
85
+ Defaults to 1.
86
+ resize_method (str, optional):
87
+ "lower_bound": Output will be at least as large as the given size.
88
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
89
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
90
+ Defaults to "lower_bound".
91
+ """
92
+ self.__width = width
93
+ self.__height = height
94
+
95
+ self.__resize_target = resize_target
96
+ self.__keep_aspect_ratio = keep_aspect_ratio
97
+ self.__multiple_of = ensure_multiple_of
98
+ self.__resize_method = resize_method
99
+ self.__image_interpolation_method = image_interpolation_method
100
+
101
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
102
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
103
+
104
+ if max_val is not None and y > max_val:
105
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
106
+
107
+ if y < min_val:
108
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
109
+
110
+ return y
111
+
112
+ def get_size(self, width, height):
113
+ # determine new height and width
114
+ scale_height = self.__height / height
115
+ scale_width = self.__width / width
116
+
117
+ if self.__keep_aspect_ratio:
118
+ if self.__resize_method == "lower_bound":
119
+ # scale such that output size is lower bound
120
+ if scale_width > scale_height:
121
+ # fit width
122
+ scale_height = scale_width
123
+ else:
124
+ # fit height
125
+ scale_width = scale_height
126
+ elif self.__resize_method == "upper_bound":
127
+ # scale such that output size is upper bound
128
+ if scale_width < scale_height:
129
+ # fit width
130
+ scale_height = scale_width
131
+ else:
132
+ # fit height
133
+ scale_width = scale_height
134
+ elif self.__resize_method == "minimal":
135
+ # scale as little as possible
136
+ if abs(1 - scale_width) < abs(1 - scale_height):
137
+ # fit width
138
+ scale_height = scale_width
139
+ else:
140
+ # fit height
141
+ scale_width = scale_height
142
+ else:
143
+ raise ValueError(
144
+ f"resize_method {self.__resize_method} not implemented"
145
+ )
146
+
147
+ if self.__resize_method == "lower_bound":
148
+ new_height = self.constrain_to_multiple_of(
149
+ scale_height * height, min_val=self.__height
150
+ )
151
+ new_width = self.constrain_to_multiple_of(
152
+ scale_width * width, min_val=self.__width
153
+ )
154
+ elif self.__resize_method == "upper_bound":
155
+ new_height = self.constrain_to_multiple_of(
156
+ scale_height * height, max_val=self.__height
157
+ )
158
+ new_width = self.constrain_to_multiple_of(
159
+ scale_width * width, max_val=self.__width
160
+ )
161
+ elif self.__resize_method == "minimal":
162
+ new_height = self.constrain_to_multiple_of(scale_height * height)
163
+ new_width = self.constrain_to_multiple_of(scale_width * width)
164
+ else:
165
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
166
+
167
+ return (new_width, new_height)
168
+
169
+ def __call__(self, sample):
170
+ width, height = self.get_size(
171
+ sample["image"].shape[1], sample["image"].shape[0]
172
+ )
173
+
174
+ # resize sample
175
+ sample["image"] = cv2.resize(
176
+ sample["image"],
177
+ (width, height),
178
+ interpolation=self.__image_interpolation_method,
179
+ )
180
+
181
+ if self.__resize_target:
182
+ if "disparity" in sample:
183
+ sample["disparity"] = cv2.resize(
184
+ sample["disparity"],
185
+ (width, height),
186
+ interpolation=cv2.INTER_NEAREST,
187
+ )
188
+
189
+ if "depth" in sample:
190
+ sample["depth"] = cv2.resize(
191
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
192
+ )
193
+
194
+ if "semseg_mask" in sample:
195
+ # sample["semseg_mask"] = cv2.resize(
196
+ # sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST
197
+ # )
198
+ sample["semseg_mask"] = F.interpolate(torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...], (height, width), mode='nearest').numpy()[0, 0]
199
+
200
+ if "mask" in sample:
201
+ sample["mask"] = cv2.resize(
202
+ sample["mask"].astype(np.float32),
203
+ (width, height),
204
+ interpolation=cv2.INTER_NEAREST,
205
+ )
206
+ # sample["mask"] = sample["mask"].astype(bool)
207
+
208
+ # print(sample['image'].shape, sample['depth'].shape)
209
+ return sample
210
+
211
+
212
+ class NormalizeImage(object):
213
+ """Normlize image by given mean and std.
214
+ """
215
+
216
+ def __init__(self, mean, std):
217
+ self.__mean = mean
218
+ self.__std = std
219
+
220
+ def __call__(self, sample):
221
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
222
+
223
+ return sample
224
+
225
+
226
+ class PrepareForNet(object):
227
+ """Prepare sample for usage as network input.
228
+ """
229
+
230
+ def __init__(self):
231
+ pass
232
+
233
+ def __call__(self, sample):
234
+ image = np.transpose(sample["image"], (2, 0, 1))
235
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
236
+
237
+ if "mask" in sample:
238
+ sample["mask"] = sample["mask"].astype(np.float32)
239
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
240
+
241
+ if "depth" in sample:
242
+ depth = sample["depth"].astype(np.float32)
243
+ sample["depth"] = np.ascontiguousarray(depth)
244
+
245
+ if "semseg_mask" in sample:
246
+ sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32)
247
+ sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"])
248
+
249
+ return sample
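These transforms mirror the usual Depth Anything preprocessing pipeline; a minimal sketch of chaining them with torchvision's Compose follows. The 518-pixel working size, the multiple-of-14 constraint, and the ImageNet mean/std are assumptions for illustration, not values taken from this commit:

```python
import cv2
import numpy as np
import torch
from torchvision.transforms import Compose

transform = Compose([
    Resize(
        width=518, height=518,                 # assumed working resolution
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,                 # ViT patch size assumed to be 14
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet stats (assumed)
    PrepareForNet(),
])

raw = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
sample = transform({"image": raw})
image = torch.from_numpy(sample["image"]).unsqueeze(0)   # 1 x 3 x H x W, ready for the network
```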
depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc ADDED
Binary file (12.2 kB).
 
depth_anything_v2_metric/depth_anything_v2/__pycache__/dinov3_adpther.cpython-310.pyc ADDED
Binary file (2.54 kB).
 
depth_anything_v2_metric/depth_anything_v2/__pycache__/dpt.cpython-310.pyc ADDED
Binary file (6.62 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
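A minimal sketch of how a DPT-style depth head typically consumes this backbone: pull a few intermediate layers and reshape them into feature maps. The layer indices, input size, and variant below are illustrative; the attention falls back to the plain implementation when xFormers is not installed (with xFormers, run this on a CUDA device):

```python
import torch

backbone = DINOv2("vitl")                        # patch_size=14, embed_dim=1024, depth=24
x = torch.randn(1, 3, 518, 518)                  # H and W must be multiples of the patch size

features = backbone.get_intermediate_layers(
    x, n=[4, 11, 17, 23], reshape=True, return_class_token=True
)
for patch_map, cls_token in features:
    print(patch_map.shape, cls_token.shape)      # (1, 1024, 37, 37) and (1, 1024)
```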
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .mlp import Mlp
+ from .patch_embed import PatchEmbed
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+ from .block import NestedTensorBlock
+ from .attention import MemEffAttention
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (456 Bytes).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc ADDED
Binary file (2.42 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc ADDED
Binary file (8.03 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc ADDED
Binary file (1.26 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc ADDED
Binary file (1.06 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc ADDED
Binary file (1.25 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc ADDED
Binary file (2.7 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc ADDED
Binary file (2.05 kB).
 
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
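Both attention variants operate on a (batch, tokens, dim) tensor; MemEffAttention swaps in the xFormers kernel when it is installed and otherwise falls back to the plain implementation shown first. A tiny shape check with the plain variant:

```python
import torch

attn = Attention(dim=384, num_heads=6, qkv_bias=True)
tokens = torch.randn(2, 197, 384)   # 2 images, 197 tokens (1 cls + 14x14 patches), dim 384
out = attn(tokens)                  # shape preserved: (2, 197, 384)
```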
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
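A short sketch of one block from this file. In eval mode (or with drop_path == 0) the plain residual path at the bottom of Block.forward is taken; only training with a sample drop ratio above 0.1 switches to the batched stochastic-depth helpers defined above. The dimensions are illustrative:

```python
import torch

block = NestedTensorBlock(
    dim=384, num_heads=6, mlp_ratio=4.0, qkv_bias=True,
    init_values=1e-5,     # enables LayerScale on both residual branches
    drop_path=0.0,
)
block.eval()
x = torch.randn(2, 197, 384)
y = block(x)              # a plain tensor goes through Block.forward; shape is preserved
```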
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super(DropPath, self).__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)
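Stochastic depth zeroes whole samples on the residual branch and rescales the survivors by 1/keep_prob so the expectation is unchanged; a quick numerical check:

```python
import torch

dp = DropPath(drop_prob=0.2)
dp.train()                      # drop_path is a no-op in eval mode
x = torch.ones(8, 4, 16)
y = dp(x)                       # per sample: all zeros, or all 1 / 0.8 = 1.25
print(torch.unique(y))          # typically tensor([0.0000, 1.2500])
```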
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
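LayerScale is simply a learnable per-channel gain applied to a residual branch, initialized very small so each block starts close to the identity:

```python
import torch

ls = LayerScale(dim=4, init_values=1e-5)
x = torch.ones(2, 3, 4)
print(ls(x)[0, 0])   # every channel scaled by gamma, i.e. 1e-5 right after init
```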
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/mlp.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+
12
+ from typing import Callable, Optional
13
+
14
+ from torch import Tensor, nn
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features: int,
21
+ hidden_features: Optional[int] = None,
22
+ out_features: Optional[int] = None,
23
+ act_layer: Callable[..., nn.Module] = nn.GELU,
24
+ drop: float = 0.0,
25
+ bias: bool = True,
26
+ ) -> None:
27
+ super().__init__()
28
+ out_features = out_features or in_features
29
+ hidden_features = hidden_features or in_features
30
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31
+ self.act = act_layer()
32
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33
+ self.drop = nn.Dropout(drop)
34
+
35
+ def forward(self, x: Tensor) -> Tensor:
36
+ x = self.fc1(x)
37
+ x = self.act(x)
38
+ x = self.drop(x)
39
+ x = self.fc2(x)
40
+ x = self.drop(x)
41
+ return x
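This is the FFN that DinoVisionTransformer uses when ffn_layer="mlp"; with the usual mlp_ratio of 4 the hidden width is four times the embedding dimension. A small shape check (dimensions illustrative):

```python
import torch

ffn = Mlp(in_features=384, hidden_features=4 * 384)   # GELU activation by default
y = ffn(torch.randn(2, 197, 384))                     # token shape is preserved
```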
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/patch_embed.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ from typing import Callable, Optional, Tuple, Union
12
+
13
+ from torch import Tensor
14
+ import torch.nn as nn
15
+
16
+
17
+ def make_2tuple(x):
18
+ if isinstance(x, tuple):
19
+ assert len(x) == 2
20
+ return x
21
+
22
+ assert isinstance(x, int)
23
+ return (x, x)
24
+
25
+
26
+ class PatchEmbed(nn.Module):
27
+ """
28
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29
+
30
+ Args:
31
+ img_size: Image size.
32
+ patch_size: Patch token size.
33
+ in_chans: Number of input image channels.
34
+ embed_dim: Number of linear projection output channels.
35
+ norm_layer: Normalization layer.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ img_size: Union[int, Tuple[int, int]] = 224,
41
+ patch_size: Union[int, Tuple[int, int]] = 16,
42
+ in_chans: int = 3,
43
+ embed_dim: int = 768,
44
+ norm_layer: Optional[Callable] = None,
45
+ flatten_embedding: bool = True,
46
+ ) -> None:
47
+ super().__init__()
48
+
49
+ image_HW = make_2tuple(img_size)
50
+ patch_HW = make_2tuple(patch_size)
51
+ patch_grid_size = (
52
+ image_HW[0] // patch_HW[0],
53
+ image_HW[1] // patch_HW[1],
54
+ )
55
+
56
+ self.img_size = image_HW
57
+ self.patch_size = patch_HW
58
+ self.patches_resolution = patch_grid_size
59
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60
+
61
+ self.in_chans = in_chans
62
+ self.embed_dim = embed_dim
63
+
64
+ self.flatten_embedding = flatten_embedding
65
+
66
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ _, _, H, W = x.shape
71
+ patch_H, patch_W = self.patch_size
72
+
73
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
75
+
76
+ x = self.proj(x) # B C H W
77
+ H, W = x.size(2), x.size(3)
78
+ x = x.flatten(2).transpose(1, 2) # B HW C
79
+ x = self.norm(x)
80
+ if not self.flatten_embedding:
81
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
82
+ return x
83
+
84
+ def flops(self) -> float:
85
+ Ho, Wo = self.patches_resolution
86
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
87
+ if self.norm is not None:
88
+ flops += Ho * Wo * self.embed_dim
89
+ return flops
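PatchEmbed turns an image into a token sequence with a strided convolution; height and width must be divisible by the patch size, as the assertions above enforce. Sizes below are illustrative:

```python
import torch

embed = PatchEmbed(img_size=518, patch_size=14, in_chans=3, embed_dim=1024)
tokens = embed(torch.randn(1, 3, 518, 518))
print(tokens.shape)   # torch.Size([1, 1369, 1024]) -> 37 x 37 patches, dim 1024
```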
depth_anything_v2_metric/depth_anything_v2/dinov2_layers/swiglu_ffn.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, Optional
8
+
9
+ from torch import Tensor, nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ class SwiGLUFFN(nn.Module):
14
+ def __init__(
15
+ self,
16
+ in_features: int,
17
+ hidden_features: Optional[int] = None,
18
+ out_features: Optional[int] = None,
19
+ act_layer: Callable[..., nn.Module] = None,
20
+ drop: float = 0.0,
21
+ bias: bool = True,
22
+ ) -> None:
23
+ super().__init__()
24
+ out_features = out_features or in_features
25
+ hidden_features = hidden_features or in_features
26
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28
+
29
+ def forward(self, x: Tensor) -> Tensor:
30
+ x12 = self.w12(x)
31
+ x1, x2 = x12.chunk(2, dim=-1)
32
+ hidden = F.silu(x1) * x2
33
+ return self.w3(hidden)
34
+
35
+
36
+ try:
37
+ from xformers.ops import SwiGLU
38
+
39
+ XFORMERS_AVAILABLE = True
40
+ except ImportError:
41
+ SwiGLU = SwiGLUFFN
42
+ XFORMERS_AVAILABLE = False
43
+
44
+
45
+ class SwiGLUFFNFused(SwiGLU):
46
+ def __init__(
47
+ self,
48
+ in_features: int,
49
+ hidden_features: Optional[int] = None,
50
+ out_features: Optional[int] = None,
51
+ act_layer: Callable[..., nn.Module] = None,
52
+ drop: float = 0.0,
53
+ bias: bool = True,
54
+ ) -> None:
55
+ out_features = out_features or in_features
56
+ hidden_features = hidden_features or in_features
57
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58
+ super().__init__(
59
+ in_features=in_features,
60
+ hidden_features=hidden_features,
61
+ out_features=out_features,
62
+ bias=bias,
63
+ )
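SwiGLU packs the gate and value projections into a single linear layer (w12) and combines them with SiLU; SwiGLUFFNFused additionally shrinks the requested hidden width to 2/3 and rounds it up to a multiple of 8 so the fused xFormers kernel can be used when available. A small shape check with the plain variant (dimensions illustrative):

```python
import torch

ffn = SwiGLUFFN(in_features=1536, hidden_features=4096)
y = ffn(torch.randn(2, 16, 1536))   # output shape is preserved: (2, 16, 1536)
```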
depth_anything_v2_metric/depth_anything_v2/dinov3/.docstr.yaml ADDED
@@ -0,0 +1,6 @@
+ paths:
+   - dinov3
+ exclude: dinov3/tests
+ skip_init: True
+ skip_private: True
+ fail_under: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/.github/workflows/lint.yaml ADDED
@@ -0,0 +1,47 @@
1
+ name: Lint
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ run-linters:
13
+ name: Run linters
14
+ runs-on: ubuntu-latest
15
+
16
+ steps:
17
+ - name: Checkout repository
18
+ uses: actions/checkout@v4
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: 3.11
23
+ cache: 'pip'
24
+ cache-dependency-path: '**/requirements*.txt'
25
+ - name: Install Python (development) dependencies
26
+ run: |
27
+ pip install -r requirements-dev.txt
28
+ - name: Run ruff (linter)
29
+ run: |
30
+ ruff check dinov3
31
+ - name: Run ruff (formatter)
32
+ if: always()
33
+ run: |
34
+ ruff format --diff dinov3
35
+ - name: Report docstring coverage
36
+ if: always()
37
+ run: |
38
+ docstr-coverage dinov3
39
+ - name: Run mypy
40
+ if: always()
41
+ run: |
42
+ mypy --txt-report .
43
+ [ -f index.txt ] && cat index.txt
44
+ - name: Run pylint
45
+ if: always()
46
+ run: |
47
+ pylint --exit-zero dinov3
depth_anything_v2_metric/depth_anything_v2/dinov3/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ build/
2
+ dist/
3
+ *.egg-info/
4
+ **/__pycache__/
5
+
6
+ **/.ipynb_checkpoints
7
+ **/.ipynb_checkpoints/**
8
+
9
+ **/notebooks
10
+
11
+ # Ignore shell scripts
12
+ *.sh
13
+
14
+ # Ignore swap files
15
+ *.swp
16
+
17
+ # Ignore vscode directory
18
+ .vscode/
depth_anything_v2_metric/depth_anything_v2/dinov3/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <[email protected]>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
depth_anything_v2_metric/depth_anything_v2/dinov3/CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
1
+ # Contributing to DINOv3
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Pull Requests
6
+ We actively welcome your pull requests.
7
+
8
+ 1. Fork the repo and create your branch from `main`.
9
+ 2. If you've added code that should be tested, add tests.
10
+ 3. If you've changed APIs, update the documentation.
11
+ 4. Ensure the test suite passes.
12
+ 5. Make sure your code lints.
13
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14
+
15
+ ## Contributor License Agreement ("CLA")
16
+ In order to accept your pull request, we need you to submit a CLA. You only need
17
+ to do this once to work on any of Meta's open source projects.
18
+
19
+ Complete your CLA here: <https://code.facebook.com/cla>
20
+
21
+ ## Issues
22
+ We use GitHub issues to track public bugs. Please ensure your description is
23
+ clear and has sufficient instructions to be able to reproduce the issue.
24
+
25
+ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26
+ disclosure of security bugs. In those cases, please go through the process
27
+ outlined on that page and do not file a public issue.
28
+
29
+ ## License
30
+ By contributing to DINOv3, you agree that your contributions will be licensed
31
+ under the LICENSE.md file in the root directory of this source tree.
depth_anything_v2_metric/depth_anything_v2/dinov3/LICENSE.md ADDED
@@ -0,0 +1,66 @@
1
+ # DINOv3 License
2
+
3
+ *Last Updated: August 19, 2025*
4
+
5
+ **“Agreement”** means the terms and conditions for use, reproduction, distribution and modification of the DINO Materials set forth herein.
6
+
7
+ **“DINO Materials”** means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, and other elements of the foregoing distributed by Meta and made available under this Agreement.
8
+
9
+ **“Documentation”** means the specifications, manuals and documentation accompanying
10
+ DINO Materials distributed by Meta.
11
+
12
+ **“Licensee”** or **“you”** means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
13
+
14
+ **“Meta”** or **“we”** means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) or Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
15
+
16
+ **“Sanctions”** means any economic or trade sanctions or restrictions administered or enforced by the United States (including the Office of Foreign Assets Control of the U.S. Department of the Treasury (“OFAC”), the U.S. Department of State and the U.S. Department of Commerce), the United Nations, the European Union, or the United Kingdom.
17
+
18
+ **“Trade Controls”** means any of the following: Sanctions and applicable export and import controls.
19
+
20
+ By clicking “I Accept” below or by using or distributing any portion or element of the DINO Materials, you agree to be bound by this Agreement.
21
+
22
+ ## 1. License Rights and Redistribution.
23
+
24
+ a. <ins>Grant of Rights</ins>. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the DINO Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the DINO Materials.
25
+
26
+ b. <ins>Redistribution and Use</ins>.
27
+
28
+ i. Distribution of DINO Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the DINO Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement and you shall provide a copy of this Agreement with any such DINO Materials.
29
+
30
+ ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with DINO Materials, you must acknowledge the use of DINO Materials in your publication.
31
+
32
+ iii. Your use of the DINO Materials must comply with applicable laws and regulations, including Trade Control Laws and applicable privacy and data protection laws.
33
+
34
+ iv. Your use of the DINO Materials will not involve or encourage others to reverse engineer, decompile or discover the underlying components of the DINO Materials.
35
+
36
+ v. You are not the target of Trade Controls and your use of DINO Materials must comply with Trade Controls. You agree not to use, or permit others to use, DINO Materials for any activities subject to the International Traffic in Arms Regulations (ITAR) or end uses prohibited by Trade Controls, including those related to military or warfare purposes, nuclear industries or applications, espionage, or the development or use of guns or illegal weapons.
37
+
38
+ ## 2. User Support.
39
+
40
+ Your use of the DINO Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the DINO Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
41
+
42
+ ## 3. Disclaimer of Warranty.
43
+
44
+ UNLESS REQUIRED BY APPLICABLE LAW, THE DINO MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE DINO MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE DINO MATERIALS AND ANY OUTPUT AND RESULTS.
45
+
46
+ ## 4. Limitation of Liability.
47
+
48
+ IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
49
+
50
+ ## 5. Intellectual Property.
51
+
52
+ a. Subject to Meta’s ownership of DINO Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the DINO Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
53
+
54
+ b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the DINO Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the DINO Materials.
55
+
56
+ ## 6. Term and Termination.
57
+
58
+ The term of this Agreement will commence upon your acceptance of this Agreement or access to the DINO Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the DINO Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
59
+
60
+ ## 7. Governing Law and Jurisdiction.
61
+
62
+ This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
63
+
64
+ ## 8. Modifications and Amendments.
65
+
66
+ Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the DINO Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
depth_anything_v2_metric/depth_anything_v2/dinov3/MODEL_CARD.md ADDED
@@ -0,0 +1,432 @@
1
+ # Model Card for DINOv3
2
+
3
+ DINOv3 is a family of versatile vision foundation models that outperforms the specialized state of the art across a broad range of settings, without fine-tuning. DINOv3 produces high-quality dense features that achieve outstanding performance on various vision tasks, significantly surpassing previous self- and weakly-supervised foundation models.
4
+
5
+ ## Model Details
6
+
7
+ These are Vision Transformer and ConvNeXt models trained following the method described in the DINOv3 paper. 12 models are provided:
8
+
9
+ - 10 models pretrained on web data (LVD-1689M dataset)
10
+ - 1 ViT-7B trained from scratch,
11
+ - 5 ViT-S/S+/B/L/H+ models distilled from the ViT-7B,
12
+ - 4 ConvNeXt-{T/S/B/L} models distilled from the ViT-7B,
13
+ - 2 models pretrained on satellite data (SAT-493M dataset)
14
+ - 1 ViT-7B trained from scratch
15
+ - 1 ViT-L distilled from the ViT-7B
16
+
17
+
18
+ Each Transformer-based model takes an image as input and returns a class token, patch tokens (and register tokens). These models follow a ViT architecture, with a patch size of 16. For a 224x224 image, this results in 1 class token + 4 register tokens + 196 patch tokens = 201 tokens (for DINOv2 with registers this resulted in 1 + 4 + 256 = 261 tokens).
19
+
20
+ The models can accept larger images provided the image dimensions are multiples of the patch size (16). If this condition is not met, the model crops the input to the closest smaller multiple of the patch size.
21
+
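+ As an illustration, the token count for an arbitrary input resolution can be computed with the following minimal sketch (assuming the /16 patch grid and 4 register tokens described above):
+
+ ```python
+ def num_tokens(height: int, width: int, patch_size: int = 16, num_registers: int = 4) -> int:
+     # 1 class token + register tokens + one token per patch
+     return 1 + num_registers + (height // patch_size) * (width // patch_size)
+
+ print(num_tokens(224, 224))  # 1 + 4 + 196 = 201
+ ```
+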
22
+ ### Model Description
23
+
24
+ - **Developed by:** Meta AI
25
+ - **Model type:** Vision Transformer, ConvNeXt
26
+ - **License:** [DINOv3 License](https://ai.meta.com/resources/models-and-libraries/dinov3-license/)
27
+
28
+ ### Model Sources
29
+
30
+ - **Repository:** [https://github.com/facebookresearch/dinov3](https://github.com/facebookresearch/dinov3)
31
+ - **Paper:** [https://arxiv.org/abs/2508.10104](https://arxiv.org/abs/2508.10104)
32
+
33
+ ## Uses
34
+
35
+ The models are vision backbones providing multi-purpose features for downstream tasks.
36
+
37
+ ### Direct Use
38
+
39
+ The models can be used without fine-tuning, with downstream classifiers as simple as linear layers, to obtain competitive results:
40
+
41
+ - on image classification, using k-NN classifiers on the class token
42
+ - on image classification, with logistic regression classifiers applied on the class token
43
+ - on image classification, with a linear layer applied on the class token and the average of the patch tokens
44
+ - on image retrieval using nearest neighbors
45
+ - on geometric and semantic 3D keypoint correspondences
46
+ - on depth estimation, semantic segmentation, using linear layers
47
+ - on unsupervised object discovery
48
+ - on video segmentation tracking
49
+ - on video classification, using a small 4-layer attentive probe
50
+
51
+ ### Downstream Use
52
+
53
+ While fine-tuning the models can yield some gains, it is recommended to keep this option as a last resort: the frozen features are expected to provide good performance out-of-the-box.
54
+
55
+ ## Bias, Risks, and Limitations
56
+
57
+ Compared to DINOv2 and SEERv2, DINOv3 delivers somewhat consistent performance across income categories on geographical fairness and diversity, although with a notable performance drop in the low-income bucket compared to the highest-income bucket.
58
+
59
+ DINOv3 also achieves relatively good scores across different regions, improving over its predecessor DINOv2. However, a relative difference is still observed between Europe and Africa.
60
+
61
+ ### Recommendations
62
+
63
+ Fine-tuning is expected to increase the biases in the features produced by the model as they will be tuned to the fine-tuning labels.
64
+
65
+ ## How to Get Started with the Model
66
+
67
+ Use the code below to get started with the model.
68
+
69
+ ```python
70
+ import torch
71
+
72
+ model = torch.hub.load(
73
+ repo_or_dir='facebookresearch/dinov3',
74
+ model='<MODEL_NAME>',
75
+ weights='<PATH/OR/URL/TO/CHECKPOINT>',
76
+ )
77
+
78
+ # where MODEL_NAME can be one of:
79
+ # - dinov3_vits16
80
+ # - dinov3_vits16plus
81
+ # - dinov3_vitb16
82
+ # - dinov3_vitl16
83
+ # - dinov3_vith16plus
84
+ # - dinov3_vit7b16
85
+ # - dinov3_convnext_tiny
86
+ # - dinov3_convnext_small
87
+ # - dinov3_convnext_base
88
+ # - dinov3_convnext_large
89
+
90
+ # For instance
91
+ dinov3_vits16 = torch.hub.load(
92
+ repo_or_dir='facebookresearch/dinov3',
93
+ model='dinov3_vits16',
94
+ weights='<PATH/OR/URL/TO/DINOV3/VITS16/LVD1689M/CHECKPOINT>',
95
+ )
96
+ ```
97
+
98
+ ## Training Details
99
+
100
+ ### Training Data
101
+
102
+ - Web dataset (LVD-1689M): a curated dataset of 1,689 million images extracted from a large data
103
+ pool of 17 billion web images collected from public posts on Instagram
104
+
105
+ - Satellite dataset (SAT-493M): a dataset of 493 million 512x512 images randomly sampled from Maxar RGB ortho-rectified imagery at 0.6 meter resolution
106
+
107
+ ### Training Procedure
108
+
109
+ **Training objective:**
110
+
111
+ - DINO self-distillation loss with multi-crop
112
+ - iBOT masked-image modeling loss
113
+ - KoLeo regularization on [CLS] tokens
114
+ - Gram anchoring
115
+
116
+ - **Training regime:** PyTorch FSDP2 (with bf16 and fp8 matrix multiplications)
117
+
118
+ **Distillation:**
119
+
120
+ - Distillation follows the standard DINOv3 pretraining procedure, except the teacher is a frozen pretrained ViT-7B.
121
+
122
+ ## Evaluation
123
+
124
+ **Results**
125
+
126
+ The reader is referred to the associated paper for details on the evaluation protocols.
127
+
128
+ *Results for ViT backbones pretrained (or distilled) on web (LVD-1689M)*
129
+
130
+ <table>
131
+ <tr>
132
+ <th></th>
133
+ <!-- <th></th> -->
134
+ <th colspan="4">Global Tasks</th>
135
+ <th colspan="5">Dense Tasks</th>
136
+ </tr>
137
+ <tr>
138
+ <th>Model</th>
139
+ <!-- <th>Dataset</th> -->
140
+ <th>IN-ReaL</th>
141
+ <th>IN-R</th>
142
+ <th>Obj.Net</th>
143
+ <th>Ox.-H</th>
144
+ <th>ADE20k</th>
145
+ <th>NYU↓</th>
146
+ <th>DAVIS</th>
147
+ <th>NAVI</th>
148
+ <th>SPair</th>
149
+ </tr>
150
+ <tr>
151
+ <td>DINOv3 ViT-S/16</td>
152
+ <!-- <td>LVD-1689M</td> -->
153
+ <td align="right">87.0</td>
154
+ <td align="right">60.4</td>
155
+ <td align="right">50.9</td>
156
+ <td align="right">49.5</td>
157
+ <td align="right">47.0</td>
158
+ <td align="right">0.403</td>
159
+ <td align="right">72.7</td>
160
+ <td align="right">56.3</td>
161
+ <td align="right">50.4</td>
162
+ </tr>
163
+ <tr>
164
+ <td>DINOv3 ViT-S+/16</td>
165
+ <!-- <td>LVD-1689M</td> -->
166
+ <td align="right">88.0</td>
167
+ <td align="right">68.8</td>
168
+ <td align="right">54.6</td>
169
+ <td align="right">50.0</td>
170
+ <td align="right">48.8</td>
171
+ <td align="right">0.399</td>
172
+ <td align="right">75.5</td>
173
+ <td align="right">57.1</td>
174
+ <td align="right">55.2</td>
175
+ </tr>
176
+ <tr>
177
+ <td>DINOv3 ViT-B/16</td>
178
+ <!-- <td>LVD-1689M</td> -->
179
+ <td align="right">89.3</td>
180
+ <td align="right">76.7</td>
181
+ <td align="right">64.1</td>
182
+ <td align="right">58.5</td>
183
+ <td align="right">51.8</td>
184
+ <td align="right">0.373</td>
185
+ <td align="right">77.2</td>
186
+ <td align="right">58.8</td>
187
+ <td align="right">57.2</td>
188
+ </tr>
189
+ <tr>
190
+ <td>DINOv3 ViT-L/16</td>
191
+ <!-- <td>LVD-1689M</td> -->
192
+ <td align="right">90.2</td>
193
+ <td align="right">88.1</td>
194
+ <td align="right">74.8</td>
195
+ <td align="right">63.1</td>
196
+ <td align="right">54.9</td>
197
+ <td align="right">0.352</td>
198
+ <td align="right">79.9</td>
199
+ <td align="right">62.3</td>
200
+ <td align="right">61.3</td>
201
+ </tr>
202
+ <tr>
203
+ <td>DINOv3 ViT-H+/16</td>
204
+ <!-- <td>LVD-1689M</td> -->
205
+ <td align="right">90.3</td>
206
+ <td align="right">90.0</td>
207
+ <td align="right">78.6</td>
208
+ <td align="right">64.5</td>
209
+ <td align="right">54.8</td>
210
+ <td align="right">0.352</td>
211
+ <td align="right">79.3</td>
212
+ <td align="right">63.3</td>
213
+ <td align="right">56.3</td>
214
+ </tr>
215
+ <tr>
216
+ <td>DINOv3 ViT-7B/16</td>
217
+ <!-- <td>LVD-1689M</td> -->
218
+ <td align="right">90.4</td>
219
+ <td align="right">91.1</td>
220
+ <td align="right">91.1</td>
221
+ <td align="right">72.8</td>
222
+ <td align="right">55.9</td>
223
+ <td align="right">0.309</td>
224
+ <td align="right">79.7</td>
225
+ <td align="right">64.4</td>
226
+ <td align="right">58.7</td>
227
+ </tr>
228
+ </table>
229
+
230
+ *Results for ConvNeXt backbones distilled on web (LVD-1689M)*
231
+
232
+ <table>
233
+ <tr>
234
+ <th></th>
235
+ <th colspan="6">Global Tasks</th>
236
+ <th colspan="2">Dense Tasks</th>
237
+ </tr>
238
+ <tr>
239
+ <th>Model</th>
240
+ <th colspan="2">IN-ReaL</th>
241
+ <th colspan="2">IN-R</th>
242
+ <th colspan="2">Obj.Net</th>
243
+ <th>ADE20k</th>
244
+ <th>NYU↓</th>
245
+ </tr>
246
+ <tr>
247
+ <td></td>
248
+ <td>@256px</td>
249
+ <td>@512px</td>
250
+ <td>@256px</td>
251
+ <td>@512px</td>
252
+ <td>@256px</td>
253
+ <td>@512px</td>
254
+ <td colspan="2"></td>
255
+ </tr>
256
+ <tr>
257
+ <td>DINOv3 ConvNeXt Tiny</td>
258
+ <td align="right">86.6</td>
259
+ <td align="right">87.7</td>
260
+ <td align="right">73.7</td>
261
+ <td align="right">74.1</td>
262
+ <td align="right">52.6</td>
263
+ <td align="right">58.7</td>
264
+ <td align="right">42.7</td>
265
+ <td align="right">0.448</td>
266
+ </tr>
267
+ <tr>
268
+ <td>DINOv3 ConvNeXt Small</td>
269
+ <td align="right">87.9</td>
270
+ <td align="right">88.7</td>
271
+ <td align="right">73.7</td>
272
+ <td align="right">74.1</td>
273
+ <td align="right">52.6</td>
274
+ <td align="right">58.7</td>
275
+ <td align="right">44.8</td>
276
+ <td align="right">0.432</td>
277
+ </tr>
278
+ <tr>
279
+ <td>DINOv3 ConvNeXt Base</td>
280
+ <td align="right">88.5</td>
281
+ <td align="right">89.2</td>
282
+ <td align="right">77.2</td>
283
+ <td align="right">78.2</td>
284
+ <td align="right">56.2</td>
285
+ <td align="right">61.3</td>
286
+ <td align="right">46.3</td>
287
+ <td align="right">0.420</td>
288
+ </tr>
289
+ <tr>
290
+ <td>DINOv3 ConvNeXt Large</td>
291
+ <td align="right">88.9</td>
292
+ <td align="right">89.4</td>
293
+ <td align="right">81.3</td>
294
+ <td align="right">82.4</td>
295
+ <td align="right">59.3</td>
296
+ <td align="right">65.2</td>
297
+ <td align="right">47.8</td>
298
+ <td align="right">0.403</td>
299
+ </tr>
300
+ </table>
301
+
302
+ *Results for ViT backbones pretrained (or distilled) on satellite (SAT-493M)*
303
+
304
+ <table>
305
+ <tr>
306
+ <th></th>
307
+ <th colspan="7">(GEO-Bench) Classification</th>
308
+ </tr>
309
+ <tr>
310
+ <th>Model</th>
311
+ <th>m-BEnet</th>
312
+ <th>m-brick-kiln</th>
313
+ <th>m-eurosat</th>
314
+ <th>m-forestnet</th>
315
+ <th>m-pv4ger</th>
316
+ <th>m-so2sat</th>
317
+ <th>mean</th>
318
+ </tr>
319
+ <tr>
320
+ <td>DINOv3 ViT-L/16</td>
321
+ <td>73.0</td>
322
+ <td>96.5</td>
323
+ <td>94.1</td>
324
+ <td>60.6</td>
325
+ <td>96.0</td>
326
+ <td>57.4</td>
327
+ <td>79.6</td>
328
+ </tr>
329
+ <tr>
330
+ <td>DINOv3 ViT-7B/16</td>
331
+ <td>74.0</td>
332
+ <td>97.2</td>
333
+ <td>94.8</td>
334
+ <td>62.3</td>
335
+ <td>96.1</td>
336
+ <td>62.1</td>
337
+ <td>81.1</td>
338
+ </tr>
339
+ <tr>
340
+ <th></th>
341
+ <th colspan="7">(GEO-Bench) Segmentation</th>
342
+ </tr>
343
+ <tr>
344
+ <th>Model</th>
345
+ <th>m-cashew</th>
346
+ <th>m-chesapeake</th>
347
+ <th>m-NeonTree</th>
348
+ <th>m-nz-cattle</th>
349
+ <th>m-pv4ger-seg</th>
350
+ <th>m-SA-crop</th>
351
+ <th>mean</th>
352
+ </tr>
353
+ <tr>
354
+ <td>DINOv3 ViT-L/16</td>
355
+ <td>94.2</td>
356
+ <td>75.6</td>
357
+ <td>61.8</td>
358
+ <td>83.7</td>
359
+ <td>95.2</td>
360
+ <td>36.8</td>
361
+ <td>74.5</td>
362
+ </tr>
363
+ <tr>
364
+ <td>DINOv3 ViT-7B/16</td>
365
+ <td>94.1</td>
366
+ <td>76.6</td>
367
+ <td>62.6</td>
368
+ <td>83.4</td>
369
+ <td>95.5</td>
370
+ <td>37.6</td>
371
+ <td>75.0</td>
372
+ </tr>
373
+ </table>
374
+
375
+
376
+ ## Environmental Impact
377
+
378
+ - **Hardware Type:** Nvidia H100
379
+ - **Hours used:** 61,440 hours for ViT-7B model training
380
+ - **Cloud Provider:** Private infrastructure
381
+ - **Compute Region:** USA
382
+ - **Carbon Emitted:** 18t CO2eq
383
+
384
+ ## Technical Specifications
385
+
386
+ ### Model Architecture and Objective
387
+
388
+ Vision Transformer models:
389
+
390
+ - ViT-S (21M parameters): patch size 16, embedding dimension 384, 4 register tokens, 6 heads, MLP FFN, RoPE
391
+ - ViT-S+ (29M parameters): patch size 16, embedding dimension 384, 4 register tokens, 6 heads, SwiGLU FFN, RoPE
392
+ - ViT-B (86M parameters): patch size 16, embedding dimension 768, 4 register tokens, 12 heads, MLP FFN, RoPE
393
+ - ViT-L (300M parameters): patch size 16, embedding dimension 1024, 4 register tokens, 16 heads, MLP FFN, RoPE
394
+ - ViT-H+ (840M parameters): patch size 16, embedding dimension 1280, 4 register tokens, 20 heads, SwiGLU FFN, RoPE
395
+ - ViT-7B (6716M parameters): patch size 16, embedding dimension 4096, 4 register tokens, 32 heads, SwiGLU FFN, RoPE
396
+
397
+ ConvNeXt models:
398
+
399
+ - ConvNeXt Tiny (29M parameters)
400
+ - ConvNeXt Small (50M parameters)
401
+ - ConvNeXt Base (89M parameters)
402
+ - ConvNeXt Large (198M parameters)
403
+
404
+ ### Compute Infrastructure
405
+
406
+ #### Hardware
407
+
408
+ Nvidia H100 GPUs
409
+
410
+ #### Software
411
+
412
+ PyTorch 2.7
413
+
414
+ ## More Information
415
+
416
+ See the [blog post](https://ai.meta.com/blog/dinov3-self-supervised-vision-model/) and the associated [website](https://ai.meta.com/dinov3/).
417
+
418
+ ## Citation
419
+
420
+ **BibTeX**
421
+
422
+ ```
423
+ @misc{simeoni2025dinov3,
424
+ title={{DINOv3}},
425
+ author={Sim{\'e}oni, Oriane and Vo, Huy V. and Seitzer, Maximilian and Baldassarre, Federico and Oquab, Maxime and Jose, Cijo and Khalidov, Vasil and Szafraniec, Marc and Yi, Seungeun and Ramamonjisoa, Micha{\"e}l and Massa, Francisco and Haziza, Daniel and Wehrstedt, Luca and Wang, Jianyuan and Darcet, Timoth{\'e}e and Moutakanni, Th{\'e}o and Sentana, Leonel and Roberts, Claire and Vedaldi, Andrea and Tolan, Jamie and Brandt, John and Couprie, Camille and Mairal, Julien and J{\'e}gou, Herv{\'e} and Labatut, Patrick and Bojanowski, Piotr},
426
+ year={2025},
427
+ eprint={2508.10104},
428
+ archivePrefix={arXiv},
429
+ primaryClass={cs.CV},
430
+ url={https://arxiv.org/abs/2508.10104},
431
+ }
432
+ ```
depth_anything_v2_metric/depth_anything_v2/dinov3/README.md ADDED
@@ -0,0 +1,734 @@
1
+ 🆕 [2025-08-14] :fire: DINOv3 backbones are now available in [Hugging Face Hub](https://huggingface.co/collections/facebook/dinov3-68924841bd6b561778e31009) and [supported](https://huggingface.co/docs/transformers/model_doc/dinov3) by the Hugging Face [Transformers](https://huggingface.co/docs/transformers/index) library
2
+
3
+ # DINOv3 🦖🦖🦖
4
+
5
+ **[Meta AI Research, FAIR](https://ai.meta.com/research/)**
6
+
7
+ Oriane Siméoni, Huy V. Vo, Maximilian Seitzer, Federico Baldassarre, Maxime Oquab, <br/>
8
+ Cijo Jose, Vasil Khalidov, Marc Szafraniec, Seungeun Yi, Michaël Ramamonjisoa, <br/>
9
+ Francisco Massa, Daniel Haziza, Luca Wehrstedt, Jianyuan Wang, <br/>
10
+ Timothée Darcet, Théo Moutakanni, Leonel Sentana, Claire Roberts, <br/>
11
+ Andrea Vedaldi, Jamie Tolan, John Brandt, Camille Couprie, <br/>
12
+ Julien Mairal, Hervé Jégou, Patrick Labatut, Piotr Bojanowski
13
+
14
+ [ :scroll: [`Paper`](https://arxiv.org/abs/2508.10104)] [ :newspaper: [`Blog`](https://ai.meta.com/blog/dinov3-self-supervised-vision-model/)] [ :globe_with_meridians: [`Website`](https://ai.meta.com/dinov3/)] [ :book: [`BibTeX`](#citing-dinov3)]
15
+
16
+ Reference PyTorch implementation and models for DINOv3. For details, see the **[DINOv3](https://arxiv.org/abs/2508.10104)** paper.
17
+
18
+ ## Overview
19
+
20
+ <div align="center">
21
+ <img width="1364" height="1024" alt="market" src="https://github.com/user-attachments/assets/1411f491-988e-49cb-95ae-d03fe6e3c268" />
22
+
23
+ <i><b>High-resolution dense features.</b><br/>We visualize the cosine similarity maps obtained with DINOv3 output features<br/> between the patches marked with a red cross and all other patches.</i>
24
+ </div>
25
+
26
+ <br/>
27
+
28
+ An extended family of versatile vision foundation models that produce high-quality dense features and achieve outstanding performance on various vision tasks, outperforming the specialized state of the art across a broad range of settings without fine-tuning.
29
+
30
+ ## Pretrained models
31
+
32
+ :information_source: Please follow the link provided below to get access to all the model weights: once accepted, an e-mail will be sent with the complete list of URLs pointing to all the available model weights (both backbones and adapters). These URLs can then be used to either:
33
+ - download the model or adapter weights to a local filesystem and point `torch.hub.load()` to these local weights via the `weights` or `backbone_weights` parameters, or
34
+ - directly invoke `torch.hub.load()` to download and load a backbone or an adapter from its URL via also the `weights` or `backbone_weights` parameters.
35
+
36
+ See the example code snippets below.
37
+
38
+ :warning: Please use `wget` instead of a web browser to download the weights.
39
+
40
+ ViT models pretrained on web dataset (LVD-1689M):
41
+ <table style="margin: auto">
42
+ <thead>
43
+ <tr>
44
+ <th>Model</th>
45
+ <th>Parameters</th>
46
+ <th>Pretraining<br/>Dataset</th>
47
+ <th>Download</th>
48
+ </tr>
49
+ </thead>
50
+ <tbody>
51
+ <tr>
52
+ <td>ViT-S/16 distilled </td>
53
+ <td align="right">21M</td>
54
+ <td align="center">LVD-1689M</td>
55
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
56
+ </tr>
57
+ <tr>
58
+ <td>ViT-S+/16 distilled</td>
59
+ <td align="right">29M</td>
60
+ <td align="center">LVD-1689M</td>
61
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
62
+ </tr>
63
+ <tr>
64
+ <td>ViT-B/16 distilled</td>
65
+ <td align="right">86M</td>
66
+ <td align="center">LVD-1689M</td>
67
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
68
+ </tr>
69
+ <tr>
70
+ <td>ViT-L/16 distilled</td>
71
+ <td align="right">300M</td>
72
+ <td align="center">LVD-1689M</td>
73
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
74
+ </tr>
75
+ <tr>
76
+ <td>ViT-H+/16 distilled</td>
77
+ <td align="right">840M</td>
78
+ <td align="center">LVD-1689M</td>
79
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
80
+ </tr>
81
+ <tr>
82
+ <td>ViT-7B/16</td>
83
+ <td align="right">6,716M</td>
84
+ <td align="center">LVD-1689M</td>
85
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
86
+ </tr>
87
+ </tbody>
88
+ </table>
89
+
90
+ ConvNeXt models pretrained on web dataset (LVD-1689M):
91
+ <table style="margin: auto">
92
+ <thead>
93
+ <tr>
94
+ <th>Model</th>
95
+ <th>Parameters</th>
96
+ <th>Pretraining<br/>Dataset</th>
97
+ <th>Download</th>
98
+ </tr>
99
+ </thead>
100
+ <tbody>
101
+ <tr>
102
+ <td>ConvNeXt Tiny</td>
103
+ <td align="right">29M</td>
104
+ <td align="center">LVD-1689M</td>
105
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
106
+ </tr>
107
+ <tr>
108
+ <td>ConvNeXt Small</td>
109
+ <td align="right">50M</td>
110
+ <td align="center">LVD-1689M</td>
111
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
112
+ </tr>
113
+ <tr>
114
+ <td>ConvNeXt Base</td>
115
+ <td align="right">89M</td>
116
+ <td align="center">LVD-1689M</td>
117
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
118
+ </tr>
119
+ <tr>
120
+ <td>ConvNeXt Large</td>
121
+ <td align="right">198M</td>
122
+ <td align="center">LVD-1689M</td>
123
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
124
+ </tr>
125
+ </tbody>
126
+ </table>
127
+
128
+ ViT models pretrained on satellite dataset (SAT-493M):
129
+ <table style="margin: auto">
130
+ <thead>
131
+ <tr>
132
+ <th>Model</th>
133
+ <th>Parameters</th>
134
+ <th>Pretraining<br/>Dataset</th>
135
+ <th>Download</th>
136
+ </tr>
137
+ </thead>
138
+ <tbody>
139
+ <tr>
140
+ <td>ViT-L/16 distilled</td>
141
+ <td align="right">300M</td>
142
+ <td align="center">SAT-493M</td>
143
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
144
+ </tr>
145
+ <tr>
146
+ <td>ViT-7B/16</td>
147
+ <td align="right">6,716M</td>
148
+ <td align="center">SAT-493M</td>
149
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
150
+ </tr>
151
+ </tbody>
152
+ </table>
153
+
154
+
155
+ ### Pretrained backbones (via PyTorch [Hub](https://docs.pytorch.org/docs/stable/hub.html))
156
+
157
+ Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install PyTorch (the only required dependency for loading the model). Installing PyTorch with CUDA support is strongly recommended.
158
+
159
+ ```python
160
+ import torch
161
+
162
+ REPO_DIR = <PATH/TO/A/LOCAL/DIRECTORY/WHERE/THE/DINOV3/REPO/WAS/CLONED>
163
+
164
+ # DINOv3 ViT models pretrained on web images
165
+ dinov3_vits16 = torch.hub.load(REPO_DIR, 'dinov3_vits16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
166
+ dinov3_vits16plus = torch.hub.load(REPO_DIR, 'dinov3_vits16plus', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
167
+ dinov3_vitb16 = torch.hub.load(REPO_DIR, 'dinov3_vitb16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
168
+ dinov3_vitl16 = torch.hub.load(REPO_DIR, 'dinov3_vitl16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
169
+ dinov3_vith16plus = torch.hub.load(REPO_DIR, 'dinov3_vith16plus', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
170
+ dinov3_vit7b16 = torch.hub.load(REPO_DIR, 'dinov3_vit7b16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
171
+
172
+ # DINOv3 ConvNeXt models pretrained on web images
173
+ dinov3_convnext_tiny = torch.hub.load(REPO_DIR, 'dinov3_convnext_tiny', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
174
+ dinov3_convnext_small = torch.hub.load(REPO_DIR, 'dinov3_convnext_small', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
175
+ dinov3_convnext_base = torch.hub.load(REPO_DIR, 'dinov3_convnext_base', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
176
+ dinov3_convnext_large = torch.hub.load(REPO_DIR, 'dinov3_convnext_large', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
177
+
178
+ # DINOv3 ViT models pretrained on satellite imagery
179
+ dinov3_vitl16 = torch.hub.load(REPO_DIR, 'dinov3_vitl16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
180
+ dinov3_vit7b16 = torch.hub.load(REPO_DIR, 'dinov3_vit7b16', source='local', weights=<CHECKPOINT/URL/OR/PATH>)
181
+ ```
182
+
183
+ ### Pretrained backbones (via Hugging Face [Transformers](https://huggingface.co/docs/transformers/))
184
+
185
+ All the backbones are available in the [DINOv3](https://huggingface.co/collections/facebook/dinov3-68924841bd6b561778e31009) collection on Hugging Face Hub and supported via the Hugging Face [Transformers](https://huggingface.co/docs/transformers/index) library. Please refer to the corresponding documentation for usage; below is a short example that demonstrates how to obtain an image embedding with either the `pipeline` API or the `AutoModel` class.
186
+
187
+ ```python
188
+ from transformers import pipeline
189
+ from transformers.image_utils import load_image
190
+
191
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
192
+ image = load_image(url)
193
+
194
+ feature_extractor = pipeline(
195
+ model="facebook/dinov3-convnext-tiny-pretrain-lvd1689m",
196
+ task="image-feature-extraction",
197
+ )
198
+ features = feature_extractor(image)
199
+ ```
200
+
201
+ ```python
202
+ import torch
203
+ from transformers import AutoImageProcessor, AutoModel
204
+ from transformers.image_utils import load_image
205
+
206
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
207
+ image = load_image(url)
208
+
209
+ pretrained_model_name = "facebook/dinov3-convnext-tiny-pretrain-lvd1689m"
210
+ processor = AutoImageProcessor.from_pretrained(pretrained_model_name)
211
+ model = AutoModel.from_pretrained(
212
+ pretrained_model_name,
213
+ device_map="auto",
214
+ )
215
+
216
+ inputs = processor(images=image, return_tensors="pt").to(model.device)
217
+ with torch.inference_mode():
218
+ outputs = model(**inputs)
219
+
220
+ pooled_output = outputs.pooler_output
221
+ print("Pooled output shape:", pooled_output.shape)
222
+ ```
223
+
224
+ where `model` and `pretrained_model_name` above can be one of:
225
+ - `facebook/dinov3-vits16-pretrain-lvd1689m`
226
+ - `facebook/dinov3-vits16plus-pretrain-lvd1689m`
227
+ - `facebook/dinov3-vitb16-pretrain-lvd1689m`
228
+ - `facebook/dinov3-vitl16-pretrain-lvd1689m`
229
+ - `facebook/dinov3-vith16plus-pretrain-lvd1689m`
230
+ - `facebook/dinov3-vit7b16-pretrain-lvd1689m`
231
+ - `facebook/dinov3-convnext-base-pretrain-lvd1689m`
232
+ - `facebook/dinov3-convnext-large-pretrain-lvd1689m`
233
+ - `facebook/dinov3-convnext-small-pretrain-lvd1689m`
234
+ - `facebook/dinov3-convnext-tiny-pretrain-lvd1689m`
235
+ - `facebook/dinov3-vitl16-pretrain-sat493m`
236
+ - `facebook/dinov3-vit7b16-pretrain-sat493m`
237
+
238
+ ### Image transforms
239
+
240
+ For models using the LVD-1689M weights (pretrained on web images), please use the following transform (standard ImageNet evaluation transform):
241
+
242
+ ```python
243
+ import torchvision
244
+
245
+ def make_transform(resize_size: int = 224):
246
+ to_tensor = transforms.ToTensor()
247
+ resize = transforms.Resize((resize_size, resize_size), antialias=True)
248
+ normalize = transforms.Normalize(
249
+ mean=(0.485, 0.456, 0.406),
250
+ std=(0.229, 0.224, 0.225),
251
+ )
252
+ return transforms.Compose([to_tensor, resize, normalize])
253
+ ```
254
+
255
+
256
+ For models using the SAT-493M weights (pretrained on satellite imagery), please use the following transform:
257
+
258
+
259
+ ```python
260
+ import torchvision
261
+
262
+ def make_transform(resize_size: int = 224):
263
+ to_tensor = transforms.ToTensor()
264
+ resize = transforms.Resize((resize_size, resize_size), antialias=True)
265
+ normalize = transforms.Normalize(
266
+ mean=(0.430, 0.411, 0.296),
267
+ std=(0.213, 0.156, 0.143),
268
+ )
269
+ return transforms.Compose([to_tensor, resize, normalize])
270
+ ```
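+
+ A minimal usage sketch (assuming a local RGB image; the resulting tensor can then be fed to one of the backbones loaded above):
+
+ ```python
+ from PIL import Image
+
+ transform = make_transform(224)
+ img = Image.open("<PATH/TO/AN/RGB/IMAGE>").convert("RGB")
+ batch = transform(img).unsqueeze(0)  # shape: [1, 3, 224, 224]
+ ```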
271
+
272
+ ### Pretrained heads - Image classification
273
+
274
+ <table style="margin: auto">
275
+ <thead>
276
+ <tr>
277
+ <th>Backbone</th>
278
+ <th>Pretraining<br/>Dataset</th>
279
+ <th>Head<br/>Dataset</th>
280
+ <th>Download</th>
281
+ </tr>
282
+ </thead>
283
+ <tbody>
284
+ <tr>
285
+ <td>ViT-7B/16</td>
286
+ <td align="center">LVD-1689M</td>
287
+ <td align="center">ImageNet</td>
288
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
289
+ </tr>
290
+ </tbody>
291
+ </table>
292
+
293
+
294
+ The (full) classifier models can be loaded via PyTorch Hub:
295
+
296
+ ```python
297
+ import torch
298
+
299
+ # DINOv3
300
+ dinov3_vit7b16_lc = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_lc', source="local", weights=<CLASSIFIER/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
301
+
302
+ ```
303
+
304
+ ### Pretrained heads - Depther trained on SYNTHMIX dataset
305
+
306
+ <table style="margin: auto">
307
+ <thead>
308
+ <tr>
309
+ <th>Backbone</th>
310
+ <th>Pretraining<br/>Dataset</th>
311
+ <th>Head<br/>Dataset</th>
312
+ <th>Download</th>
313
+ </tr>
314
+ </thead>
315
+ <tbody>
316
+ <tr>
317
+ <td>ViT-7B/16</td>
318
+ <td align="center">LVD-1689M</td>
319
+ <td align="center">SYNTHMIX</td>
320
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
321
+ </tr>
322
+ </tbody>
323
+ </table>
324
+
325
+
326
+ ```python
327
+ depther = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_dd', source="local", weights=<DEPTHER/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
328
+ ```
329
+
330
+ Full example of running the depther on an image:
331
+
332
+ ```python
333
+ from PIL import Image
334
+ import torch
335
+ from torchvision import transforms
336
+ import matplotlib.pyplot as plt
337
+ from matplotlib import colormaps
338
+
339
+ def get_img():
340
+ import requests
341
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
342
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
343
+ return image
344
+
345
+ def make_transform(resize_size: int | list[int] = 768):
346
+ to_tensor = transforms.ToTensor()
347
+ resize = transforms.Resize((resize_size, resize_size), antialias=True)
348
+ normalize = transforms.Normalize(
349
+ mean=(0.485, 0.456, 0.406),
350
+ std=(0.229, 0.224, 0.225),
351
+ )
352
+ return transforms.Compose([to_tensor, resize, normalize])
353
+
354
+ depther = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_dd', source="local", weights=<DEPTHER/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
355
+
356
+ img_size = 1024
357
+ img = get_img()
358
+ transform = make_transform(img_size)
359
+ with torch.inference_mode():
360
+ with torch.autocast('cuda', dtype=torch.bfloat16):
361
+ batch_img = transform(img)[None]
362
+ batch_img = batch_img  # move to the model's device here if needed, e.g. batch_img.cuda()
363
+ depths = depther(batch_img)
364
+
365
+ plt.figure(figsize=(12, 6))
366
+ plt.subplot(121)
367
+ plt.imshow(img)
368
+ plt.axis("off")
369
+ plt.subplot(122)
370
+ plt.imshow(depths[0,0].cpu(), cmap=colormaps["Spectral"])
371
+ plt.axis("off")
372
+
373
+ ```
374
+
375
+ ### Pretrained heads - Detector trained on COCO2017 dataset
376
+
377
+ <table style="margin: auto">
378
+ <thead>
379
+ <tr>
380
+ <th>Backbone</th>
381
+ <th>Pretraining<br/>Dataset</th>
382
+ <th>Head<br/>Dataset</th>
383
+ <th>Download</th>
384
+ </tr>
385
+ </thead>
386
+ <tbody>
387
+ <tr>
388
+ <td>ViT-7B/16</td>
389
+ <td align="center">LVD-1689M</td>
390
+ <td align="center">COCO2017</td>
391
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
392
+ </tr>
393
+ </tbody>
394
+ </table>
395
+
396
+
397
+ ```python
398
+ detector = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_de', source="local", weights=<DETECTOR/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
399
+ ```
400
+
401
+ ### Pretrained heads - Segmentor trained on ADE20K dataset
402
+
403
+ <table style="margin: auto">
404
+ <thead>
405
+ <tr>
406
+ <th>Backbone</th>
407
+ <th>Pretraining<br/>Dataset</th>
408
+ <th>Head<br/>Dataset</th>
409
+ <th>Download</th>
410
+ </tr>
411
+ </thead>
412
+ <tbody>
413
+ <tr>
414
+ <td>ViT-7B/16</td>
415
+ <td align="center">LVD-1689M</td>
416
+ <td align="center">ADE20K</td>
417
+ <td align="center"><a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a></td>
418
+ </tr>
419
+ </tbody>
420
+ </table>
421
+
422
+ ```python
423
+ segmentor = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_ms', source="local", weights=<SEGMENTOR/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
424
+ ```
425
+
426
+ Full example of running the segmentor on an image:
427
+
428
+ ```python
429
+ import sys
430
+ sys.path.append(REPO_DIR)
431
+
432
+ from PIL import Image
433
+ import torch
434
+ from torchvision import transforms
435
+ import matplotlib.pyplot as plt
436
+ from matplotlib import colormaps
437
+ from functools import partial
438
+ from dinov3.eval.segmentation.inference import make_inference
439
+
440
+
441
+ def get_img():
442
+ import requests
443
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
444
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
445
+ return image
446
+
447
+ def make_transform(resize_size: int | list[int] = 768):
448
+ to_tensor = transforms.ToTensor()
449
+ resize = transforms.Resize((resize_size, resize_size), antialias=True)
450
+ normalize = transforms.Normalize(
451
+ mean=(0.485, 0.456, 0.406),
452
+ std=(0.229, 0.224, 0.225),
453
+ )
454
+ return transforms.Compose([to_tensor, resize, normalize])
455
+
456
+ segmentor = torch.hub.load(REPO_DIR, 'dinov3_vit7b16_ms', source="local", weights=<SEGMENTOR/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
457
+
458
+ img_size = 896
459
+ img = get_img()
460
+ transform = make_transform(img_size)
461
+ with torch.inference_mode():
462
+ with torch.autocast('cuda', dtype=torch.bfloat16):
463
+ batch_img = transform(img)[None]
464
+ pred_vit7b = segmentor(batch_img) # raw predictions
465
+ # actual segmentation map
466
+ segmentation_map_vit7b = make_inference(
467
+ batch_img,
468
+ segmentor,
469
+ inference_mode="slide",
470
+ decoder_head_type="m2f",
471
+ rescale_to=(img.size[-1], img.size[-2]),
472
+ n_output_channels=150,
473
+ crop_size=(img_size, img_size),
474
+ stride=(img_size, img_size),
475
+ output_activation=partial(torch.nn.functional.softmax, dim=1),
476
+ ).argmax(dim=1, keepdim=True)
477
+ plt.figure(figsize=(12, 6))
478
+ plt.subplot(121)
479
+ plt.imshow(img)
480
+ plt.axis("off")
481
+ plt.subplot(122)
482
+ plt.imshow(segmentation_map_vit7b[0,0].cpu(), cmap=colormaps["Spectral"])
483
+ plt.axis("off")
484
+ ```
485
+
486
+
487
+
488
+
489
+ ### Pretrained heads - Zero-shot tasks with `dino.txt`
490
+
491
+ <table style="margin: auto">
492
+ <thead>
493
+ <tr>
494
+ <th>Backbone</th>
495
+ <th>Download</th>
496
+ </tr>
497
+ </thead>
498
+ <tbody>
499
+ <tr>
500
+ <td>ViT-L/16 distilled</td>
501
+ <td align="center">
502
+ <a href="https://ai.meta.com/resources/models-and-libraries/dinov3-downloads/">[link]</a>,
503
+ <a href="https://dl.fbaipublicfiles.com/dinov3/thirdparty/bpe_simple_vocab_16e6.txt.gz">vocabulary</a>,
504
+ <a href="https://dl.fbaipublicfiles.com/dinov2/thirdparty/LICENSE">vocabulary license</a>
505
+ </td>
506
+ </tr>
507
+ </tbody>
508
+ </table>
509
+
510
+ The (full) dino.txt model can be loaded via PyTorch Hub:
511
+
512
+ ```python
513
+ import torch
514
+ # DINOv3
515
+ dinov3_vitl16_dinotxt_tet1280d20h24l, tokenizer = torch.hub.load(REPO_DIR, 'dinov3_vitl16_dinotxt_tet1280d20h24l', weights=<DINOTXT/CHECKPOINT/URL/OR/PATH>, backbone_weights=<BACKBONE/CHECKPOINT/URL/OR/PATH>)
516
+ ```
517
+
518
+
519
+ ## Installation
520
+
521
+ The training and evaluation code requires PyTorch version >= 2.7.1 as well as a few other 3rd party packages. Note that the code has only been tested with the specified versions and also expects a Linux environment. To setup all the required dependencies for training and evaluation, please follow the instructions below:
522
+
523
+ *[micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html)* **(Recommended)** - Clone the repository and then create and activate a `dinov3` conda environment using the provided environment definition:
524
+
525
+ ```shell
526
+ micromamba env create -f conda.yaml
527
+ micromamba activate dinov3
528
+ ```
529
+
530
+ ## Getting started
531
+
532
+ Several notebooks are provided to get started applying DINOv3:
533
+ - [PCA of patch features](notebooks/pca.ipynb): display the PCA of DINOv3 patch features on a foreground object (rainbow visualizations from the paper) [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/pca.ipynb)
534
+ - [Foreground segmentation](notebooks/foreground_segmentation.ipynb): train a linear foreground segmentation model based on DINOv3 features [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/foreground_segmentation.ipynb)
535
+ - [Dense and sparse matching](notebooks/dense_sparse_matching.ipynb): match patches from objects on two different images based on DINOv3 features [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/dense_sparse_matching.ipynb)
536
+ - [Segmentation tracking](notebooks/segmentation_tracking.ipynb): video segmentation tracking using a non-parametric method based on DINOv3 features [[Run in Google Colab]](https://colab.research.google.com/github/facebookresearch/dinov3/blob/main/notebooks/segmentation_tracking.ipynb)
537
+
538
+ ## Data preparation
539
+
540
+ ### ImageNet-1k
541
+
542
+ The root directory of the dataset should hold the following contents:
543
+
544
+ - `<ROOT>/test/ILSVRC2012_test_00000001.JPEG`
545
+ - `<ROOT>/test/[..]`
546
+ - `<ROOT>/test/ILSVRC2012_test_00100000.JPEG`
547
+ - `<ROOT>/train/n01440764/n01440764_10026.JPEG`
548
+ - `<ROOT>/train/[...]`
549
+ - `<ROOT>/train/n15075141/n15075141_9993.JPEG`
550
+ - `<ROOT>/val/n01440764/ILSVRC2012_val_00000293.JPEG`
551
+ - `<ROOT>/val/[...]`
552
+ - `<ROOT>/val/n15075141/ILSVRC2012_val_00049174.JPEG`
553
+ - `<ROOT>/labels.txt`
554
+
555
+ The provided dataset implementation expects a few additional metadata files to be present under the extra directory:
556
+
557
+ - `<EXTRA>/class-ids-TRAIN.npy`
558
+ - `<EXTRA>/class-ids-VAL.npy`
559
+ - `<EXTRA>/class-names-TRAIN.npy`
560
+ - `<EXTRA>/class-names-VAL.npy`
561
+ - `<EXTRA>/entries-TEST.npy`
562
+ - `<EXTRA>/entries-TRAIN.npy`
563
+ - `<EXTRA>/entries-VAL.npy`
564
+
565
+ These metadata files can be generated (once) with the following lines of Python code:
566
+
567
+ ```python
568
+ from dinov3.data.datasets import ImageNet
569
+
570
+ for split in ImageNet.Split:
571
+ dataset = ImageNet(split=split, root="<ROOT>", extra="<EXTRA>")
572
+ dataset.dump_extra()
573
+ ```
574
+
575
+ Note that the root and extra directories do not have to be distinct directories.
576
+
577
+ ### ImageNet-22k
578
+
579
+ Please adapt the [dataset class](dinov3/data/datasets/image_net_22k.py) to match your local setup.
580
+
581
+ <br />
582
+
583
+ :warning: To execute the commands provided in the next sections for training and evaluation, the `dinov3` package should be included in the Python module search path, i.e. simply prefix the command to run with `PYTHONPATH=.`.
584
+
585
+ ## Training
586
+
587
+ ### Fast setup: training DINOv3 ViT-L/16 on ImageNet-1k
588
+
589
+ Run DINOv3 pre-training on 4 H100-80GB nodes (32 GPUs) in a SLURM cluster environment with submitit:
590
+
591
+ ```shell
592
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
593
+ --nodes 4 \
594
+ --config-file dinov3/configs/train/vitl_im1k_lin834.yaml \
595
+ --output-dir <PATH/TO/OUTPUT/DIR> \
596
+ train.dataset_path=ImageNet22k:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
597
+ ```
598
+ Training time is approximately 14 hours and the resulting checkpoint should reach 82.0% on k-NN eval and 83.5% on linear eval.
599
+
600
+ The training code saves the weights of the teacher in the eval folder every 12500 iterations for evaluation.
601
+
602
+ ### Exact DINOv3 setup: training DINOv3 ViT-7B/16
603
+
604
+ DINOv3 ViT-7B/16 is trained on a private dataset. The training involves 3 stages:
605
+ - Pretraining
606
+ - Gram anchoring
607
+ - High resolution adaptation
608
+
609
+ #### Pretraining
610
+
611
+ Launch DINOv3 ViT-7B/16 pretraining on 32 nodes (256 GPUs) in a SLURM cluster environment with submitit.
612
+
613
+ ```shell
614
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
615
+ --nodes 32 \
616
+ --config-file dinov3/configs/train/dinov3_vit7b16_pretrain.yaml \
617
+ --output-dir <PATH/TO/OUTPUT/DIR> \
618
+ train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
619
+ ```
620
+
621
+ #### Gram anchoring
622
+
623
+ ```shell
624
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
625
+ --nodes 32 \
626
+ --config-file dinov3/configs/train/dinov3_vit7b16_gram_anchor.yaml \
627
+ --output-dir <PATH/TO/OUTPUT/DIR> \
628
+ train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
629
+ gram.ckpt=<PATH/TO/GRAM_TEACHER_FROM_PREVIOUS_STEP>
630
+ ```
631
+
632
+ #### High-resolution adaptation
633
+
634
+
635
+ ```shell
636
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
637
+ --nodes 32 \
638
+ --config-file dinov3/configs/train/dinov3_vit7b16_high_res_adapt.yaml \
639
+ --output-dir <PATH/TO/OUTPUT/DIR> \
640
+ train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
641
+ gram.ckpt=<PATH/TO/TEACHER_FROM_GRAM> \
642
+ student.resume_from_teacher_chkpt=<PATH/TO/TEACHER_FROM_GRAM>
643
+ ```
644
+
645
+ ## Multi-distillation
646
+
647
+ ### Test setup
648
+
649
+ ```shell
650
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/train/train.py \
651
+ --nodes 1 \
652
+ --config-file dinov3/configs/train/multi_distillation_test.yaml \
653
+ --output-dir <PATH/TO/OUTPUT/DIR> \
654
+ --multi-distillation \
655
+ train.dataset_path=<DATASET>:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
656
+ ```
657
+
658
+ ## Evaluation
659
+
660
+ The training code regularly saves the teacher weights. To evaluate a model, run one of the following evaluations on a single node:
661
+
662
+
663
+ ### Logistic regression classification on ImageNet-1k
664
+
665
+ ```shell
666
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/log_regression.py \
667
+ model.config_file=<PATH/TO/OUTPUT/DIR>/config.yaml \
668
+ model.pretrained_weights=<PATH/TO/OUTPUT/DIR>/teacher_checkpoint.pth \
669
+ output_dir=<PATH/TO/OUTPUT/DIR> \
670
+ train.dataset=ImageNet:split=TRAIN:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
671
+ eval.test_dataset=ImageNet:split=VAL:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
672
+ ```
673
+
674
+ ### k-NN classification on ImageNet-1k
675
+
676
+ ```shell
677
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/knn.py \
678
+ model.config_file=<PATH/TO/OUTPUT/DIR>/config.yaml \
679
+ model.pretrained_weights=<PATH/TO/OUTPUT/DIR>/teacher_checkpoint.pth \
680
+ output_dir=<PATH/TO/OUTPUT/DIR> \
681
+ train.dataset=ImageNet:split=TRAIN:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
682
+ eval.test_dataset=ImageNet:split=VAL:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
683
+ ```
684
+
685
+ ### Linear classification with data augmentation on ImageNet-1k
686
+
687
+ ```shell
688
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/linear.py \
689
+ model.config_file=<PATH/TO/OUTPUT/DIR>/config.yaml \
690
+ model.pretrained_weights=<PATH/TO/OUTPUT/DIR>/teacher_checkpoint.pth \
691
+ output_dir=<PATH/TO/OUTPUT/DIR> \
692
+ train.dataset=ImageNet:split=TRAIN:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET> \
693
+ train.val_dataset=ImageNet:split=VAL:root=<PATH/TO/DATASET>:extra=<PATH/TO/DATASET>
694
+ ```
695
+
696
+
697
+ ### Text alignment on DINOv3 using dino.txt
698
+
699
+ Text alignment can be done following the method from `dino.txt` aka [DINOv2 Meets Text](https://arxiv.org/abs/2412.16334).
700
+
701
+ ```shell
702
+ PYTHONPATH=${PWD} python -m dinov3.run.submit dinov3/eval/text/train_dinotxt.py \
703
+ --nodes 4 \
705
+ trainer_config_file="<PATH/TO/DINOv3/TEXT/CONFIG>" \
706
+ output-dir=<PATH/TO/OUTPUT/DIR>
707
+ ```
708
+ Launching the above command trains text alignment on 4 nodes with 8 GPUs each (32 GPUs in total).
709
+ Please note that the text alignment model in the DINOv3 paper was trained on a private dataset; the example config in ```dinov3/eval/text/configs/dinov3_vitl_text.yaml``` uses the ```CocoCaptions``` dataset for illustration purposes only.
710
+ Please adapt the provided ```CocoCaptions``` dataset class to your local setup; the dataset can be found [here](https://www.kaggle.com/datasets/nikhil7280/coco-image-caption).
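+ 
+ As a starting point, a minimal sketch built on torchvision's ```CocoCaptions``` (the paths and the returned (image, captions) format are assumptions; align them with the provided dataset class):
+ 
+ ```python
+ # Minimal sketch of a COCO captions dataset based on torchvision.
+ # Paths are placeholders; adapt the output format to the provided dataset class.
+ from torchvision import transforms
+ from torchvision.datasets import CocoCaptions
+ 
+ transform = transforms.Compose([
+     transforms.Resize(256),
+     transforms.CenterCrop(224),
+     transforms.ToTensor(),
+ ])
+ dataset = CocoCaptions(
+     root="<PATH/TO/COCO/IMAGES>",
+     annFile="<PATH/TO/COCO/CAPTION/ANNOTATIONS.json>",
+     transform=transform,
+ )
+ image, captions = dataset[0]  # captions: list of caption strings for this image
+ ```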
711
+
712
+ ## License
713
+
714
+ DINOv3 code and model weights are released under the DINOv3 License. See [LICENSE.md](LICENSE.md) for additional details.
715
+
716
+ ## Contributing
717
+
718
+ See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
719
+
720
+ ## Citing DINOv3
721
+
722
+ If you find this repository useful, please consider giving a star :star: and citation :t-rex::
723
+
724
+ ```
725
+ @misc{simeoni2025dinov3,
726
+ title={{DINOv3}},
727
+ author={Sim{\'e}oni, Oriane and Vo, Huy V. and Seitzer, Maximilian and Baldassarre, Federico and Oquab, Maxime and Jose, Cijo and Khalidov, Vasil and Szafraniec, Marc and Yi, Seungeun and Ramamonjisoa, Micha{\"e}l and Massa, Francisco and Haziza, Daniel and Wehrstedt, Luca and Wang, Jianyuan and Darcet, Timoth{\'e}e and Moutakanni, Th{\'e}o and Sentana, Leonel and Roberts, Claire and Vedaldi, Andrea and Tolan, Jamie and Brandt, John and Couprie, Camille and Mairal, Julien and J{\'e}gou, Herv{\'e} and Labatut, Patrick and Bojanowski, Piotr},
728
+ year={2025},
729
+ eprint={2508.10104},
730
+ archivePrefix={arXiv},
731
+ primaryClass={cs.CV},
732
+ url={https://arxiv.org/abs/2508.10104},
733
+ }
734
+ ```
depth_anything_v2_metric/depth_anything_v2/dinov3/conda.yaml ADDED
@@ -0,0 +1,23 @@
1
+ name: dinov3
2
+ channels:
3
+ - defaults
4
+ - conda-forge
5
+ dependencies:
6
+ - python=3.11
7
+ - omegaconf
8
+ - pip
9
+ - pip:
10
+ - ftfy # needed for dino.txt
11
+ - iopath
12
+ - omegaconf
13
+ - pandas
14
+ - regex # needed for dino.txt
16
+ - scikit-learn
17
+ - scikit-learn-intelex
18
+ - submitit
19
+ - termcolor
20
+ - torch
21
+ - torchvision
22
+ - torchmetrics
23
+
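The environment above can typically be created with conda (a standard invocation, assuming a conda/mamba install and that the file is saved as `conda.yaml`):

```shell
conda env create -f conda.yaml
conda activate dinov3
```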
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ __version__ = "0.0.1"
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ from .checkpointer import (
7
+ CheckpointRetentionPolicy,
8
+ cleanup_checkpoint,
9
+ find_all_checkpoints,
10
+ find_latest_checkpoint,
11
+ init_fsdp_model_from_checkpoint,
12
+ init_model_from_checkpoint_for_evals,
13
+ keep_checkpoint_copy,
14
+ keep_last_n_checkpoints,
15
+ load_checkpoint,
16
+ register_dont_save_hooks,
17
+ save_checkpoint,
18
+ )
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/checkpointer/checkpointer.py ADDED
@@ -0,0 +1,352 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ """
7
+ Suggested file structure:
8
+
9
+ output_dir/
10
+ |-- ckpt/
11
+ | |-- 0/
12
+ | |-- 99/
13
+ | |-- 199/
14
+ | |-- 199_keep/
15
+ | |-- 299/
16
+ | `-- ...
17
+ `-- eval/
18
+ `-- 0/
19
+ `-- 99/
20
+ `-- ckpt/
21
+
22
+ Distributed checkpointer docs:
23
+ - https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html
24
+ - https://pytorch.org/docs/stable/distributed.checkpoint.html
25
+ """
26
+
27
+ import logging
28
+ import shutil
29
+ import subprocess
30
+ import tempfile
31
+ from enum import Enum
32
+ from pathlib import Path
33
+ from typing import List, Sequence, Set
34
+
35
+ import torch
36
+ import torch.distributed as dist
37
+ import torch.distributed.checkpoint as dcp
38
+ import torch.distributed.checkpoint.filesystem as dcpfs
39
+ import torch.distributed.checkpoint.state_dict as dcpsd
40
+ from torch.distributed.checkpoint.stateful import Stateful
41
+
42
+ logger = logging.getLogger("dinov3")
43
+
44
+
45
+ class CheckpointRetentionPolicy(Enum):
46
+ ALL = "all" # keep all checkpoints
47
+ BEST = "best"
48
+ LAST = "last"
49
+ LAST_AND_BEST = "last_and_best"
50
+ NONE = "none" # do not keep any checkpoints
51
+
52
+ @property
53
+ def keep_filters(self) -> Set[str]:
54
+ """Files that match these patterns are not deleted by cleanup"""
55
+ if self == CheckpointRetentionPolicy.LAST:
56
+ return set(["final"])
57
+ if self == CheckpointRetentionPolicy.BEST:
58
+ return set(["best"])
59
+ if self == CheckpointRetentionPolicy.LAST_AND_BEST:
60
+ return set(["final", "best"])
61
+ if self == CheckpointRetentionPolicy.ALL:
62
+ return set()
63
+ return set()
64
+
65
+ @property
66
+ def max_to_keep(self) -> int | None:
67
+ """
68
+ maximum "periodic" checkpoints to keep concurrently, ie. saved with `step` and not `save`. `None` for keep all
69
+ """
70
+ if self == CheckpointRetentionPolicy.ALL:
71
+ return None
72
+ return 1
73
+
74
+
75
+ def save_checkpoint(
76
+ ckpt_dir: str | Path, # output_dir/ckpt/199
77
+ *,
78
+ iteration: int | str,
79
+ model: torch.nn.Module,
80
+ optimizer: torch.optim.Optimizer | None = None,
81
+ overwrite: bool = True,
82
+ process_group: dist.ProcessGroup = None,
83
+ **others: Stateful,
84
+ ):
85
+ """Save a plain/DDP/FSDP/FSDP2 model, its optimizer, an integer iteration and other stateful objects."""
86
+ rank = torch.distributed.get_rank(group=process_group)
87
+
88
+ # Rank 0 checks if the checkpoint directory exists, but all ranks need to know if if exists,
89
+ # so they can raise an error when overwrite is False. If overwrite is True, rank 0 will delete it
90
+ # and other ranks wait for the deletion to finish.
91
+ ckpt_dir = Path(ckpt_dir)
92
+ ckpt_dir_exists = [ckpt_dir.exists() if rank == 0 else None]
93
+ src_rank = 0
94
+ if process_group is not None:
95
+ src_rank = torch.distributed.get_global_rank(group=process_group, group_rank=0)
96
+ torch.distributed.broadcast_object_list(ckpt_dir_exists, src=src_rank, group=process_group)
97
+ ckpt_dir_exists = ckpt_dir_exists[0]
98
+ if ckpt_dir_exists:
99
+ if overwrite:
100
+ if rank == 0:
101
+ if ckpt_dir.is_dir():
102
+ shutil.rmtree(ckpt_dir)
103
+ else:
104
+ ckpt_dir.unlink()
105
+ logger.info(f"Deleted: {ckpt_dir}")
106
+ torch.distributed.barrier(group=process_group)
107
+ else:
108
+ raise RuntimeError(f"Checkpoint already exists: {ckpt_dir}")
109
+
110
+ # Rank 0 creates a temporary directory for the checkpoint and broadcasts the name to all ranks.
111
+ ckpt_dir.parent.mkdir(parents=True, exist_ok=True)
112
+ ckpt_dir_tmp = [tempfile.mkdtemp(dir=ckpt_dir.parent, prefix=ckpt_dir.name) if rank == 0 else None]
113
+ torch.distributed.broadcast_object_list(ckpt_dir_tmp, src=src_rank, group=process_group)
114
+ ckpt_dir_tmp = Path(ckpt_dir_tmp[0])
115
+
116
+ to_save = {"iteration": iteration}
117
+ to_save["model"] = dcpsd.get_model_state_dict(model)
118
+ if optimizer is not None:
119
+ to_save["optimizer"] = dcpsd.get_optimizer_state_dict(model, optimizer)
120
+ to_save.update(others)
121
+ dcp.save(
122
+ to_save,
123
+ storage_writer=dcpfs.FileSystemWriter(ckpt_dir_tmp),
124
+ process_group=process_group,
125
+ )
126
+
127
+ # Rank 0 renames the temporary directory to the final checkpoint directory. All ranks wait for the rename.
128
+ if rank == 0:
129
+ ckpt_dir_tmp.rename(ckpt_dir)
130
+ torch.distributed.barrier()
131
+
132
+ logger.info(f"Saved: {ckpt_dir}")
133
+
134
+
135
+ def load_checkpoint(
136
+ ckpt_dir: str | Path, # output_dir/ckpt/199
137
+ *,
138
+ model: torch.nn.Module,
139
+ optimizer: torch.optim.Optimizer | None = None,
140
+ strict_loading: bool = True,
141
+ process_group: dist.ProcessGroup = None,
142
+ **others: Stateful,
143
+ ) -> int | None:
144
+ """
145
+ Load a plain/DDP/FSDP/FSDP2 model, its optimizer, an integer iteration and other stateful objects.
146
+ Can you take a checkpoint saved on N ranks and load it on M ranks? Sure you can!
147
+ Activation checkpointing and torch-compile can also be different between save and load, no problem.
148
+ """
149
+ ckpt_dir = Path(ckpt_dir)
150
+ to_load = {"iteration": None}
151
+ to_load["model"] = dcpsd.get_model_state_dict(model)
152
+ if optimizer is not None:
153
+ to_load["optimizer"] = dcpsd.get_optimizer_state_dict(model, optimizer)
154
+ to_load.update(others)
155
+ dcp.load(
156
+ to_load,
157
+ storage_reader=dcpfs.FileSystemReader(ckpt_dir),
158
+ planner=dcp.default_planner.DefaultLoadPlanner(allow_partial_load=not strict_loading),
159
+ process_group=process_group,
160
+ )
161
+ iteration = to_load["iteration"]
162
+ dcpsd.set_model_state_dict(model, to_load["model"])
163
+ if optimizer is not None:
164
+ dcpsd.set_optimizer_state_dict(model, optimizer, to_load["optimizer"])
165
+ logger.info(f"Loaded: {ckpt_dir}")
166
+ return iteration
167
+
168
+
169
+ def register_dont_save_hooks(module: torch.nn.Module, dont_save: Sequence[str]):
170
+ """
171
+ Registers save/load state dict hooks such that the weights in `dont_save` are not persisted in the checkpoint.
172
+
173
+ Typical use case: a classification model composed of a frozen backbone and a trainable head.
174
+ If the frozen backbone is loaded from torch hub, it does't make sense to save a copy of it in each checkpoint.
175
+ """
176
+
177
+ def state_dict_post_hook(module, state_dict, prefix, local_metadata):
178
+ # Remove frozen weights so they won't get saved.
179
+ # If this module is not the top-level module, its weights will have a prefix in the state dict.
180
+ nonlocal _dont_save
181
+ for k in _dont_save:
182
+ del state_dict[prefix + k]
183
+
184
+ def load_state_dict_pre_hook(
185
+ module,
186
+ state_dict,
187
+ prefix,
188
+ local_metadata,
189
+ strict,
190
+ missing_keys,
191
+ unexpected_keys,
192
+ error_msgs,
193
+ ):
194
+ # This pre hook exists only to pass the prefix to the post hook when loading the state dict.
195
+ nonlocal _prefix
196
+ assert _prefix is None
197
+ _prefix = prefix
198
+
199
+ def load_state_dict_post_hook(module, incompatible_keys):
200
+ # Remove the frozen weights from the missing keys so they don't raise an error.
201
+ nonlocal _prefix
202
+ assert _prefix is not None
203
+ to_remove = []
204
+ for missing_key in incompatible_keys.missing_keys:
205
+ k = missing_key.removeprefix(_prefix)
206
+ k = k.replace("_checkpoint_wrapped_module.", "") # Added by activation checkpointing
207
+ if k in _dont_save:
208
+ to_remove.append(missing_key)
209
+ for r in to_remove:
210
+ incompatible_keys.missing_keys.remove(r)
211
+ _prefix = None
212
+
213
+ _dont_save = set(name.replace("_checkpoint_wrapped_module.", "") for name in dont_save)
214
+ _prefix = None
215
+ module.register_state_dict_post_hook(state_dict_post_hook)
216
+ module.register_load_state_dict_pre_hook(load_state_dict_pre_hook)
217
+ module.register_load_state_dict_post_hook(load_state_dict_post_hook)
218
+
219
+
220
+ def find_all_checkpoints(ckpt_dir: Path | str) -> list[Path]:
221
+ """Find all checkpoints in a directory, i.e. subdirs with integer name. Sorted from first to last."""
222
+ ckpt_dir = Path(ckpt_dir)
223
+ if not ckpt_dir.is_dir():
224
+ return []
225
+ checkpoints = [p for p in ckpt_dir.iterdir() if p.is_dir() and _is_int(p.name)]
226
+ checkpoints.sort(key=lambda p: int(p.name))
227
+ return checkpoints
228
+
229
+
230
+ def find_latest_checkpoint(ckpt_dir: Path | str) -> Path | None:
231
+ """Find the latest checkpoint in a directory, i.e. the subdir with the highest integer name."""
232
+ checkpoints = find_all_checkpoints(ckpt_dir)
233
+ if len(checkpoints) == 0:
234
+ return None
235
+ return checkpoints[-1]
236
+
237
+
238
+ def keep_last_n_checkpoints(ckpt_dir: Path | str, n: int | None):
239
+ """In a directory with integer-named subdirs, keep only the n subdirs with the highest number."""
240
+ if n is None:
241
+ return
242
+ checkpoints = find_all_checkpoints(ckpt_dir)
243
+ for ckpt_dir in checkpoints[:-n]:
244
+ try:
245
+ shutil.rmtree(ckpt_dir)
246
+ logger.info(f"Deleted: {ckpt_dir}")
247
+ except Exception:
248
+ logger.exception(f"Failed to delete: {ckpt_dir}")
249
+
250
+
251
+ def keep_checkpoint_copy(src: Path | str):
252
+ """Copy a file/directory next to itself with a _keep suffix. Files are hardlinked."""
253
+ src = Path(src)
254
+ dst = src.parent / f"{src.name}_keep"
255
+ subprocess.check_output(["cp", "--recursive", "--link", src, dst])
256
+ logger.info(f"Copied: {src} -> {dst}")
257
+
258
+
259
+ def _is_int(s: str) -> bool:
260
+ try:
261
+ int(s)
262
+ return True
263
+ except ValueError:
264
+ return False
265
+
266
+
267
+ # Initialize a FSDP2 model from DCP or PyTorch standard checkpoint
268
+ def init_fsdp_model_from_checkpoint(
269
+ model: torch.nn.Module,
270
+ checkpoint_path: str,
271
+ skip_load_keys: List[str] | None = None,
272
+ keys_not_sharded: List[str] | None = None,
273
+ process_group: dist.ProcessGroup = None,
274
+ ):
275
+ if not Path(checkpoint_path).is_dir(): # PyTorch standard checkpoint
276
+ logger.info(f"Loading pretrained weights from {checkpoint_path}")
277
+ chkpt = torch.load(checkpoint_path, map_location="cpu")["teacher"]
278
+ from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
279
+
280
+ if process_group is None:
281
+ world_mesh = init_device_mesh(
282
+ "cuda",
283
+ mesh_shape=(dist.get_world_size(),),
284
+ mesh_dim_names=("dp",),
285
+ )
286
+ else:
287
+ world_mesh = DeviceMesh.from_group(process_group, "cuda")
288
+ chkpt = {
289
+ key: (
290
+ torch.distributed.tensor.distribute_tensor(tensor, world_mesh, src_data_rank=None)
291
+ if not any(key_not_sharded in key for key_not_sharded in keys_not_sharded)
292
+ else tensor
293
+ )
294
+ for key, tensor in chkpt.items()
295
+ }
296
+ model.load_state_dict(
297
+ {
298
+ key: tensor
299
+ for key, tensor in chkpt.items()
300
+ if not any(skip_load_key in key for skip_load_key in skip_load_keys)
301
+ }
302
+ )
303
+ else: # DCP checkpoint
304
+ load_checkpoint(ckpt_dir=checkpoint_path, model=model, process_group=process_group)
305
+
306
+
307
+ # Initialize a standard non distributed PyTorch model from PyTorch standard checkpoint for evals
308
+ def init_model_from_checkpoint_for_evals(
309
+ model: torch.nn.Module, pretrained_weights: str | Path, checkpoint_key: str = None
310
+ ):
311
+ state_dict = torch.load(pretrained_weights, map_location="cpu")
312
+ if checkpoint_key is not None and checkpoint_key in state_dict:
313
+ logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
314
+ state_dict = state_dict[checkpoint_key]
315
+ # remove `module.` prefix
316
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
317
+ # remove `backbone.` prefix induced by multicrop wrapper
318
+ state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
319
+ msg = model.load_state_dict(state_dict, strict=False)
320
+ logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
321
+
322
+
323
+ def cleanup_checkpoint(ckpt_dir: str, checkpoint_retention_policy: CheckpointRetentionPolicy):
324
+ """
325
+ ckpt_dir is the directory containing each individual checkpoint directories (either at iteration, best (validation performance) or final)
326
+ |-- ckpt_dir/
327
+ | |-- 0/
328
+ | |--checkpoint.pth or dcp_sharded_checkpoint_dir
329
+ | |-- 99/
330
+ |--checkpoint.pth or dcp_sharded_checkpoint_dir
331
+ | |-- 199/
332
+ |--checkpoint.pth or dcp_sharded_checkpoint_dir
333
+ | |-- best/
334
+ |--checkpoint.pth or dcp_sharded_checkpoint_dir
335
+ | |-- 299/
336
+ |--checkpoint.pth or dcp_sharded_checkpoint_dir
337
+ | |-- final/
338
+ |--checkpoint.pth or dcp_sharded_checkpoint_dir
339
+ """
340
+ ckpt_dir = Path(ckpt_dir)
341
+ if not ckpt_dir.is_dir():
342
+ return []
343
+ checkpoint_filters = checkpoint_retention_policy.keep_filters
344
+ checkpoints = [p for p in ckpt_dir.iterdir() if p.is_dir()]
345
+ for checkpoint in checkpoints:
346
+ if checkpoint in checkpoint_filters:
347
+ continue
348
+ try:
349
+ shutil.rmtree(checkpoint)
350
+ logger.info(f"Deleted: {checkpoint}")
351
+ except Exception:
352
+ logger.exception(f"Failed to delete: {checkpoint}")
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ from .config import (
7
+ DinoV3SetupArgs,
8
+ apply_scaling_rules_to_cfg,
9
+ exit_job,
10
+ get_cfg_from_args,
11
+ get_default_config,
12
+ setup_config,
13
+ setup_job,
14
+ setup_multidistillation,
15
+ write_config,
16
+ )
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/config.py ADDED
@@ -0,0 +1,222 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ import logging
7
+ import math
8
+ import os
9
+ import pathlib
10
+ import sys
11
+ from dataclasses import dataclass, field
12
+ from datetime import timedelta
13
+ from typing import Any, List, Optional, Sequence, Tuple
14
+
15
+ from omegaconf import DictConfig, OmegaConf
16
+
17
+ import dinov3.distributed as distributed
18
+ from dinov3.logging import cleanup_logging, setup_logging
19
+ from dinov3.utils import fix_random_seeds, get_conda_env, get_sha
20
+
21
+ logger = logging.getLogger("dinov3")
22
+
23
+
24
+ @dataclass
25
+ class DinoV3SetupArgs:
26
+ config_file: str
27
+ pretrained_weights: str | None = None
28
+ shard_unsharded_model: bool = False
29
+ output_dir: str = ""
30
+ opts: List[Any] = field(default_factory=lambda: [])
31
+
32
+ def __post_init__(self):
33
+ # When loaded from benchmark.yaml, self.opts is a frozen omegaconf.ListConfig,
34
+ # which works everywhere except when we want to modify it or when
35
+ # we try to json-serialize it. So we convert it to a regular list here.
36
+ if OmegaConf.is_config(self.opts):
37
+ self.opts = OmegaConf.to_object(self.opts)
38
+
39
+
40
+ def apply_scaling_rules_to_cfg(cfg): # to fix
41
+ assert distributed.is_enabled(), "Setup distributed to get global size !"
42
+ if "schedules" in cfg:
43
+ # For schedules v2, the scaling rules are applied when building the schedules, the config is not modified
44
+ return cfg
45
+
46
+ if cfg.optim.scaling_rule == "linear_wrt_256":
47
+ old_lr = cfg.optim.lr
48
+ cfg.optim.lr *= cfg.train.batch_size_per_gpu * distributed.get_world_size() / 256.0
49
+ logger.info(f"linear scaling learning rate; old: {old_lr}, new: {cfg.optim.lr}")
50
+ elif cfg.optim.scaling_rule == "sqrt_wrt_1024":
51
+ old_lr = cfg.optim.lr
52
+ cfg.optim.lr *= 4 * math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_world_size() / 1024.0)
53
+ logger.info(f"sqrt scaling learning rate; old: {old_lr}, new: {cfg.optim.lr}")
54
+ return cfg
55
+
56
+
57
+ def write_config(cfg, output_dir, name="config.yaml"):
58
+ logger.info(OmegaConf.to_yaml(cfg))
59
+ output_dir = os.path.abspath(output_dir)
60
+ saved_cfg_path = os.path.join(output_dir, name)
61
+ with open(saved_cfg_path, "w") as f:
62
+ OmegaConf.save(config=cfg, f=f)
63
+ return saved_cfg_path
64
+
65
+
66
+ def get_default_config() -> DictConfig:
67
+ p = pathlib.Path(__file__).parent / "ssl_default_config.yaml"
68
+ return OmegaConf.load(p)
69
+
70
+
71
+ def get_cfg_from_args(args: DinoV3SetupArgs, multidistillation=False, strict=True):
72
+ overrides = [*args.opts]
73
+ if args.output_dir is not None:
74
+ overrides.append(f"train.output_dir={os.path.realpath(args.output_dir)}")
75
+
76
+ # Config file
77
+ cfg = OmegaConf.load(args.config_file)
78
+
79
+ # Command line overrides
80
+ opts_cfg = OmegaConf.from_cli(overrides)
81
+
82
+ if multidistillation:
83
+ cfg = OmegaConf.merge(cfg, opts_cfg)
84
+ else:
85
+ # Default config
86
+ default_cfg = get_default_config()
87
+ if strict:
88
+ OmegaConf.set_struct(default_cfg, True)
89
+ cfg = OmegaConf.merge(default_cfg, cfg, opts_cfg)
90
+ return cfg
91
+
92
+
93
+ def setup_config(args: DinoV3SetupArgs, strict_cfg=True):
94
+ """
95
+ Create configs and perform basic setups.
96
+ """
97
+ # Create the cfg with OmegaConf
98
+ cfg = get_cfg_from_args(args, strict=strict_cfg)
99
+ # setup distributed, logging, and random seeds
100
+ logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
101
+ # dump config before modifying so it can be reloaded
102
+ if args.output_dir is not None:
103
+ write_config(cfg, args.output_dir)
104
+ # modify the config inplace by applying scaling rules
105
+ apply_scaling_rules_to_cfg(cfg)
106
+ return cfg
107
+
108
+
109
+ def _enumerate_all_subgroup_ranks(all_subgroup_rank_spans: Sequence[Tuple[int, int]]):
110
+ """Expands a specification of process subgroups from spans to enumerated ranks.
111
+
112
+ Args:
113
+ all_group_rank_spans: a sequence of rank spans (first rank, last rank),
114
+ one for each process group. Example: ((0, 1), (2, 3), (4, 7)).
115
+ """
116
+ for first, last in all_subgroup_rank_spans:
117
+ assert first <= last
118
+ return tuple(tuple(range(first, last + 1)) for first, last in all_subgroup_rank_spans)
119
+
120
+
121
+ def setup_multidistillation(args: DinoV3SetupArgs):
122
+ base_output_dir = args.output_dir
123
+ os.makedirs(args.output_dir, exist_ok=True)
124
+ # get config file for this rank
125
+ base_cfg = OmegaConf.load(args.config_file)
126
+ assert base_cfg.multidistillation.enabled
127
+
128
+ global_batch_size = base_cfg.multidistillation.global_batch_size
129
+
130
+ distributed.enable(overwrite=True)
131
+ seed = getattr(args, "seed", 0)
132
+ rank = distributed.get_rank()
133
+
134
+ # build process subgroups
135
+ all_subgroup_rank_spans = tuple(
136
+ (student.ranks_range[0], student.ranks_range[1] - 1) for student in base_cfg.multidistillation.students
137
+ )
138
+ all_subgroup_ranks = _enumerate_all_subgroup_ranks(all_subgroup_rank_spans)
139
+ distributed.new_subgroups(all_subgroup_ranks)
140
+
141
+ found = False
142
+ for student in base_cfg.multidistillation.students:
143
+ if rank in range(*student.ranks_range):
144
+ found = True
145
+ break
146
+ assert found, "rank of worker not in defined range"
147
+
148
+ name = student.name
149
+ config_path = student.config_path
150
+ n_gpus = student.ranks_range[1] - student.ranks_range[0]
151
+ assert global_batch_size % n_gpus == 0
152
+ total_n_gpus = distributed.get_world_size()
153
+
154
+ args.output_dir = os.path.join(base_output_dir, name)
155
+ args.opts += [f"train.output_dir={args.output_dir}"]
156
+ args.opts += [f"train.batch_size_per_gpu={global_batch_size // total_n_gpus}"]
157
+ args.config_file = os.path.abspath(config_path)
158
+ default_cfg = get_default_config()
159
+ cfg = OmegaConf.load(args.config_file)
160
+ cfg = OmegaConf.merge(default_cfg, cfg, base_cfg, OmegaConf.from_cli(args.opts))
161
+
162
+ global logger
163
+ setup_logging(output=args.output_dir, level=logging.INFO)
164
+
165
+ fix_random_seeds(seed + rank)
166
+
167
+ write_config(cfg, args.output_dir)
168
+ apply_scaling_rules_to_cfg(cfg)
169
+
170
+ return cfg
171
+
172
+
173
+ def setup_job(
174
+ output_dir: Optional[str] = None,
175
+ distributed_enabled: bool = True,
176
+ logging_enabled: bool = True,
177
+ seed: Optional[int] = 0,
178
+ restrict_print_to_main_process: bool = True,
179
+ distributed_timeout: timedelta | None = None,
180
+ ):
181
+ """
182
+ Setup methods that should be done in every fairvit job
183
+ Initializes logging, distributed, random seeds and other utilities.
184
+ """
185
+ if output_dir is not None:
186
+ output_dir = os.path.realpath(output_dir)
187
+ os.makedirs(output_dir, exist_ok=True)
188
+
189
+ if logging_enabled:
190
+ setup_logging(
191
+ output=output_dir,
192
+ level=logging.INFO,
193
+ log_to_stdout_only_in_main_process=restrict_print_to_main_process,
194
+ )
195
+
196
+ if distributed_enabled:
197
+ distributed.enable(
198
+ overwrite=True,
199
+ nccl_async_error_handling=True,
200
+ restrict_print_to_main_process=restrict_print_to_main_process,
201
+ timeout=distributed_timeout,
202
+ )
203
+
204
+ if seed is not None:
205
+ rank = distributed.get_rank()
206
+ fix_random_seeds(seed + rank)
207
+
208
+ logger = logging.getLogger("dinov3")
209
+ logger.info("git:\n {}\n".format(get_sha()))
210
+
211
+ # Log some python info
212
+ conda_env_name, conda_env_path = get_conda_env()
213
+ logger.info(f"conda env name: {conda_env_name}")
214
+ logger.info(f"conda env path: {conda_env_path}")
215
+ logger.info(f"python path: {sys.path}")
216
+
217
+
218
+ def exit_job(distributed_enabled: bool = True, logging_enabled: bool = True):
219
+ if distributed_enabled:
220
+ distributed.disable()
221
+ if logging_enabled:
222
+ cleanup_logging()
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/ssl_default_config.yaml ADDED
@@ -0,0 +1,205 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: SSLMetaArch
3
+ DEVICE: cuda
4
+ WEIGHTS: ''
5
+ DTYPE: float32
6
+ compute_precision:
7
+ param_dtype: bf16
8
+ reduce_dtype: fp32
9
+ sharding_strategy: SHARD_GRAD_OP
10
+ dino:
11
+ loss_weight: 1.0
12
+ global_ignore_diagonal: true # Whether to ignore A-A and B-B global pairs, default as in DINOv2, ignored by SSLMetaArch
13
+ head_n_prototypes: 65536
14
+ head_bottleneck_dim: 256
15
+ head_norm_last_layer: false
16
+ head_nlayers: 3
17
+ head_hidden_dim: 2048
18
+ koleo_loss_weight: 0.1
19
+ koleo_loss_distributed: false
20
+ koleo_topk: 1
21
+ koleo_distributed_replicas: 0
22
+ koleo_distributed_loss_group_size: null # Size of the nearest neighbor set for distributed Koleo. If None, uses global batch size.
23
+ koleo_distributed_loss_group_data: true # group data from adjacent ranks to make sure koleo is applied on the same data distribution
24
+ force_weight_norm: false
25
+ reweight_dino_local_loss: false # If true, reweighting of DINO loss
26
+ local_loss_weight_schedule: # Schedule for local loss weight, enabled if reweight_dino_local_loss is true
27
+ start: 0.5
28
+ peak: 0.5
29
+ end: 0.5
30
+ warmup_epochs: 0
31
+ ibot:
32
+ loss_weight: 1.0
33
+ mask_sample_probability: 0.5
34
+ mask_ratio_min_max:
35
+ - 0.1
36
+ - 0.5
37
+ mask_random_circular_shift: false
38
+ force_masking_even_with_zero_weight: False
39
+ separate_head: true
40
+ head_n_prototypes: 65536
41
+ head_bottleneck_dim: 256
42
+ head_norm_last_layer: false
43
+ head_nlayers: 3
44
+ head_hidden_dim: 2048
45
+ gram:
46
+ use_loss: false # (bool) if true gram is used, else not
47
+ compute_stats: false # (bool): if true compute auxilliary stats
48
+ loss_weight: 1.0 # (float): weight of the loss
49
+ ema_teacher: false # (bool): using the EMA teacher as GRAM teacher
50
+ ckpt: null #(str): Checkpoint to the teacher
51
+ it_load_ema_teacher: -1 # (int): iteration at which the ema teacher is loaded into the gram teacher
52
+ rep_update: true # (bool): if true GRAM teacher updated every gram.update_frequency after iter gram.it_first_update steps
53
+ update_frequency: 50000 # (int): update frequency
54
+ it_first_update: 0 # (int): iteration of the first update
55
+ max_updates: null # (int): maximum number of updates to gram teacher. If None, it is unlimited
56
+ normalized: true # (bool): normalization of the features
57
+ img_level: false # (bool): if true GRAM computation at the image else, otherwise at the local batch level
58
+ remove_neg: false # (bool): if true remove the negative similarities before applying the loss
59
+ remove_only_teacher_neg: false # (bool): remove negative similarities of the teacher
60
+ tokens_used: all # (str): In [all, masked, unmasked]
61
+ global_teacher_resize_method: bicubic # Method for resizing the outputs of the gram teacher
62
+ global_teacher_resize_antialias: false # Whether to use antialiasing when resizing the outputs of the gram teacher
63
+ loss_weight_schedule: null # (dict): If not None, use a schedule for the loss weight instead of `loss_weight`
64
+ train:
65
+ batch_size_per_gpu: 64
66
+ dataset_path: ImageNet:split=TRAIN
67
+ data_config: null
68
+ output_dir: .
69
+ saveckp_freq: 20
70
+ seed: 0
71
+ num_workers: 10
72
+ OFFICIAL_EPOCH_LENGTH: 1250
73
+ monitor_gradient_norm: false
74
+ chunk_schedule: []
75
+ use_teacher_head: true
76
+ learn_from_teacher_tokens: false
77
+ centering: "sinkhorn_knopp" # or "sinkhorn_knopp"
78
+ checkpointing: false
79
+ checkpointing_full: false # aggressive checkpointing
80
+ compile: true
81
+ cudagraphs: false
82
+ sharded_eval_checkpoint: false
83
+ cache_dataset: false
84
+ student:
85
+ arch: vit_large
86
+ patch_size: 16
87
+ drop_path_rate: 0.3
88
+ layerscale: 1.0e-05
89
+ pretrained_weights: ''
90
+ ffn_layer: "mlp"
91
+ ffn_ratio: 4.0
92
+ resume_from_teacher_chkpt: ""
93
+ qkv_bias: true
94
+ proj_bias: true
95
+ ffn_bias: true
96
+ norm_layer: "layernorm"
97
+ n_storage_tokens: 0
98
+ mask_k_bias: false
99
+ untie_cls_and_patch_norms: false # If true, use separate norms for CLS/reg and patch/mask tokens
100
+ untie_global_and_local_cls_norm: false # If true, use separate norms for local and global crop CLS token during training
101
+ in_chans: 3
102
+ pos_embed_type: rope
103
+ pos_embed_rope_base: 100.0
104
+ pos_embed_rope_min_period: null
105
+ pos_embed_rope_max_period: null
106
+ pos_embed_rope_normalize_coords: separate # min, max, separate
107
+ pos_embed_rope_shift_coords: null
108
+ pos_embed_rope_jitter_coords: null
109
+ pos_embed_rope_rescale_coords: null
110
+ pos_embed_rope_dtype: bf16
111
+ fp8_enabled: False # Convert Linear layers to operate in fp8 precision
112
+ fp8_filter: "blocks" # Regex that must appear in module path; empty means everything
113
+ teacher:
114
+ momentum_teacher: 0.992
115
+ final_momentum_teacher: 1
116
+ warmup_teacher_temp: 0.04
117
+ teacher_temp: 0.07
118
+ warmup_teacher_temp_epochs: 30
119
+ in_chans: 3
120
+ distillation: # teacher
121
+ enabled: false
122
+ full_cfg_path: ""
123
+ checkpoint_path: ""
124
+ multidistillation:
125
+ enabled: false
126
+ hrft: # non-hrft'd student
127
+ enabled: false
128
+ checkpoint_path: "" # teacher_checkpoint path
129
+ optim:
130
+ epochs: 100
131
+ optimizer: adamw
132
+ weight_decay: 0.04
133
+ weight_decay_end: 0.4
134
+ lr: 0.001
135
+ warmup_epochs: 10
136
+ min_lr: 1.0e-06
137
+ schedule_trunc_extra: 0.0 # Compute the schedule for (1 + schedule_trunc_extra) steps and truncate, .25 is a good choice
138
+ clip_grad: 3.0
139
+ freeze_last_layer_epochs: 1
140
+ scaling_rule: sqrt_wrt_1024
141
+ patch_embed_lr_mult: 0.2
142
+ dino_head_wd_multiplier: 1.0
143
+ layerwise_decay: 0.9
144
+ multi_tensor_optim: true
145
+ dump_fsdp_weights_path: ""
146
+ adamw_beta1: 0.9
147
+ adamw_beta2: 0.999
148
+ crops:
149
+ global_crops_scale:
150
+ - 0.32
151
+ - 1.0
152
+ local_crops_number: 8
153
+ local_crops_scale:
154
+ - 0.05
155
+ - 0.32
156
+ global_crops_size: 224
157
+ local_crops_size: 96
158
+ global_local_crop_pairs_ratios: 1.0
159
+ gram_teacher_crops_size: null # If not None, return crops for gram teacher
160
+ localcrops_subset_of_globalcrops: false
161
+ share_color_jitter: false
162
+ horizontal_flips: true
163
+ gram_teacher_no_distortions: false # If True, no distortions are applied to gram teacher crops
164
+ rgb_mean:
165
+ - 0.485
166
+ - 0.456
167
+ - 0.406
168
+ rgb_std:
169
+ - 0.229
170
+ - 0.224
171
+ - 0.225
172
+ evaluation:
173
+ eval_period_iterations: 12500
174
+ low_freq_every: 5
175
+ config_files: # Must be in fairvit/eval/configs
176
+ high_freq: benchmark_high_frequency.yaml # More often
177
+ low_freq: benchmark_low_frequency.yaml # Less often
178
+ checkpointing:
179
+ period: 3750
180
+ max_to_keep: 3
181
+ keep_every: 99999999999999999 # Save a checkpoint every N iterations, regardless of max_to_keep and period
182
+
183
+ # Example of constant schedules with schedules v2
184
+ # # schedules:
185
+ # # lr:
186
+ # # start: 0.0
187
+ # # peak: 1e-3
188
+ # # end: 1e-6
189
+ # # warmup_epochs: 10
190
+ # # freeze_last_layer_epochs: 1
191
+ # # weight_decay:
192
+ # # start: 0.04
193
+ # # peak: 0.04
194
+ # # end: 0.04
195
+ # # warmup_epochs: 0
196
+ # # momentum:
197
+ # # start: 0.992
198
+ # # peak: 0.992
199
+ # # end: 0.992
200
+ # # warmup_epochs: 0
201
+ # # teacher_temp:
202
+ # # start: 0.04
203
+ # # peak: 0.07
204
+ # # end: 0.07
205
+ # # warmup_epochs: 30
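How such a default config is consumed: the training entry point merges it with a per-run training config and command-line overrides via OmegaConf, e.g. (a minimal sketch mirroring `get_cfg_from_args` in `dinov3/configs/config.py`; the override value is only an example):

```python
# Minimal sketch of the config assembly used by the training code
# (mirrors get_cfg_from_args in dinov3/configs/config.py).
from omegaconf import OmegaConf

default_cfg = OmegaConf.load("dinov3/configs/ssl_default_config.yaml")
train_cfg = OmegaConf.load("dinov3/configs/train/dinov3_vit7b16_pretrain.yaml")
overrides = OmegaConf.from_cli(["train.batch_size_per_gpu=8"])  # example override
cfg = OmegaConf.merge(default_cfg, train_cfg, overrides)
print(cfg.train.batch_size_per_gpu, cfg.optim.scaling_rule)
```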
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_gram_anchor.yaml ADDED
@@ -0,0 +1,203 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: SSLMetaArch
3
+ DEVICE: cuda
4
+ WEIGHTS: ''
5
+ DTYPE: float32
6
+ compute_precision:
7
+ param_dtype: bf16
8
+ reduce_dtype: fp32
9
+ sharding_strategy: SHARD_GRAD_OP
10
+ dino:
11
+ loss_weight: 1.0
12
+ global_ignore_diagonal: true
13
+ head_n_prototypes: 262144
14
+ head_bottleneck_dim: 512
15
+ head_norm_last_layer: false
16
+ head_nlayers: 3
17
+ head_hidden_dim: 8192
18
+ koleo_loss_weight: 0.1
19
+ koleo_loss_distributed: false
20
+ koleo_topk: 1
21
+ koleo_distributed_replicas: 0
22
+ koleo_distributed_loss_group_size: null
23
+ koleo_distributed_loss_group_data: true
24
+ force_weight_norm: false
25
+ reweight_dino_local_loss: true
26
+ local_loss_weight_schedule:
27
+ start: 1
28
+ peak: 1
29
+ end: 0.5
30
+ warmup_epochs: 1000
31
+ cosine_epochs: 1
32
+ ibot:
33
+ loss_weight: 1.0
34
+ mask_sample_probability: 0.5
35
+ mask_ratio_min_max:
36
+ - 0.1
37
+ - 0.5
38
+ mask_random_circular_shift: false
39
+ force_masking_even_with_zero_weight: false
40
+ separate_head: true
41
+ head_n_prototypes: 98304
42
+ head_bottleneck_dim: 384
43
+ head_norm_last_layer: false
44
+ head_nlayers: 3
45
+ head_hidden_dim: 4096
46
+ gram:
47
+ use_loss: true
48
+ compute_stats: false
49
+ loss_weight: 1.0
50
+ ema_teacher: false
51
+ ckpt: ignore
52
+ it_load_ema_teacher: -1
53
+ rep_update: true
54
+ update_frequency: 10000
55
+ it_first_update: 1010000
56
+ max_updates: 3
57
+ normalized: true
58
+ img_level: true
59
+ remove_neg: false
60
+ remove_only_teacher_neg: false
61
+ tokens_used: all
62
+ global_teacher_resize_method: bicubic
63
+ global_teacher_resize_antialias: false
64
+ loss_weight_schedule:
65
+ start: 0
66
+ peak: 0
67
+ end: 2.0
68
+ warmup_epochs: 1000
69
+ cosine_epochs: 1
70
+ train:
71
+ batch_size_per_gpu: 16
72
+ dataset_path: null
73
+ saveckp_freq: 20
74
+ seed: 0
75
+ num_workers: 10
76
+ OFFICIAL_EPOCH_LENGTH: 1000
77
+ monitor_gradient_norm: false
78
+ chunk_schedule: []
79
+ cache_dataset: true
80
+ use_teacher_head: true
81
+ learn_from_teacher_tokens: false
82
+ centering: sinkhorn_knopp
83
+ checkpointing: true
84
+ checkpointing_full: true
85
+ compile: true
86
+ cudagraphs: false
87
+ cell_augmentation: false
88
+ cell_augmentation_type: hpa
89
+ sharded_eval_checkpoint: true
90
+ student:
91
+ arch: vit_7b
92
+ patch_size: 16
93
+ drop_path_rate: 0.4
94
+ layerscale: 1.0e-05
95
+ patch_drop: 0.0
96
+ pretrained_weights: ''
97
+ ffn_layer: swiglu64
98
+ ffn_ratio: 3
99
+ resume_from_teacher_chkpt: ''
100
+ qkv_bias: false
101
+ proj_bias: true
102
+ ffn_bias: true
103
+ norm_layer: layernormbf16
104
+ n_storage_tokens: 4
105
+ untie_cls_and_patch_norms: false
106
+ untie_global_and_local_cls_norm: true
107
+ mask_k_bias: true
108
+ in_chans: 3
109
+ pos_embed_type: rope
110
+ pos_embed_rope_base: 100
111
+ pos_embed_rope_min_period: null
112
+ pos_embed_rope_max_period: null
113
+ pos_embed_rope_normalize_coords: separate
114
+ pos_embed_rope_shift_coords: null
115
+ pos_embed_rope_jitter_coords: null
116
+ pos_embed_rope_rescale_coords: 2
117
+ pos_embed_rope_dtype: fp32
118
+ fp8_enabled: true
119
+ fp8_filter: blocks
120
+ teacher:
121
+ momentum_teacher: null
122
+ final_momentum_teacher: null
123
+ warmup_teacher_temp: null
124
+ teacher_temp: null
125
+ warmup_teacher_temp_epochs: null
126
+ in_chans: 3
127
+ distillation:
128
+ enabled: false
129
+ full_cfg_path: ''
130
+ checkpoint_path: ''
131
+ multidistillation:
132
+ enabled: false
133
+ hrft:
134
+ enabled: false
135
+ checkpoint_path: ''
136
+ optim:
137
+ epochs: 1200
138
+ optimizer: adamw
139
+ weight_decay: null
140
+ weight_decay_end: null
141
+ lr: null
142
+ warmup_epochs: null
143
+ min_lr: null
144
+ schedule_trunc_extra: null
145
+ clip_grad: 30.0
146
+ freeze_last_layer_epochs: null
147
+ scaling_rule: sqrt_wrt_1024
148
+ patch_embed_lr_mult: 0.2
149
+ dino_head_wd_multiplier: 1.0
150
+ layerwise_decay: 0.98
151
+ multi_tensor_optim: true
152
+ dump_fsdp_weights_path: ''
153
+ adamw_beta1: 0.9
154
+ adamw_beta2: 0.99
155
+ crops:
156
+ global_crops_scale:
157
+ - 0.32
158
+ - 1.0
159
+ local_crops_number: 8
160
+ local_crops_scale:
161
+ - 0.05
162
+ - 0.32
163
+ global_crops_size: 256
164
+ local_crops_size: 112
165
+ gram_teacher_crops_size: 512
166
+ localcrops_subset_of_globalcrops: false
167
+ share_color_jitter: false
168
+ horizontal_flips: false
169
+ gram_teacher_no_distortions: true
170
+ rgb_mean:
171
+ - 0.485
172
+ - 0.456
173
+ - 0.406
174
+ rgb_std:
175
+ - 0.229
176
+ - 0.224
177
+ - 0.225
178
+ checkpointing:
179
+ period: 1000
180
+ max_to_keep: 3
181
+ keep_every: 50000
182
+ schedules:
183
+ lr:
184
+ start: 0
185
+ peak: 3.0e-05
186
+ end: 3.0e-05
187
+ warmup_epochs: 100
188
+ freeze_last_layer_epochs: 5
189
+ weight_decay:
190
+ start: 0.04
191
+ peak: 0.04
192
+ end: 0.04
193
+ warmup_epochs: 0
194
+ teacher_temp:
195
+ start: 0.04
196
+ peak: 0.07
197
+ end: 0.07
198
+ warmup_epochs: 100
199
+ momentum:
200
+ start: 0.999
201
+ peak: 0.999
202
+ end: 0.999
203
+ warmup_epochs: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_high_res_adapt.yaml ADDED
@@ -0,0 +1,224 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: SSLMetaArch
3
+ DEVICE: cuda
4
+ WEIGHTS: ''
5
+ DTYPE: float32
6
+ compute_precision:
7
+ param_dtype: bf16
8
+ reduce_dtype: fp32
9
+ sharding_strategy: SHARD_GRAD_OP
10
+ dino:
11
+ loss_weight: 1.0
12
+ global_ignore_diagonal: true
13
+ head_n_prototypes: 262144
14
+ head_bottleneck_dim: 512
15
+ head_norm_last_layer: false
16
+ head_nlayers: 3
17
+ head_hidden_dim: 8192
18
+ koleo_loss_weight: 0.1
19
+ koleo_loss_distributed: true
20
+ koleo_topk: 1
21
+ koleo_distributed_replicas: 0
22
+ koleo_distributed_loss_group_size: 16
23
+ force_weight_norm: false
24
+ reweight_dino_local_loss: true
25
+ local_loss_weight_schedule:
26
+ start: 0.5
27
+ peak: 0.5
28
+ end: 0.5
29
+ warmup_epochs: 0
30
+ cosine_epochs: 0
31
+ koleo_distributed_loss_group_data: true
32
+ ibot:
33
+ loss_weight: 1.0
34
+ mask_sample_probability: 0.5
35
+ mask_ratio_min_max:
36
+ - 0.1
37
+ - 0.5
38
+ mask_random_circular_shift: false
39
+ force_masking_even_with_zero_weight: false
40
+ separate_head: true
41
+ head_n_prototypes: 98304
42
+ head_bottleneck_dim: 384
43
+ head_norm_last_layer: false
44
+ head_nlayers: 3
45
+ head_hidden_dim: 4096
46
+ gram:
47
+ use_loss: true
48
+ compute_stats: false
49
+ loss_weight: 1.0
50
+ ema_teacher: false
51
+ it_load_ema_teacher: -1
52
+ rep_update: false
53
+ update_frequency: 10000
54
+ it_first_update: 1010000
55
+ max_updates: 3
56
+ normalized: true
57
+ img_level: true
58
+ remove_neg: false
59
+ remove_only_teacher_neg: false
60
+ tokens_used: all
61
+ global_teacher_resize_method: bicubic
62
+ global_teacher_resize_antialias: false
63
+ loss_weight_schedule:
64
+ start: 1.5
65
+ peak: 1.5
66
+ end: 1.5
67
+ warmup_epochs: 0
68
+ cosine_epochs: 0
69
+ train:
70
+ batch_size_per_gpu: 8
71
+ dataset_path: null
72
+ saveckp_freq: 20
73
+ seed: 0
74
+ num_workers: 2
75
+ OFFICIAL_EPOCH_LENGTH: 1000
76
+ monitor_gradient_norm: false
77
+ chunk_schedule: []
78
+ cache_dataset: true
79
+ use_teacher_head: true
80
+ learn_from_teacher_tokens: false
81
+ centering: sinkhorn_knopp
82
+ checkpointing: true
83
+ checkpointing_full: true
84
+ compile: true
85
+ cudagraphs: false
86
+ cell_augmentation: false
87
+ cell_augmentation_type: hpa
88
+ sharded_eval_checkpoint: true
89
+ student:
90
+ arch: vit_7b
91
+ patch_size: 16
92
+ drop_path_rate: 0.4
93
+ layerscale: 1.0e-05
94
+ patch_drop: 0.0
95
+ pretrained_weights: ''
96
+ ffn_layer: swiglu64
97
+ ffn_ratio: 3
98
+ resume_from_teacher_chkpt: ''
99
+ qkv_bias: false
100
+ proj_bias: true
101
+ ffn_bias: true
102
+ norm_layer: layernormbf16
103
+ n_storage_tokens: 4
104
+ untie_cls_and_patch_norms: false
105
+ untie_global_and_local_cls_norm: true
106
+ mask_k_bias: true
107
+ in_chans: 3
108
+ pos_embed_type: rope
109
+ pos_embed_rope_base: 100
110
+ pos_embed_rope_min_period: null
111
+ pos_embed_rope_max_period: null
112
+ pos_embed_rope_normalize_coords: separate
113
+ pos_embed_rope_shift_coords: null
114
+ pos_embed_rope_jitter_coords: null
115
+ pos_embed_rope_rescale_coords: 2
116
+ pos_embed_rope_dtype: fp32
117
+ fp8_enabled: true
118
+ fp8_filter: blocks
119
+ teacher:
120
+ momentum_teacher: null
121
+ final_momentum_teacher: null
122
+ warmup_teacher_temp: null
123
+ teacher_temp: null
124
+ warmup_teacher_temp_epochs: null
125
+ in_chans: 3
126
+ distillation:
127
+ enabled: false
128
+ full_cfg_path: ''
129
+ checkpoint_path: ''
130
+ multidistillation:
131
+ enabled: false
132
+ hrft:
133
+ enabled: false
134
+ checkpoint_path: ''
135
+ optim:
136
+ epochs: 30
137
+ optimizer: adamw
138
+ weight_decay: null
139
+ weight_decay_end: null
140
+ lr: null
141
+ warmup_epochs: null
142
+ min_lr: null
143
+ schedule_trunc_extra: null
144
+ clip_grad: 30.0
145
+ freeze_last_layer_epochs: null
146
+ scaling_rule: sqrt_wrt_1024
147
+ patch_embed_lr_mult: 0.2
148
+ dino_head_wd_multiplier: 1.0
149
+ layerwise_decay: 0.98
150
+ multi_tensor_optim: true
151
+ dump_fsdp_weights_path: ''
152
+ adamw_beta1: 0.9
153
+ adamw_beta2: 0.99
154
+ crops:
155
+ global_crops_scale:
156
+ - 0.32
157
+ - 1.0
158
+ local_crops_number: 8
159
+ local_crops_scale:
160
+ - 0.05
161
+ - 0.32
162
+ global_crops_size:
163
+ - 512
164
+ - 768
165
+ - 768
166
+ - 768
167
+ - 768
168
+ local_crops_size:
169
+ - 112
170
+ - 112
171
+ - 168
172
+ - 224
173
+ - 336
174
+ global_local_crop_pairs_ratios:
175
+ - 0.3
176
+ - 0.3
177
+ - 0.3
178
+ - 0.05
179
+ - 0.05
180
+ gram_teacher_crops_size:
181
+ - 768
182
+ - 1152
183
+ - 1152
184
+ - 1152
185
+ - 1152
186
+ localcrops_subset_of_globalcrops: false
187
+ share_color_jitter: false
188
+ horizontal_flips: false
189
+ gram_teacher_no_distortions: true
190
+ rgb_mean:
191
+ - 0.485
192
+ - 0.456
193
+ - 0.406
194
+ rgb_std:
195
+ - 0.229
196
+ - 0.224
197
+ - 0.225
198
+ checkpointing:
199
+ period: 250
200
+ max_to_keep: 3
201
+ keep_every: 50000
202
+ schedules:
203
+ lr:
204
+ start: 0
205
+ peak: 0
206
+ end: 1.25e-05
207
+ warmup_epochs: 0
208
+ freeze_last_layer_epochs: 0
209
+ cosine_epochs: 10
210
+ weight_decay:
211
+ start: 0.04
212
+ peak: 0.04
213
+ end: 0.04
214
+ warmup_epochs: 0
215
+ teacher_temp:
216
+ start: 0.07
217
+ peak: 0.07
218
+ end: 0.07
219
+ warmup_epochs: 0
220
+ momentum:
221
+ start: 0.999
222
+ peak: 0.999
223
+ end: 0.999
224
+ warmup_epochs: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vit7b16_pretrain.yaml ADDED
@@ -0,0 +1,172 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: SSLMetaArch
3
+ DEVICE: cuda
4
+ WEIGHTS: ''
5
+ DTYPE: float32
6
+ compute_precision:
7
+ param_dtype: bf16
8
+ reduce_dtype: fp32
9
+ sharding_strategy: SHARD_GRAD_OP
10
+ dino:
11
+ loss_weight: 1.0
12
+ global_ignore_diagonal: true
13
+ head_n_prototypes: 262144
14
+ head_bottleneck_dim: 512
15
+ head_norm_last_layer: false
16
+ head_nlayers: 3
17
+ head_hidden_dim: 8192
18
+ koleo_loss_weight: 0.1
19
+ koleo_loss_distributed: false
20
+ koleo_topk: 1
21
+ koleo_distributed_replicas: 0
22
+ koleo_distributed_loss_group_size: null
23
+ force_weight_norm: false
24
+ ibot:
25
+ loss_weight: 1.0
26
+ mask_sample_probability: 0.5
27
+ mask_ratio_min_max:
28
+ - 0.1
29
+ - 0.5
30
+ mask_random_circular_shift: false
31
+ force_masking_even_with_zero_weight: false
32
+ separate_head: true
33
+ head_n_prototypes: 98304
34
+ head_bottleneck_dim: 384
35
+ head_norm_last_layer: false
36
+ head_nlayers: 3
37
+ head_hidden_dim: 4096
38
+ gram:
39
+ use_loss: false
40
+ compute_stats: false
41
+ train:
42
+ batch_size_per_gpu: 16
43
+ dataset_path: null
44
+ saveckp_freq: 20
45
+ seed: 0
46
+ num_workers: 10
47
+ OFFICIAL_EPOCH_LENGTH: 1000
48
+ monitor_gradient_norm: false
49
+ chunk_schedule: []
50
+ cache_dataset: true
51
+ use_teacher_head: true
52
+ learn_from_teacher_tokens: false
53
+ centering: sinkhorn_knopp
54
+ checkpointing: true
55
+ checkpointing_full: false
56
+ compile: true
57
+ cudagraphs: false
58
+ cell_augmentation: false
59
+ cell_augmentation_type: hpa
60
+ sharded_eval_checkpoint: true
61
+ student:
62
+ arch: vit_7b
63
+ patch_size: 16
64
+ drop_path_rate: 0.4
65
+ layerscale: 1.0e-05
66
+ patch_drop: 0.0
67
+ pretrained_weights: ''
68
+ ffn_layer: swiglu64
69
+ ffn_ratio: 3
70
+ resume_from_teacher_chkpt: ''
71
+ qkv_bias: false
72
+ proj_bias: true
73
+ ffn_bias: true
74
+ norm_layer: layernormbf16
75
+ n_storage_tokens: 4
76
+ untie_cls_and_patch_norms: false
77
+ untie_global_and_local_cls_norm: true
78
+ mask_k_bias: true
79
+ in_chans: 3
80
+ pos_embed_type: rope
81
+ pos_embed_rope_base: 100
82
+ pos_embed_rope_min_period: null
83
+ pos_embed_rope_max_period: null
84
+ pos_embed_rope_normalize_coords: separate
85
+ pos_embed_rope_shift_coords: null
86
+ pos_embed_rope_jitter_coords: null
87
+ pos_embed_rope_rescale_coords: 2
88
+ pos_embed_rope_dtype: fp32
89
+ fp8_enabled: true
90
+ fp8_filter: blocks
91
+ teacher:
92
+ momentum_teacher: null
93
+ final_momentum_teacher: null
94
+ warmup_teacher_temp: null
95
+ teacher_temp: null
96
+ warmup_teacher_temp_epochs: null
97
+ in_chans: 3
98
+ distillation:
99
+ enabled: false
100
+ full_cfg_path: ''
101
+ checkpoint_path: ''
102
+ multidistillation:
103
+ enabled: false
104
+ hrft:
105
+ enabled: false
106
+ checkpoint_path: ''
107
+ optim:
108
+ epochs: 1000
109
+ optimizer: adamw
110
+ weight_decay: null
111
+ weight_decay_end: null
112
+ lr: null
113
+ warmup_epochs: null
114
+ min_lr: null
115
+ schedule_trunc_extra: null
116
+ clip_grad: 30.0
117
+ freeze_last_layer_epochs: null
118
+ scaling_rule: sqrt_wrt_1024
119
+ patch_embed_lr_mult: 0.2
120
+ dino_head_wd_multiplier: 1.0
121
+ layerwise_decay: 0.98
122
+ multi_tensor_optim: true
123
+ dump_fsdp_weights_path: ''
124
+ adamw_beta1: 0.9
125
+ adamw_beta2: 0.99
126
+ crops:
127
+ global_crops_scale:
128
+ - 0.32
129
+ - 1.0
130
+ local_crops_number: 8
131
+ local_crops_scale:
132
+ - 0.05
133
+ - 0.32
134
+ global_crops_size: 256
135
+ local_crops_size: 112
136
+ localcrops_subset_of_globalcrops: false
137
+ share_color_jitter: false
138
+ horizontal_flips: false
139
+ rgb_mean:
140
+ - 0.485
141
+ - 0.456
142
+ - 0.406
143
+ rgb_std:
144
+ - 0.229
145
+ - 0.224
146
+ - 0.225
147
+ checkpointing:
148
+ period: 1000
149
+ max_to_keep: 3
150
+ keep_every: 50000
151
+ schedules:
152
+ lr:
153
+ start: 0
154
+ peak: 5.0e-05
155
+ end: 5.0e-05
156
+ warmup_epochs: 100
157
+ freeze_last_layer_epochs: 5
158
+ weight_decay:
159
+ start: 0.04
160
+ peak: 0.04
161
+ end: 0.04
162
+ warmup_epochs: 0
163
+ teacher_temp:
164
+ start: 0.04
165
+ peak: 0.07
166
+ end: 0.07
167
+ warmup_epochs: 100
168
+ momentum:
169
+ start: 0.994
170
+ peak: 0.994
171
+ end: 0.994
172
+ warmup_epochs: 0
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/dinov3_vitl16_lvd1689m_distilled.yaml ADDED
@@ -0,0 +1,251 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: MultiDistillationMetaArch
3
+ DEVICE: cuda
4
+ WEIGHTS: ''
5
+ DTYPE: float32
6
+ compute_precision:
7
+ param_dtype: bf16
8
+ reduce_dtype: fp32
9
+ sharding_strategy: SHARD_GRAD_OP
10
+ dino:
11
+ loss_weight: 1.0
12
+ global_ignore_diagonal: true
13
+ head_n_prototypes: 262144
14
+ head_bottleneck_dim: 512
15
+ head_norm_last_layer: false
16
+ head_nlayers: 3
17
+ head_hidden_dim: 8192
18
+ koleo_loss_weight: 0.1
19
+ koleo_loss_distributed: false
20
+ koleo_topk: 1
21
+ koleo_distributed_replicas: 0
22
+ koleo_distributed_loss_group_size: null
23
+ koleo_distributed_loss_group_data: true
24
+ force_weight_norm: false
25
+ reweight_dino_local_loss: false
26
+ local_loss_weight_schedule:
27
+ start: 0.5
28
+ peak: 0.5
29
+ end: 0.5
30
+ warmup_epochs: 0
31
+ ibot:
32
+ loss_weight: 1.0
33
+ mask_sample_probability: 0.5
34
+ mask_ratio_min_max:
35
+ - 0.1
36
+ - 0.5
37
+ mask_random_circular_shift: false
38
+ force_masking_even_with_zero_weight: false
39
+ separate_head: true
40
+ head_n_prototypes: 98304
41
+ head_bottleneck_dim: 384
42
+ head_norm_last_layer: false
43
+ head_nlayers: 3
44
+ head_hidden_dim: 4096
45
+ coding_rate_loss:
46
+ use_cls_loss: false
47
+ cls_loss_weight: 0.2
48
+ use_masked_patches_loss: false
49
+ masked_patches_loss_weight: 0.1
50
+ epsilon: 8
51
+ gram:
52
+ use_loss: false
53
+ compute_stats: false
54
+ loss_weight: 1.0
55
+ ema_teacher: false
56
+ ckpt: null
57
+ it_load_ema_teacher: -1
58
+ rep_update: true
59
+ update_frequency: 50000
60
+ it_first_update: 0
61
+ max_updates: null
62
+ normalized: true
63
+ img_level: false
64
+ remove_neg: false
65
+ remove_only_teacher_neg: false
66
+ tokens_used: all
67
+ global_teacher_resize_method: bicubic
68
+ global_teacher_resize_antialias: false
69
+ loss_weight_schedule: null
70
+ train:
71
+ batch_size_per_gpu: 3
72
+ dataset_path: <TRAIN/DATASET>
73
+ output_dir: <OUTPUT/DIR>
74
+ saveckp_freq: 20
75
+ seed: 0
76
+ num_workers: 2
77
+ OFFICIAL_EPOCH_LENGTH: 1250
78
+ monitor_gradient_norm: false
79
+ chunk_schedule: []
80
+ cache_dataset: true
81
+ use_teacher_head: true
82
+ learn_from_teacher_tokens: false
83
+ centering: sinkhorn_knopp
84
+ checkpointing: true
85
+ checkpointing_full: true
86
+ compile: true
87
+ cudagraphs: false
88
+ cell_augmentation: false
89
+ cell_augmentation_type: hpa
90
+ sharded_eval_checkpoint: false
91
+ student:
92
+ arch: vit_large
93
+ patch_size: 16
94
+ drop_path_rate: 0.0
95
+ layerscale: 1.0e-05
96
+ drop_path_uniform: true
97
+ drop_path_shape: uniform
98
+ patch_drop: 0.0
99
+ pretrained_weights: ''
100
+ sin_cos_embeddings: false
101
+ fourier_embeddings: false
102
+ fourier_encoding_dim: 64
103
+ multiple_pos_embeddings: false
104
+ cls_pos_embedding: false
105
+ reg_pos_embedding: false
106
+ ffn_layer: mlp
107
+ ffn_ratio: 4.0
108
+ resume_from_teacher_chkpt: <PATH/TO/HRFT/TEACHER>
109
+ block_chunks: 0
110
+ qkv_bias: true
111
+ proj_bias: true
112
+ ffn_bias: true
113
+ norm_layer: layernormbf16
114
+ n_storage_tokens: 4
115
+ mask_attention: false
116
+ mask_register_attention: false
117
+ untie_cls_and_patch_norms: false
118
+ untie_global_and_local_cls_norm: false
119
+ interpolate_offset: 0.0
120
+ interpolate_antialias: true
121
+ mask_k_bias: true
122
+ init_std_cls: 0.02
123
+ init_std_reg: 0.02
124
+ rescale_weights_by_layer_id: false
125
+ in_chans: 3
126
+ pos_embed_grid_size: 48
127
+ pos_embed_type: ropenew
128
+ pos_embed_rope_gamma: 1.0
129
+ pos_embed_rope_init_multi_frequencies: false
130
+ pos_embed_rope_base: 100
131
+ pos_embed_rope_min_period: null
132
+ pos_embed_rope_max_period: null
133
+ pos_embed_rope_normalize_coords: separate
134
+ pos_embed_rope_shift_coords: null
135
+ pos_embed_rope_jitter_coords: null
136
+ pos_embed_rope_rescale_coords: 2
137
+ pos_embed_rope_dtype: bf16
138
+ sparse24_ranges: []
139
+ sparse24_filter:
140
+ - mlp
141
+ sparse24_default: false
142
+ fp8_enabled: false
143
+ fp8_filter: blocks
144
+ teacher:
145
+ momentum_teacher: 0.994
146
+ final_momentum_teacher: 1
147
+ warmup_teacher_temp: 0.04
148
+ teacher_temp: 0.07
149
+ warmup_teacher_temp_epochs: 120
150
+ in_chans: 3
151
+ distillation:
152
+ enabled: true
153
+ full_cfg_path: <PATH/TO/TEACHER/CONFIG/config.yaml>
154
+ checkpoint_path: <PATH/TO/TEACHER/checkpoint.pth>
155
+ multidistillation:
156
+ enabled: true
157
+ global_batch_size: 1920
158
+ students:
159
+ - name: vits_mlp4_4
160
+ config_path: <PATH/TO/STUDENT/CONFIG/vits_mlp4_4.yaml>
161
+ ranks_range:
162
+ - 0
163
+ - 48
164
+ - name: vitsp_swiglu6_1
165
+ config_path: <PATH/TO/STUDENT/CONFIG/vitsp_swiglu6_1.yaml>
166
+ ranks_range:
167
+ - 48
168
+ - 96
169
+ - name: vitb_mlp4_3
170
+ config_path: <PATH/TO/STUDENT/CONFIG/vitb_mlp4_3.yaml>
171
+ ranks_range:
172
+ - 96
173
+ - 176
174
+ - name: vitl_mlp4_1
175
+ config_path: <PATH/TO/STUDENT/CONFIG/vitl_mlp4_1.yaml>
176
+ ranks_range:
177
+ - 176
178
+ - 296
179
+ hrft:
180
+ enabled: false
181
+ checkpoint_path: ''
182
+ optim:
183
+ epochs: 20
184
+ optimizer: adamw
185
+ weight_decay: 0.04
186
+ weight_decay_end: 0.2
187
+ lr: 0.0002
188
+ warmup_epochs: 0
189
+ min_lr: 1.0e-06
190
+ schedule_trunc_extra: 0.0
191
+ clip_grad: 3.0
192
+ freeze_last_layer_epochs: 0
193
+ scaling_rule: sqrt_wrt_1024
194
+ patch_embed_lr_mult: 0.2
195
+ dino_head_wd_multiplier: 1.0
196
+ layerwise_decay: 0.99
197
+ multi_tensor_optim: true
198
+ dump_fsdp_weights_path: ''
199
+ adamw_beta1: 0.9
200
+ adamw_beta2: 0.999
201
+ crops:
202
+ global_crops_scale:
203
+ - 0.32
204
+ - 1.0
205
+ local_crops_number: 8
206
+ local_crops_scale:
207
+ - 0.05
208
+ - 0.32
209
+ global_crops_size: 256
210
+ local_crops_size: 112
211
+ global_local_crop_pairs_ratios: 1.0
212
+ gram_teacher_crops_size: 256
213
+ localcrops_subset_of_globalcrops: false
214
+ share_color_jitter: false
215
+ horizontal_flips: false
216
+ gram_teacher_no_distortions: false
217
+ rgb_mean:
218
+ - 0.485
219
+ - 0.456
220
+ - 0.406
221
+ rgb_std:
222
+ - 0.229
223
+ - 0.224
224
+ - 0.225
225
+ checkpointing:
226
+ period: 3750
227
+ max_to_keep: 3
228
+ keep_every: 99999999999999999
229
+ schedules:
230
+ weight_decay:
231
+ start: 0.04
232
+ peak: 0.04
233
+ end: 0.04
234
+ warmup_epochs: 0
235
+ teacher_temp:
236
+ start: 0.04
237
+ peak: 0.07
238
+ end: 0.07
239
+ warmup_epochs: 0
240
+ lr:
241
+ start: 0
242
+ peak: 0
243
+ end: 5.0e-05
244
+ warmup_epochs: 0
245
+ freeze_last_layer_epochs: 0
246
+ cosine_epochs: 10
247
+ momentum:
248
+ start: 0.994
249
+ peak: 0.994
250
+ end: 1.0
251
+ warmup_epochs: 0
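The multidistillation block above assigns each student a ranks_range; assuming these are half-open [start, end) intervals of GPU ranks (they tile 0-296 here), a small sanity check of the layout, with the student names copied from the config:

    # Hypothetical check, not part of the dinov3 tooling: verify the students'
    # ranks_range entries partition the global set of ranks without gaps or overlap.
    students = {
        "vits_mlp4_4": (0, 48),
        "vitsp_swiglu6_1": (48, 96),
        "vitb_mlp4_3": (96, 176),
        "vitl_mlp4_1": (176, 296),
    }

    ranges = sorted(students.values())
    assert all(prev[1] == nxt[0] for prev, nxt in zip(ranges, ranges[1:])), "ranges must be contiguous"
    print("total ranks covered:", ranges[-1][1] - ranges[0][0])  # 296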
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multi_distillation_test.yaml ADDED
@@ -0,0 +1,27 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: MultiDistillationMetaArch
3
+ multidistillation:
4
+ enabled: true
5
+ global_batch_size: 256
6
+ students:
7
+ - name: vits
8
+ config_path: dinov3/configs/train/multidist_tests/vits_p16.yaml
9
+ ranks_range:
10
+ - 0
11
+ - 4
12
+ - name: vitb
13
+ config_path: dinov3/configs/train/multidist_tests/vitb_p16.yaml
14
+ ranks_range:
15
+ - 4
16
+ - 8
17
+ distillation: # teacher
18
+ enabled: true
19
+ full_cfg_path: dinov3/configs/train/vitl_im1k_lin834.yaml
20
+ checkpoint_path: ignore
21
+ train:
22
+ dataset_path: ImageNet:split=TRAIN
23
+ cache_dataset: false
24
+ centering: "sinkhorn_knopp"
25
+ compile: true
26
+ ibot:
27
+ separate_head: true
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vitb_p16.yaml ADDED
@@ -0,0 +1,7 @@
1
+ # this corresponds to the default config
2
+ train:
3
+ dataset_path: ImageNet:split=TRAIN
4
+ checkpointing: true
5
+ student:
6
+ drop_path_rate: 0.1
7
+ arch: vit_base
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/multidist_tests/vits_p16.yaml ADDED
@@ -0,0 +1,6 @@
1
+ # this corresponds to the default config
2
+ train:
3
+ dataset_path: ImageNet:split=TRAIN
4
+ student:
5
+ drop_path_rate: 0.1
6
+ arch: vit_small
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/configs/train/vitl_im1k_lin834.yaml ADDED
@@ -0,0 +1,143 @@
1
+ # tested on RSC: /checkpoint/dino/qas/rope/vitl16_im1k/
2
+ # gives 82.2 im1k-knn, 83.3 im1k-linear
3
+ # runs with a total batch size of 2048 (64/gpu, 4 nodes here)
4
+ # runs at 0.57s/iter
5
+ MODEL:
6
+ META_ARCHITECTURE: SSLMetaArch
7
+ DEVICE: cuda
8
+ WEIGHTS: ''
9
+ DTYPE: float32
10
+ compute_precision:
11
+ param_dtype: bf16
12
+ reduce_dtype: fp32
13
+ sharding_strategy: SHARD_GRAD_OP
14
+ dino:
15
+ loss_weight: 1.0
16
+ global_ignore_diagonal: true
17
+ head_n_prototypes: 65536
18
+ head_bottleneck_dim: 256
19
+ head_norm_last_layer: false
20
+ head_nlayers: 3
21
+ head_hidden_dim: 2048
22
+ koleo_loss_weight: 0.1
23
+ koleo_loss_distributed: false
24
+ koleo_topk: 1
25
+ koleo_distributed_replicas: 0
26
+ force_weight_norm: false
27
+ ibot:
28
+ loss_weight: 1.0
29
+ mask_sample_probability: 0.5
30
+ mask_ratio_min_max:
31
+ - 0.1
32
+ - 0.5
33
+ mask_random_circular_shift: false
34
+ force_masking_even_with_zero_weight: false
35
+ separate_head: true
36
+ head_n_prototypes: 65536
37
+ head_bottleneck_dim: 256
38
+ head_norm_last_layer: false
39
+ head_nlayers: 3
40
+ head_hidden_dim: 2048
41
+ train:
42
+ batch_size_per_gpu: 64
43
+ dataset_path: ImageNet:split=TRAIN
44
+ output_dir: /checkpoint/dino/qas/rope/vitl16_im1k
45
+ saveckp_freq: 20
46
+ seed: 0
47
+ num_workers: 10
48
+ OFFICIAL_EPOCH_LENGTH: 1250
49
+ monitor_gradient_norm: false
50
+ chunk_schedule: []
51
+ cache_dataset: true
52
+ use_teacher_head: true
53
+ learn_from_teacher_tokens: false
54
+ centering: sinkhorn_knopp
55
+ checkpointing: false
56
+ compile: true
57
+ cudagraphs: false
58
+ cell_augmentation: false
59
+ cell_augmentation_type: hpa
60
+ student:
61
+ arch: vit_large
62
+ patch_size: 16
63
+ drop_path_rate: 0.3
64
+ layerscale: 1.0e-05
65
+ patch_drop: 0.0
66
+ pretrained_weights: ''
67
+ ffn_layer: mlp
68
+ ffn_ratio: 4.0
69
+ resume_from_teacher_chkpt: ''
70
+ qkv_bias: true
71
+ proj_bias: true
72
+ ffn_bias: true
73
+ norm_layer: layernorm
74
+ n_storage_tokens: 0
75
+ mask_k_bias: false
76
+ in_chans: 3
77
+ pos_embed_type: rope
78
+ pos_embed_rope_base: 100.0
79
+ pos_embed_rope_min_period: null
80
+ pos_embed_rope_max_period: null
81
+ pos_embed_rope_normalize_coords: separate # min, max, separate
82
+ pos_embed_rope_shift_coords: null
83
+ pos_embed_rope_jitter_coords: null
84
+ pos_embed_rope_rescale_coords: null
85
+ pos_embed_rope_dtype: bf16
86
+ fp8_enabled: False # Convert Linear layers to operate in fp8 precision
87
+ fp8_filter: "blocks" # Regex that must appear in module path; empty means everything
88
+ teacher:
89
+ momentum_teacher: 0.992
90
+ final_momentum_teacher: 1
91
+ warmup_teacher_temp: 0.04
92
+ teacher_temp: 0.07
93
+ warmup_teacher_temp_epochs: 30
94
+ in_chans: 3
95
+ distillation:
96
+ enabled: false
97
+ full_cfg_path: ''
98
+ checkpoint_path: ''
99
+ multidistillation:
100
+ enabled: false
101
+ hrft:
102
+ enabled: false
103
+ checkpoint_path: ''
104
+ optim:
105
+ epochs: 100
106
+ optimizer: adamw
107
+ weight_decay: 0.04
108
+ weight_decay_end: 0.4
109
+ lr: 0.001
110
+ warmup_epochs: 10
111
+ min_lr: 1.0e-06
112
+ clip_grad: 3.0
113
+ freeze_last_layer_epochs: 1
114
+ scaling_rule: sqrt_wrt_1024
115
+ patch_embed_lr_mult: 0.2
116
+ dino_head_wd_multiplier: 1.0
117
+ layerwise_decay: 0.9
118
+ multi_tensor_optim: true
119
+ dump_fsdp_weights_path: ''
120
+ adamw_beta1: 0.9
121
+ adamw_beta2: 0.999
122
+ crops:
123
+ global_crops_scale:
124
+ - 0.32
125
+ - 1.0
126
+ local_crops_number: 8
127
+ local_crops_scale:
128
+ - 0.05
129
+ - 0.32
130
+ global_crops_size: 224
131
+ local_crops_size: 96
132
+ localcrops_subset_of_globalcrops: false
133
+ share_color_jitter: false
134
+ horizontal_flips: true
135
+ evaluation:
136
+ eval_period_iterations: 12500
137
+ low_freq_every: 5
138
+ config_files:
139
+ high_freq: benchmark_high_frequency.yaml
140
+ low_freq: benchmark_low_frequency.yaml
141
+ checkpointing:
142
+ period: 3750
143
+ max_to_keep: 3
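The optim block pairs lr: 0.001 with scaling_rule: sqrt_wrt_1024. In this model family the rule is usually applied as lr * sqrt(total_batch_size / 1024); treat the formula below as an assumption inferred from the naming, not as the repo's exact code:

    import math

    # Assumed sqrt_wrt_1024 rule: scale the configured lr by sqrt(total batch size / 1024).
    def scaled_lr(base_lr, batch_size_per_gpu, num_gpus):
        return base_lr * math.sqrt(batch_size_per_gpu * num_gpus / 1024)

    # Header comment above: 64 images/GPU, total batch size 2048 (i.e. 32 GPUs) with lr 0.001.
    print(scaled_lr(0.001, batch_size_per_gpu=64, num_gpus=32))  # ~0.0014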
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ from .adapters import DatasetWithEnumeratedTargets
7
+ from .augmentations import DataAugmentationDINO
8
+ from .collate import collate_data_and_cast
9
+ from .loaders import SamplerType, make_data_loader, make_dataset
10
+ from .meta_loaders import CombinedDataLoader
11
+ from .masking import MaskingGenerator
12
+ from .transforms import make_classification_eval_transform, make_classification_train_transform
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/adapters.py ADDED
@@ -0,0 +1,68 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ from typing import Any, Optional, Tuple
7
+
8
+ from torch.utils.data import Dataset
9
+
10
+
11
+ def extend_samples_with_index(dataset_class):
12
+ class DatasetWithIndex(dataset_class):
13
+ def __init__(self, **kwargs) -> None:
14
+ root = dataset_class.get_root()
15
+ super().__init__(root=root, **kwargs)
16
+
17
+ def __getitem__(self, index: int):
18
+ image, target = super().__getitem__(index)
19
+ return image, target, index
20
+
21
+ return DatasetWithIndex
22
+
23
+
24
+ class DatasetWithEnumeratedTargets(Dataset):
25
+ """
26
+ If pad_dataset is set, pads based on torch's DistributedSampler implementation, which
27
+ with drop_last=False pads the last batch to be a multiple of the world size.
28
+ https://github.com/pytorch/pytorch/blob/main/torch/utils/data/distributed.py#L91
29
+ """
30
+
31
+ def __init__(self, dataset: Dataset, pad_dataset: bool = False, num_replicas: Optional[int] = None):
32
+ self._dataset = dataset
33
+ self._size = len(self._dataset)
34
+ self._padded_size = self._size
35
+ self._pad_dataset = pad_dataset
36
+ if self._pad_dataset:
37
+ assert num_replicas is not None, "num_replicas should be set if pad_dataset is True"
38
+ self._padded_size = num_replicas * ((len(dataset) + num_replicas - 1) // num_replicas)
39
+
40
+ def get_image_relpath(self, index: int) -> str:
41
+ assert self._pad_dataset or index < self._size
42
+ return self._dataset.get_image_relpath(index % self._size)
43
+
44
+ def get_image_data(self, index: int) -> bytes:
45
+ assert self._pad_dataset or index < self._size
46
+ return self._dataset.get_image_data(index % self._size)
47
+
48
+ def get_target(self, index: int) -> Tuple[Any, int]:
49
+ target = self._dataset.get_target(index % self._size)
50
+ if index >= self._size:
51
+ assert self._pad_dataset
52
+ return (-1, target)
53
+ return (index, target)
54
+
55
+ def get_sample_decoder(self, index: int) -> Any:
56
+ assert self._pad_dataset or index < self._size
57
+ return self._dataset.get_sample_decoder(index % self._size)
58
+
59
+ def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]:
60
+ image, target = self._dataset[index % self._size]
61
+ if index >= self._size:
62
+ assert self._pad_dataset
63
+ return image, (-1, target)
64
+ target = index if target is None else target
65
+ return image, (index, target)
66
+
67
+ def __len__(self) -> int:
68
+ return self._padded_size
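A minimal usage sketch for DatasetWithEnumeratedTargets with a toy in-memory dataset (the ToyDataset, its length, and the import path are illustrative assumptions; the wrapper behavior follows the code above):

    from torch.utils.data import Dataset

    from dinov3.data.adapters import DatasetWithEnumeratedTargets  # assumes dinov3 is importable

    class ToyDataset(Dataset):  # stand-in for a real image dataset
        def __len__(self):
            return 10

        def __getitem__(self, index):
            return f"image-{index}", index % 3  # (image, target)

    wrapped = DatasetWithEnumeratedTargets(ToyDataset(), pad_dataset=True, num_replicas=4)
    print(len(wrapped))   # 12: length is padded up to a multiple of num_replicas
    print(wrapped[3])     # ('image-3', (3, 0)): target becomes (sample index, original target)
    print(wrapped[11])    # ('image-1', (-1, 1)): padded tail samples are flagged with index -1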
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/augmentations.py ADDED
@@ -0,0 +1,227 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ import logging
7
+
8
+ import numpy as np
9
+ import torch
10
+ from torch import nn
11
+ from torchvision.transforms import v2
12
+
13
+ from dinov3.data.transforms import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, GaussianBlur, make_normalize_transform
14
+
15
+ logger = logging.getLogger("dinov3")
16
+
17
+
18
+ class DataAugmentationDINO(object):
19
+ def __init__(
20
+ self,
21
+ global_crops_scale,
22
+ local_crops_scale,
23
+ local_crops_number,
24
+ global_crops_size=224,
25
+ local_crops_size=96,
26
+ gram_teacher_crops_size=None,
27
+ gram_teacher_no_distortions=False,
28
+ teacher_no_color_jitter=False,
29
+ local_crops_subset_of_global_crops=False,
30
+ patch_size=16,
31
+ share_color_jitter=False,
32
+ horizontal_flips=True,
33
+ mean=IMAGENET_DEFAULT_MEAN,
34
+ std=IMAGENET_DEFAULT_STD,
35
+ ):
36
+ self.global_crops_scale = global_crops_scale
37
+ self.local_crops_scale = local_crops_scale
38
+ self.local_crops_number = local_crops_number
39
+ self.global_crops_size = global_crops_size
40
+ self.local_crops_size = local_crops_size
41
+ self.gram_teacher_crops_size = gram_teacher_crops_size
42
+ self.gram_teacher_no_distortions = gram_teacher_no_distortions
43
+ self.teacher_no_color_jitter = teacher_no_color_jitter
44
+ self.local_crops_subset_of_global_crops = local_crops_subset_of_global_crops
45
+ self.patch_size = patch_size
46
+ self.share_color_jitter = share_color_jitter
47
+ self.mean = mean
48
+ self.std = std
49
+
50
+ logger.info("###################################")
51
+ logger.info("Using data augmentation parameters:")
52
+ logger.info(f"global_crops_scale: {global_crops_scale}")
53
+ logger.info(f"local_crops_scale: {local_crops_scale}")
54
+ logger.info(f"local_crops_number: {local_crops_number}")
55
+ logger.info(f"global_crops_size: {global_crops_size}")
56
+ logger.info(f"local_crops_size: {local_crops_size}")
57
+ logger.info(f"gram_crops_size: {gram_teacher_crops_size}")
58
+ logger.info(f"gram_teacher_no_distortions: {gram_teacher_no_distortions}")
59
+ logger.info(f"teacher_no_color_jitter: {teacher_no_color_jitter}")
60
+ logger.info(f"local_crops_subset_of_global_crops: {local_crops_subset_of_global_crops}")
61
+ logger.info(f"patch_size if local_crops_subset_of_global_crops: {patch_size}")
62
+ logger.info(f"share_color_jitter: {share_color_jitter}")
63
+ logger.info(f"horizontal flips: {horizontal_flips}")
64
+ logger.info("###################################")
65
+
66
+ # Global crops and gram teacher crops can have different sizes. We first take a crop of the maximum size
67
+ # and then resize it to the desired size for global and gram teacher crops.
68
+ global_crop_max_size = max(global_crops_size, gram_teacher_crops_size if gram_teacher_crops_size else 0)
69
+
70
+ # random resized crop and flip
71
+ self.geometric_augmentation_global = v2.Compose(
72
+ [
73
+ v2.RandomResizedCrop(
74
+ global_crop_max_size,
75
+ scale=global_crops_scale,
76
+ interpolation=v2.InterpolationMode.BICUBIC,
77
+ ),
78
+ v2.RandomHorizontalFlip(p=0.5 if horizontal_flips else 0.0),
79
+ ]
80
+ )
81
+
82
+ resize_global = nn.Identity() # Resize transform applied to global crops after random crop
83
+ self.resize_global_post_transf = (
84
+ nn.Identity()
85
+ ) # Resize transform applied to global crops after all other transforms
86
+ self.resize_gram_teacher = None # Resize transform applied to crops for gram teacher
87
+ if gram_teacher_crops_size is not None:
88
+ # All resize transforms will do nothing if the crop size is already the desired size.
89
+ if gram_teacher_no_distortions:
90
+ # When there are no distortions for the gram teacher crop, we can resize before the distortions.
91
+ # This is the preferred order, because it keeps the image size for the augmentations consistent,
92
+ # which matters e.g. for GaussianBlur.
93
+ resize_global = v2.Resize(
94
+ global_crops_size,
95
+ interpolation=v2.InterpolationMode.BICUBIC,
96
+ )
97
+ else:
98
+ # When there are distortions for the gram teacher crop, we need to resize after the distortions,
99
+ # because the distortions are shared between global and gram teacher crops.
100
+ self.resize_global_post_transf = v2.Resize(
101
+ global_crops_size,
102
+ interpolation=v2.InterpolationMode.BICUBIC,
103
+ )
104
+
105
+ self.resize_gram_teacher = v2.Resize(
106
+ gram_teacher_crops_size,
107
+ interpolation=v2.InterpolationMode.BICUBIC,
108
+ )
109
+
110
+ self.geometric_augmentation_local = v2.Compose(
111
+ [
112
+ v2.RandomResizedCrop(
113
+ local_crops_size,
114
+ scale=local_crops_scale,
115
+ interpolation=v2.InterpolationMode.BICUBIC,
116
+ ),
117
+ v2.RandomHorizontalFlip(p=0.5 if horizontal_flips else 0.0),
118
+ ]
119
+ )
120
+
121
+ # color distortions / blurring
122
+ color_jittering = v2.Compose(
123
+ [
124
+ v2.RandomApply(
125
+ [v2.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
126
+ p=0.8,
127
+ ),
128
+ v2.RandomGrayscale(p=0.2),
129
+ ]
130
+ )
131
+
132
+ global_transfo1_extra = GaussianBlur(p=1.0)
133
+
134
+ global_transfo2_extra = v2.Compose(
135
+ [
136
+ GaussianBlur(p=0.1),
137
+ v2.RandomSolarize(threshold=128, p=0.2),
138
+ ]
139
+ )
140
+
141
+ local_transfo_extra = GaussianBlur(p=0.5)
142
+
143
+ # normalization
144
+ self.normalize = v2.Compose(
145
+ [
146
+ v2.ToImage(),
147
+ v2.ToDtype(torch.float32, scale=True),
148
+ make_normalize_transform(mean=mean, std=std),
149
+ ]
150
+ )
151
+
152
+ if self.share_color_jitter:
153
+ self.color_jittering = color_jittering
154
+ self.global_transfo1 = v2.Compose([resize_global, global_transfo1_extra, self.normalize])
155
+ self.global_transfo2 = v2.Compose([resize_global, global_transfo2_extra, self.normalize])
156
+ self.local_transfo = v2.Compose([local_transfo_extra, self.normalize])
157
+ else:
158
+ self.global_transfo1 = v2.Compose(
159
+ [resize_global, color_jittering, global_transfo1_extra, self.normalize]
160
+ )
161
+ self.global_transfo2 = v2.Compose(
162
+ [resize_global, color_jittering, global_transfo2_extra, self.normalize]
163
+ )
164
+ self.local_transfo = v2.Compose([color_jittering, local_transfo_extra, self.normalize])
165
+
166
+ def __call__(self, image):
167
+ output = {}
168
+ output["weak_flag"] = True # leftover from mugs
169
+
170
+ if self.share_color_jitter:
171
+ image = self.color_jittering(image)
172
+
173
+ # global crops:
174
+ im1_base = self.geometric_augmentation_global(image)
175
+ global_crop_1_transf = self.global_transfo1(im1_base)
176
+ global_crop_1 = self.resize_global_post_transf(global_crop_1_transf)
177
+
178
+ im2_base = self.geometric_augmentation_global(image)
179
+ global_crop_2_transf = self.global_transfo2(im2_base)
180
+ global_crop_2 = self.resize_global_post_transf(global_crop_2_transf)
181
+
182
+ output["global_crops"] = [global_crop_1, global_crop_2]
183
+
184
+ # global crops for teacher:
185
+ if self.teacher_no_color_jitter:
186
+ output["global_crops_teacher"] = [
187
+ self.normalize(im1_base),
188
+ self.normalize(im2_base),
189
+ ]
190
+ else:
191
+ output["global_crops_teacher"] = [global_crop_1, global_crop_2]
192
+
193
+ if self.gram_teacher_crops_size is not None:
194
+ # crops for gram teacher:
195
+ if self.gram_teacher_no_distortions:
196
+ gram_crop_1 = self.normalize(self.resize_gram_teacher(im1_base))
197
+ gram_crop_2 = self.normalize(self.resize_gram_teacher(im2_base))
198
+ else:
199
+ gram_crop_1 = self.resize_gram_teacher(global_crop_1_transf)
200
+ gram_crop_2 = self.resize_gram_teacher(global_crop_2_transf)
201
+ output["gram_teacher_crops"] = [gram_crop_1, gram_crop_2]
202
+
203
+ # local crops:
204
+ if self.local_crops_subset_of_global_crops:
205
+ _local_crops = [self.local_transfo(im1_base) for _ in range(self.local_crops_number // 2)] + [
206
+ self.local_transfo(im2_base) for _ in range(self.local_crops_number // 2)
207
+ ]
208
+
209
+ local_crops = []
210
+ offsets = []
211
+ gs = self.global_crops_size
212
+ ls = self.local_crops_size
213
+ for img in _local_crops:
214
+ rx, ry = np.random.randint(0, (gs - ls) // self.patch_size, 2) * self.patch_size
215
+ local_crops.append(img[:, rx : rx + ls, ry : ry + ls])
216
+ offsets.append((rx, ry))
217
+
218
+ output["local_crops"] = local_crops
219
+ output["offsets"] = offsets
220
+ else:
221
+ local_crops = [
222
+ self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number)
223
+ ]
224
+ output["local_crops"] = local_crops
225
+ output["offsets"] = ()
226
+
227
+ return output
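A small usage sketch for DataAugmentationDINO; the crop sizes and scales mirror the configs above, the random PIL image is a placeholder, and the import path assumes the dinov3 package from this repo is on PYTHONPATH:

    import numpy as np
    from PIL import Image

    from dinov3.data.augmentations import DataAugmentationDINO  # path assumed importable

    aug = DataAugmentationDINO(
        global_crops_scale=(0.32, 1.0),
        local_crops_scale=(0.05, 0.32),
        local_crops_number=8,
        global_crops_size=256,
        local_crops_size=112,
    )

    img = Image.fromarray(np.random.randint(0, 255, (512, 512, 3), dtype=np.uint8))
    out = aug(img)
    print(len(out["global_crops"]), tuple(out["global_crops"][0].shape))  # 2 (3, 256, 256)
    print(len(out["local_crops"]), tuple(out["local_crops"][0].shape))    # 8 (3, 112, 112)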
depth_anything_v2_metric/depth_anything_v2/dinov3/dinov3/data/collate.py ADDED
@@ -0,0 +1,125 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This software may be used and distributed in accordance with
4
+ # the terms of the DINOv3 License Agreement.
5
+
6
+ import random
7
+
8
+ import torch
9
+
10
+
11
+ def collate_data_and_cast(
12
+ samples_list,
13
+ mask_ratio_tuple,
14
+ mask_probability,
15
+ dtype,
16
+ n_tokens=None,
17
+ mask_generator=None,
18
+ random_circular_shift=False,
19
+ local_batch_size=None,
20
+ ):
21
+ n_global_crops = len(samples_list[0][0]["global_crops"])
22
+ n_local_crops = len(samples_list[0][0]["local_crops"])
23
+
24
+ collated_global_crops = torch.stack(
25
+ [s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]
26
+ ) # [n_global_crops, B, ...]
27
+ collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list])
28
+ if "gram_teacher_crops" in samples_list[0][0]:
29
+ collated_gram_teacher_crops = torch.stack(
30
+ [s[0]["gram_teacher_crops"][i] for i in range(n_global_crops) for s in samples_list]
31
+ ) # [n_global_crops, B, ...]
32
+ else:
33
+ collated_gram_teacher_crops = None
34
+
35
+ if local_batch_size is not None:
36
+ # multi-distillation case, number of masks is different because the number of samples masked
37
+ # is different from the number of samples passed into the teacher initially
38
+ B = n_global_crops * local_batch_size
39
+ else:
40
+ B = len(collated_global_crops)
41
+ N = n_tokens
42
+ n_samples_masked = int(B * mask_probability)
43
+ probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1)
44
+ upperbound = 0
45
+ masks_list = []
46
+ for i in range(0, n_samples_masked):
47
+ prob_max = probs[i + 1]
48
+ mask = torch.BoolTensor(mask_generator(int(N * prob_max)))
49
+ if random_circular_shift: # apply a random circular shift to the mask
50
+ shift_x, shift_y = (
51
+ random.randint(0, mask.shape[0] - 1),
52
+ random.randint(0, mask.shape[1] - 1),
53
+ )
54
+ mask = torch.roll(mask, (shift_x, shift_y), (0, 1))
55
+ masks_list.append(mask)
56
+ upperbound += int(N * prob_max)
57
+ for _ in range(n_samples_masked, B):
58
+ masks_list.append(torch.BoolTensor(mask_generator(0)))
59
+
60
+ random.shuffle(masks_list)
61
+
62
+ collated_masks = torch.stack(masks_list).flatten(1)
63
+ mask_indices_list = collated_masks.flatten().nonzero().flatten()
64
+
65
+ masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks]
66
+
67
+ out = {
68
+ "collated_global_crops": collated_global_crops.to(dtype),
69
+ "collated_local_crops": collated_local_crops.to(dtype),
70
+ "collated_masks": collated_masks,
71
+ "mask_indices_list": mask_indices_list,
72
+ "masks_weight": masks_weight,
73
+ "upperbound": upperbound,
74
+ "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long),
75
+ }
76
+ if collated_gram_teacher_crops is not None:
77
+ out["collated_gram_teacher_crops"] = collated_gram_teacher_crops.to(dtype)
78
+ return out
79
+
80
+
81
+ # def get_batch_subset(collated_data_batch, target_bs):
82
+ def get_batch_subset(collated_data_batch, divide_by):
83
+ old_bs = collated_data_batch["collated_global_crops"].shape[0] // 2
84
+ target_bs = (old_bs + divide_by - 1) // divide_by
85
+ collated_global_crops = (
86
+ collated_data_batch["collated_global_crops"].unflatten(0, (2, old_bs)).narrow(1, 0, target_bs).flatten(0, 1)
87
+ )
88
+ collated_local_crops = (
89
+ collated_data_batch["collated_local_crops"].unflatten(0, (-1, old_bs)).narrow(1, 0, target_bs).flatten(0, 1)
90
+ )
91
+
92
+ masks_old_bs = collated_data_batch["collated_masks"].shape[0] // 2
93
+ masks_target_bs = masks_old_bs // divide_by
94
+ collated_masks = (
95
+ collated_data_batch["collated_masks"]
96
+ .unflatten(0, (2, masks_old_bs))
97
+ .narrow(1, 0, masks_target_bs)
98
+ .flatten(0, 1)
99
+ )
100
+ mask_indices_list = collated_masks.flatten().nonzero().flatten()
101
+
102
+ while mask_indices_list.shape[0] == 0:
103
+ _unbind = list(collated_data_batch["collated_masks"].unbind(0))
104
+ random.shuffle(_unbind)
105
+ _bind = torch.stack(_unbind, dim=0)
106
+ collated_masks = _bind.unflatten(0, (2, masks_old_bs)).narrow(1, 0, masks_target_bs).flatten(0, 1)
107
+ mask_indices_list = collated_masks.flatten().nonzero().flatten()
108
+
109
+ masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks]
110
+ upperbound = collated_data_batch["upperbound"]
111
+
112
+ new_batch = {
113
+ "collated_global_crops": collated_global_crops,
114
+ "collated_local_crops": collated_local_crops,
115
+ "collated_masks": collated_masks,
116
+ "mask_indices_list": mask_indices_list,
117
+ "masks_weight": masks_weight,
118
+ "upperbound": upperbound,
119
+ "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long),
120
+ }
121
+
122
+ if "global_batch_size" in collated_data_batch.keys():
123
+ new_batch["global_batch_size"] = collated_data_batch["global_batch_size"] // divide_by
124
+
125
+ return new_batch
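A usage sketch for collate_data_and_cast with a toy mask generator standing in for MaskingGenerator (which is not shown in this diff); the tensor shapes, the fake samples, and the import path are illustrative assumptions:

    import numpy as np
    import torch

    from dinov3.data.collate import collate_data_and_cast  # path assumed importable

    def toy_mask_generator(n, grid=16):
        # stand-in for MaskingGenerator: (grid, grid) boolean patch mask with ~n masked patches
        flat = np.zeros(grid * grid, dtype=bool)
        flat[:n] = True
        np.random.shuffle(flat)
        return flat.reshape(grid, grid)

    # Two fake samples shaped like DataAugmentationDINO outputs, paired with a dummy target.
    sample = {
        "global_crops": [torch.rand(3, 256, 256) for _ in range(2)],
        "local_crops": [torch.rand(3, 112, 112) for _ in range(8)],
    }
    batch = collate_data_and_cast(
        [(sample, None), (sample, None)],
        mask_ratio_tuple=(0.1, 0.5),
        mask_probability=0.5,
        dtype=torch.bfloat16,
        n_tokens=16 * 16,              # 256px crops, patch size 16 -> 16x16 patch tokens
        mask_generator=toy_mask_generator,
    )
    print(batch["collated_global_crops"].shape)  # (4, 3, 256, 256): 2 global crops x 2 samples
    print(batch["collated_masks"].shape)         # (4, 256): one flattened patch mask per global crop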