MohmedAnik committed on
Commit 06f2523 · verified · 1 Parent(s): 6ebe9b0

Upload 11 files

Files changed (12)
  1. .gitattributes +1 -0
  2. app.py +78 -0
  3. examples/car.jpg +0 -0
  4. examples/iMAC.jpg +0 -0
  5. examples/pig.jpg +0 -0
  6. examples/statue.png +3 -0
  7. gitattributes +35 -0
  8. inference.py +49 -0
  9. paths.py +4 -0
  10. requirements.txt +10 -0
  11. utils.py +304 -0
  12. vision_tower.py +161 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ examples/statue.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,78 @@
+ import gradio as gr
+ from paths import *
+ import os
+ from vision_tower import DINOv2_MLP
+ from transformers import AutoImageProcessor
+ import torch
+ from inference import *
+ from utils import *
+
+ from huggingface_hub import hf_hub_download
+ ckpt_path = hf_hub_download(repo_id="Viglong/Orient-Anything", filename="ronormsigma1/dino_weight.pt", repo_type="model", cache_dir='./', resume_download=True)
+ print(ckpt_path)
+
+ save_path = './'
+ device = 'cpu'
+ dino = DINOv2_MLP(
+     dino_mode='large',
+     in_dim=1024,
+     out_dim=360+180+360+2,
+     evaluate=True,
+     mask_dino=False,
+     frozen_back=False
+ )
+
+ dino.eval()
+ print('model created')
+ dino.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
+ dino = dino.to(device)
+ print('weights loaded')
+ val_preprocess = AutoImageProcessor.from_pretrained(DINO_LARGE, cache_dir='./')
+
+ def infer_func(img, do_rm_bkg=True, do_infer_aug=False):
+     origin_img = Image.fromarray(img)
+     if do_infer_aug:
+         rm_bkg_img = background_preprocess(origin_img, True)
+         angles = get_3angle_infer_aug(origin_img, rm_bkg_img, dino, val_preprocess, device)
+     else:
+         rm_bkg_img = background_preprocess(origin_img, do_rm_bkg)
+         angles = get_3angle(rm_bkg_img, dino, val_preprocess, device)
+
+     phi = np.radians(angles[0])
+     theta = np.radians(angles[1])
+     gamma = angles[2]
+     confidence = float(angles[3])
+     if confidence > 0.5:
+         render_axis = render_3D_axis(phi, theta, gamma)
+         res_img = overlay_images_with_scaling(render_axis, rm_bkg_img)
+     else:
+         res_img = img
+
+     # axis_model = "axis.obj"
+     return [res_img, round(float(angles[0]), 2), round(float(angles[1]), 2), round(float(angles[2]), 2), round(float(angles[3]), 2)]
+
+ example_files = os.listdir('examples')
+ example_files.sort()
+ example_files = [[os.path.join('examples', filename), None, None] for filename in example_files]
+ print(example_files)
+ server = gr.Interface(
+     flagging_mode='never',
+     fn=infer_func,
+     examples=example_files,
+     cache_examples=False,
+     inputs=[
+         gr.Image(height=512, width=512, label="upload your image"),
+         gr.Checkbox(label="Remove Background", value=True),
+         gr.Checkbox(label="Inference-time augmentation", value=False)
+     ],
+     outputs=[
+         gr.Image(height=512, width=512, label="result image"),
+         # gr.Model3D(clear_color=[0.0, 0.0, 0.0, 0.0], label="3D Model"),
+         gr.Textbox(lines=1, label='Azimuth (0~360°): position of the viewer in the xy plane'),
+         gr.Textbox(lines=1, label='Polar (-90~90°): height at which the viewer is located'),
+         gr.Textbox(lines=1, label='Rotation (-90~90°): rotation angle of the viewer'),
+         gr.Textbox(lines=1, label='Confidence (0~1): whether the object has a meaningful orientation')
+     ]
+ )
+
+ server.launch()
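
A minimal sketch of calling infer_func directly, outside the Gradio UI — assuming the model and preprocessor above are already loaded in the same session and that examples/car.jpg exists locally:

    import numpy as np
    from PIL import Image

    # infer_func expects a numpy array (it calls Image.fromarray internally)
    img = np.array(Image.open('examples/car.jpg').convert('RGB'))
    res_img, azimuth, polar, rotation, confidence = infer_func(img, do_rm_bkg=True, do_infer_aug=False)
    print(azimuth, polar, rotation, confidence)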
examples/car.jpg ADDED
examples/iMAC.jpg ADDED
examples/pig.jpg ADDED
examples/statue.png ADDED

Git LFS Details

  • SHA256: bc88dd340ed4a6177207ecc649654b2c12ad82e949b4acdebf49ea94ff7597a5
  • Pointer size: 131 Bytes
  • Size of remote file: 330 kB
gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
inference.py ADDED
@@ -0,0 +1,49 @@
+ import torch
+ from PIL import Image
+ from utils import *
+ import torch.nn.functional as F
+ import numpy as np
+
+ def get_3angle(image, dino, val_preprocess, device):
+
+     # image = Image.open(image_path).convert('RGB')
+     image_inputs = val_preprocess(images=image)
+     image_inputs['pixel_values'] = torch.from_numpy(np.array(image_inputs['pixel_values'])).to(device)
+     with torch.no_grad():
+         dino_pred = dino(image_inputs)
+
+     gaus_ax_pred = torch.argmax(dino_pred[:, 0:360], dim=-1)
+     gaus_pl_pred = torch.argmax(dino_pred[:, 360:360+180], dim=-1)
+     gaus_ro_pred = torch.argmax(dino_pred[:, 360+180:360+180+360], dim=-1)
+     confidence = F.softmax(dino_pred[:, -2:], dim=-1)[0][0]
+     angles = torch.zeros(4)
+     angles[0] = gaus_ax_pred
+     angles[1] = gaus_pl_pred - 90
+     angles[2] = gaus_ro_pred - 180
+     angles[3] = confidence
+     return angles
+
+ def get_3angle_infer_aug(origin_img, rm_bkg_img, dino, val_preprocess, device):
+
+     # image = Image.open(image_path).convert('RGB')
+     image = get_crop_images(origin_img, num=3) + get_crop_images(rm_bkg_img, num=3)
+     image_inputs = val_preprocess(images=image)
+     image_inputs['pixel_values'] = torch.from_numpy(np.array(image_inputs['pixel_values'])).to(device)
+     with torch.no_grad():
+         dino_pred = dino(image_inputs)
+
+     gaus_ax_pred = torch.argmax(dino_pred[:, 0:360], dim=-1).to(torch.float32)
+     gaus_pl_pred = torch.argmax(dino_pred[:, 360:360+180], dim=-1).to(torch.float32)
+     gaus_ro_pred = torch.argmax(dino_pred[:, 360+180:360+180+360], dim=-1).to(torch.float32)
+
+     gaus_ax_pred = remove_outliers_and_average_circular(gaus_ax_pred)
+     gaus_pl_pred = remove_outliers_and_average(gaus_pl_pred)
+     gaus_ro_pred = remove_outliers_and_average(gaus_ro_pred)
+
+     confidence = torch.mean(F.softmax(dino_pred[:, -2:], dim=-1), dim=0)[0]
+     angles = torch.zeros(4)
+     angles[0] = gaus_ax_pred
+     angles[1] = gaus_pl_pred - 90
+     angles[2] = gaus_ro_pred - 180
+     angles[3] = confidence
+     return angles
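
A small, self-contained sketch of how one 360+180+360+2 = 902-dimensional prediction decodes into angles, using a dummy tensor in place of a real model output (the bin indices below are made up for illustration):

    import torch
    import torch.nn.functional as F

    dummy_pred = torch.zeros(1, 360 + 180 + 360 + 2)
    dummy_pred[0, 45] = 1.0          # azimuth bin 45
    dummy_pred[0, 360 + 120] = 1.0   # polar bin 120 -> 120 - 90 = 30
    dummy_pred[0, 540 + 200] = 1.0   # rotation bin 200 -> 200 - 180 = 20
    dummy_pred[0, -2] = 5.0          # raise the "has orientation" logit

    azimuth    = torch.argmax(dummy_pred[:, 0:360], dim=-1)           # tensor([45])
    polar      = torch.argmax(dummy_pred[:, 360:540], dim=-1) - 90    # tensor([30])
    rotation   = torch.argmax(dummy_pred[:, 540:900], dim=-1) - 180   # tensor([20])
    confidence = F.softmax(dummy_pred[:, -2:], dim=-1)[0][0]          # ~0.99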
paths.py ADDED
@@ -0,0 +1,4 @@
+ DINO_SMALL = "facebook/dinov2-small"
+ DINO_BASE = "facebook/dinov2-base"
+ DINO_LARGE = "facebook/dinov2-large"
+ DINO_GIANT = "facebook/dinov2-giant"
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch==2.2.1
+ transformers==4.38
+ matplotlib
+ pillow==10.2.0
+ huggingface-hub==0.26.5
+ gradio==5.9.0
+ numpy==1.26.4
+ onnxruntime
+ rembg
+ pydantic==2.10.6
utils.py ADDED
@@ -0,0 +1,304 @@
+ import rembg
+ import random
+ import torch
+ import numpy as np
+ from PIL import Image, ImageOps
+ import PIL
+ from typing import Any
+ import matplotlib.pyplot as plt
+ import io
+
+ def resize_foreground(
+     image: Image,
+     ratio: float,
+ ) -> Image:
+     image = np.array(image)
+     assert image.shape[-1] == 4
+     alpha = np.where(image[..., 3] > 0)
+     y1, y2, x1, x2 = (
+         alpha[0].min(),
+         alpha[0].max(),
+         alpha[1].min(),
+         alpha[1].max(),
+     )
+     # crop the foreground
+     fg = image[y1:y2, x1:x2]
+     # pad to square
+     size = max(fg.shape[0], fg.shape[1])
+     ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
+     ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
+     new_image = np.pad(
+         fg,
+         ((ph0, ph1), (pw0, pw1), (0, 0)),
+         mode="constant",
+         constant_values=((0, 0), (0, 0), (0, 0)),
+     )
+
+     # compute padding according to the ratio
+     new_size = int(new_image.shape[0] / ratio)
+     # pad to size, double side
+     ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
+     ph1, pw1 = new_size - size - ph0, new_size - size - pw0
+     new_image = np.pad(
+         new_image,
+         ((ph0, ph1), (pw0, pw1), (0, 0)),
+         mode="constant",
+         constant_values=((0, 0), (0, 0), (0, 0)),
+     )
+     new_image = Image.fromarray(new_image)
+     return new_image
+
+ def remove_background(image: Image,
+     rembg_session: Any = None,
+     force: bool = False,
+     **rembg_kwargs,
+ ) -> Image:
+     do_remove = True
+     if image.mode == "RGBA" and image.getextrema()[3][0] < 255:
+         do_remove = False
+     do_remove = do_remove or force
+     if do_remove:
+         image = rembg.remove(image, session=rembg_session, **rembg_kwargs)
+     return image
+
+ def random_crop(image, crop_scale=(0.8, 0.95)):
+     """
+     Randomly crop an image.
+     image (PIL.Image.Image): the input image.
+     crop_scale (tuple): (min_scale, max_scale).
+     """
+     assert isinstance(image, Image.Image), "input must be a PIL.Image.Image"
+     assert len(crop_scale) == 2 and 0 < crop_scale[0] <= crop_scale[1] <= 1
+
+     width, height = image.size
+
+     # compute the crop width and height
+     crop_width = random.randint(int(width * crop_scale[0]), int(width * crop_scale[1]))
+     crop_height = random.randint(int(height * crop_scale[0]), int(height * crop_scale[1]))
+
+     # randomly pick the top-left corner of the crop
+     left = random.randint(0, width - crop_width)
+     top = random.randint(0, height - crop_height)
+
+     # crop the image
+     cropped_image = image.crop((left, top, left + crop_width, top + crop_height))
+
+     return cropped_image
+
+ def get_crop_images(img, num=3):
+     cropped_images = []
+     for i in range(num):
+         cropped_images.append(random_crop(img))
+     return cropped_images
+
+ def background_preprocess(input_image, do_remove_background):
+
+     rembg_session = rembg.new_session() if do_remove_background else None
+
+     if do_remove_background:
+         input_image = remove_background(input_image, rembg_session)
+         input_image = resize_foreground(input_image, 0.85)
+
+     return input_image
+
+ def remove_outliers_and_average(tensor, threshold=1.5):
+     assert tensor.dim() == 1, "input tensor must be 1-dimensional"
+
+     q1 = torch.quantile(tensor, 0.25)
+     q3 = torch.quantile(tensor, 0.75)
+     iqr = q3 - q1
+
+     lower_bound = q1 - threshold * iqr
+     upper_bound = q3 + threshold * iqr
+
+     non_outliers = tensor[(tensor >= lower_bound) & (tensor <= upper_bound)]
+
+     if len(non_outliers) == 0:
+         return tensor.mean().item()
+
+     return non_outliers.mean().item()
+
+
+ def remove_outliers_and_average_circular(tensor, threshold=1.5):
+     assert tensor.dim() == 1, "input tensor must be 1-dimensional"
+
+     # map the angles to points on the unit circle
+     radians = tensor * torch.pi / 180.0
+     x_coords = torch.cos(radians)
+     y_coords = torch.sin(radians)
+
+     # compute the mean vector
+     mean_x = torch.mean(x_coords)
+     mean_y = torch.mean(y_coords)
+
+     differences = torch.sqrt((x_coords - mean_x) * (x_coords - mean_x) + (y_coords - mean_y) * (y_coords - mean_y))
+
+     # compute the quartiles and the IQR
+     q1 = torch.quantile(differences, 0.25)
+     q3 = torch.quantile(differences, 0.75)
+     iqr = q3 - q1
+
+     # compute the lower and upper bounds
+     lower_bound = q1 - threshold * iqr
+     upper_bound = q3 + threshold * iqr
+
+     # keep only the non-outliers
+     non_outliers = tensor[(differences >= lower_bound) & (differences <= upper_bound)]
+
+     if len(non_outliers) == 0:
+         mean_angle = torch.atan2(mean_y, mean_x) * 180.0 / torch.pi
+         mean_angle = (mean_angle + 360) % 360
+         return mean_angle  # no non-outliers left: fall back to the mean direction of all angles
+
+     # recompute the mean vector over the non-outliers
+     radians = non_outliers * torch.pi / 180.0
+     x_coords = torch.cos(radians)
+     y_coords = torch.sin(radians)
+
+     mean_x = torch.mean(x_coords)
+     mean_y = torch.mean(y_coords)
+
+     mean_angle = torch.atan2(mean_y, mean_x) * 180.0 / torch.pi
+     mean_angle = (mean_angle + 360) % 360
+
+     return mean_angle
+
+ def scale(x):
+     # print(x)
+     # if abs(x[0])<0.1 and abs(x[1])<0.1:
+
+     #     return x*5
+     # else:
+     #     return x
+     return x*3
+
+ def get_proj2D_XYZ(phi, theta, gamma):
+     x = np.array([-1*np.sin(phi)*np.cos(gamma) - np.cos(phi)*np.sin(theta)*np.sin(gamma), np.sin(phi)*np.sin(gamma) - np.cos(phi)*np.sin(theta)*np.cos(gamma)])
+     y = np.array([-1*np.cos(phi)*np.cos(gamma) + np.sin(phi)*np.sin(theta)*np.sin(gamma), np.cos(phi)*np.sin(gamma) + np.sin(phi)*np.sin(theta)*np.cos(gamma)])
+     z = np.array([np.cos(theta)*np.sin(gamma), np.cos(theta)*np.cos(gamma)])
+     x = scale(x)
+     y = scale(y)
+     z = scale(z)
+     return x, y, z
+
+ # draw one projected 3D coordinate axis
+ def draw_axis(ax, origin, vector, color, label=None):
+     ax.quiver(origin[0], origin[1], vector[0], vector[1], angles='xy', scale_units='xy', scale=1, color=color)
+     if label is not None:
+         ax.text(origin[0] + vector[0] * 1.1, origin[1] + vector[1] * 1.1, label, color=color, fontsize=12)
+
+ def matplotlib_2D_arrow(angles, rm_bkg_img):
+     fig, ax = plt.subplots(figsize=(8, 8))
+
+     # set the rotation angles
+     phi = np.radians(angles[0])
+     theta = np.radians(angles[1])
+     gamma = np.radians(-1*angles[2])
+
+     w, h = rm_bkg_img.size
+     if h > w:
+         extent = [-5*w/h, 5*w/h, -5, 5]
+     else:
+         extent = [-5, 5, -5*h/w, 5*h/w]
+     ax.imshow(rm_bkg_img, extent=extent, zorder=0, aspect='auto')  # extent sets the display range of the image
+
+     origin = np.array([0, 0])
+
+     # the rotated axis vectors
+     rot_x, rot_y, rot_z = get_proj2D_XYZ(phi, theta, gamma)
+
+     # draw arrow
+     arrow_attr = [{'point': rot_x, 'color': 'r', 'label': 'front'},
+                   {'point': rot_y, 'color': 'g', 'label': 'right'},
+                   {'point': rot_z, 'color': 'b', 'label': 'top'}]
+
+     if phi > 45 and phi <= 225:
+         order = [0, 1, 2]
+     elif phi > 225 and phi < 315:
+         order = [2, 0, 1]
+     else:
+         order = [2, 1, 0]
+
+     for i in range(3):
+         draw_axis(ax, origin, arrow_attr[order[i]]['point'], arrow_attr[order[i]]['color'], arrow_attr[order[i]]['label'])
+     # draw_axis(ax, origin, rot_y, 'g', label='right')
+     # draw_axis(ax, origin, rot_z, 'b', label='top')
+     # draw_axis(ax, origin, rot_x, 'r', label='front')
+
+     # hide the axes and the grid
+     ax.set_axis_off()
+     ax.grid(False)
+
+     # set the coordinate range
+     ax.set_xlim(-5, 5)
+     ax.set_ylim(-5, 5)
+
+ def figure_to_img(fig):
+     with io.BytesIO() as buf:
+         fig.savefig(buf, format='JPG', bbox_inches='tight')
+         buf.seek(0)
+         image = Image.open(buf).copy()
+     return image
+
+ from render import render, Model
+ import math
+ axis_model = Model("./axis.obj", texture_filename="./axis.png")
+ def render_3D_axis(phi, theta, gamma):
+     radius = 240
+     # camera_location = [radius * math.cos(phi), radius * math.sin(phi), radius * math.tan(theta)]
+     # print(camera_location)
+     camera_location = [-1*radius * math.cos(phi), -1*radius * math.tan(theta), radius * math.sin(phi)]
+     img = render(
+         # Model("res/jinx.obj", texture_filename="res/jinx.tga"),
+         axis_model,
+         height=512,
+         width=512,
+         filename="tmp_render.png",
+         cam_loc=camera_location
+     )
+     img = img.rotate(gamma)
+     return img
+
+ def overlay_images_with_scaling(center_image: Image.Image, background_image, target_size=(512, 512)):
+     """
+     Resize the foreground image to 512x512, scale the background image to fit, and overlay them center-aligned.
+     :param center_image: foreground image
+     :param background_image: background image
+     :param target_size: target size of the foreground image, (512, 512) by default
+     :return: the composited image
+     """
+     # make sure both input images are in RGBA mode
+     if center_image.mode != "RGBA":
+         center_image = center_image.convert("RGBA")
+     if background_image.mode != "RGBA":
+         background_image = background_image.convert("RGBA")
+
+     # resize the foreground image
+     center_image = center_image.resize(target_size)
+
+     # scale the background image so that it fits within the foreground size
+     bg_width, bg_height = background_image.size
+
+     # scale the background proportionally by its longer side
+     scale = target_size[0] / max(bg_width, bg_height)
+     new_width = int(bg_width * scale)
+     new_height = int(bg_height * scale)
+     resized_background = background_image.resize((new_width, new_height))
+     # compute the required padding
+     pad_width = target_size[0] - new_width
+     pad_height = target_size[1] - new_height
+
+     # split the padding between left/right and top/bottom
+     left = pad_width // 2
+     right = pad_width - left
+     top = pad_height // 2
+     bottom = pad_height - top
+
+     # add the padding
+     resized_background = ImageOps.expand(resized_background, border=(left, top, right, bottom), fill=(255, 255, 255, 255))
+
+     # paste the foreground onto the background
+     result = resized_background.copy()
+     result.paste(center_image, (0, 0), mask=center_image)
+
+     return result
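
A quick example of why remove_outliers_and_average_circular is used for the azimuth instead of a plain mean: angles straddling the 0°/360° wrap-around are averaged on the unit circle (the values below are illustrative):

    import torch
    angles = torch.tensor([358.0, 2.0, 4.0, 359.0])
    print(remove_outliers_and_average_circular(angles))  # ≈ 0.75°, whereas angles.mean() gives 180.75°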
vision_tower.py ADDED
@@ -0,0 +1,161 @@
+ import torch
+ from torch import nn
+ import torch.nn.init as init
+ import torch.nn.functional as F
+
+ from paths import *
+
+ from typing import Dict, List, Optional, Set, Tuple, Union
+ from transformers import AutoImageProcessor, AutoModel, Dinov2Model
+ from transformers.models.dinov2.modeling_dinov2 import Dinov2Embeddings
+ from transformers.models.dinov2.configuration_dinov2 import Dinov2Config
+ import numpy as np
+ from contextlib import nullcontext
+
+ def get_activation(activation):
+     if activation.lower() == 'gelu':
+         return nn.GELU()
+     elif activation.lower() == 'rrelu':
+         return nn.RReLU(inplace=True)
+     elif activation.lower() == 'selu':
+         return nn.SELU(inplace=True)
+     elif activation.lower() == 'silu':
+         return nn.SiLU(inplace=True)
+     elif activation.lower() == 'hardswish':
+         return nn.Hardswish(inplace=True)
+     elif activation.lower() == 'leakyrelu':
+         return nn.LeakyReLU(inplace=True)
+     elif activation.lower() == 'sigmoid':
+         return nn.Sigmoid()
+     elif activation.lower() == 'tanh':
+         return nn.Tanh()
+     else:
+         return nn.ReLU(inplace=True)
+
+
+
+ class MLP_dim(nn.Module):
+     def __init__(
+             self, in_dim=512, out_dim=1024, bias=True, activation='relu'):
+         super().__init__()
+         self.act = get_activation(activation)
+         self.net1 = nn.Sequential(
+             nn.Linear(in_dim, int(out_dim), bias=bias),
+             nn.BatchNorm1d(int(out_dim)),
+             self.act
+         )
+         self.net2 = nn.Sequential(
+             nn.Linear(int(out_dim), out_dim, bias=bias),
+             nn.BatchNorm1d(out_dim)
+         )
+
+     def forward(self, x):
+         return self.net2(self.net1(x))
+
+ class FLIP_Dinov2Embeddings(Dinov2Embeddings):
+     """
+     Construct the CLS token, mask token, position and patch embeddings.
+     """
+
+     def __init__(self, config: Dinov2Config) -> None:
+         super().__init__(config)
+
+     def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+         batch_size, _, height, width = pixel_values.shape
+         target_dtype = self.patch_embeddings.projection.weight.dtype
+         embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+         # add the [CLS] token to the embedded patch tokens
+         cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+         embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+         # add positional encoding to each token
+         embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+         if bool_masked_pos is not None:
+             # embeddings = torch.where(
+             #     bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+             # )
+             B, S, D = embeddings.shape
+             batch_indices = torch.arange(B).unsqueeze(1)
+             embeddings = embeddings[batch_indices, bool_masked_pos]
+
+         embeddings = self.dropout(embeddings)
+
+         return embeddings
+
+ class FLIP_DINOv2(Dinov2Model):
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.embeddings = FLIP_Dinov2Embeddings(config)
+
+ class DINOv2_MLP(nn.Module):
+     def __init__(self,
+                  dino_mode,
+                  in_dim,
+                  out_dim,
+                  evaluate,
+                  mask_dino,
+                  frozen_back
+                  ) -> None:
+         super().__init__()
+         # self.dinov2 = AutoModel.from_pretrained(DINO_BASE)
+         if dino_mode == 'base':
+             self.dinov2 = FLIP_DINOv2.from_pretrained(DINO_BASE, cache_dir='./')
+         elif dino_mode == 'large':
+             self.dinov2 = FLIP_DINOv2.from_pretrained(DINO_LARGE, cache_dir='./')
+         elif dino_mode == 'small':
+             self.dinov2 = FLIP_DINOv2.from_pretrained(DINO_SMALL, cache_dir='./')
+         elif dino_mode == 'giant':
+             self.dinov2 = FLIP_DINOv2.from_pretrained(DINO_GIANT, cache_dir='./')
+
+         self.down_sampler = MLP_dim(in_dim=in_dim, out_dim=out_dim)
+         self.random_mask = False
+         if not evaluate:
+             self.init_weights(self.down_sampler)
+             self.random_mask = mask_dino
+         if frozen_back:
+             self.forward_mode = torch.no_grad()
+         else:
+             self.forward_mode = nullcontext()
+
+     def forward(self, img_inputs):
+         device = self.get_device()
+         # print(img_inputs['pixel_values'].shape)
+
+         with self.forward_mode:
+             if self.random_mask:
+                 B = len(img_inputs['pixel_values'])
+                 S = 256
+                 indices = []
+                 for i in range(B):
+                     tmp = torch.randperm(S)[:S//2]
+                     tmp = tmp.sort().values + 1
+                     indices.append(tmp)
+                 indices = torch.stack(indices, dim=0)
+                 indices = torch.cat([torch.zeros(B, 1, dtype=torch.long, device='cpu'), indices], dim=1)
+                 # print(indices.shape)
+                 img_inputs['bool_masked_pos'] = indices.to(device)
+
+             dino_outputs = self.dinov2(**img_inputs)
+             dino_seq = dino_outputs.last_hidden_state
+             # B,S,_ = dino_seq.shape
+             # dino_seq = dino_seq.view(B*S,-1)
+             dino_seq = dino_seq[:, 0, :]
+
+             down_sample_out = self.down_sampler(dino_seq)
+             # down_sample_out = down_sample_out.view(B,S,-1)
+             # down_sample_out = down_sample_out[:,0,:]
+
+             return down_sample_out
+
+     def get_device(self):
+         return next(self.parameters()).device
+
+     def init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             init.xavier_uniform_(m.weight)
+             if m.bias is not None:
+                 init.constant_(m.bias, 0)
+
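
An untested sanity-check sketch for DINOv2_MLP, assuming network access to download facebook/dinov2-small and using in_dim=384, the hidden size of the small backbone (the demo above uses the large backbone with in_dim=1024):

    import torch
    from vision_tower import DINOv2_MLP

    model = DINOv2_MLP(dino_mode='small', in_dim=384, out_dim=360+180+360+2,
                       evaluate=True, mask_dino=False, frozen_back=False).eval()
    dummy = {'pixel_values': torch.randn(1, 3, 224, 224)}
    with torch.no_grad():
        out = model(dummy)
    print(out.shape)  # expected: torch.Size([1, 902])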