Spaces:

EgoHackZero
/

midas-depth-estimation

Sleeping

App Files Files Community

EgoHackZero committed on May 15

Commit

6225fda

1 Parent(s): fa1476f

try to add segmentation step

Browse files

Files changed (1) hide show

app.py +62 -34

app.py CHANGED Viewed

@@ -1,64 +1,92 @@
-import torch
-import gradio as gr
 import numpy as np
 import cv2
 from PIL import Image
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Load the model
 midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
-midas.to(device)
-midas.eval()
-# Load the transforms
 midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
 transform = midas_transforms.dpt_transform
-def predict_depth(image):
-    # ======= 1. Convert to OpenCV format =======
-    if not isinstance(image, Image.Image):
-        image = Image.fromarray(image)
-    image_np = np.array(image)
-    # OpenCV reads in BGR, but image_np is most likely already in RGB
-    img_rgb = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
-    img_rgb = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2RGB)  # Double check, just in case
-    # ======= 2. Transform as in the official tutorial =======
-    input_tensor = transform(img_rgb).to(device)  # shape: [3, H, W]
-    # ======= 3. Add the batch dimension =======
-    if len(input_tensor.shape) == 3:
-        input_batch = input_tensor.unsqueeze(0)  # shape: [1, 3, H, W]
-    else:
-        input_batch = input_tensor  # Already a batch
-    # ======= 4. Prediction =======
     with torch.no_grad():
         prediction = midas(input_batch)
         prediction = torch.nn.functional.interpolate(
             prediction.unsqueeze(1),
-            size=(img_rgb.shape[0], img_rgb.shape[1]),  # (H, W)
             mode="bicubic",
-            align_corners=False,
         ).squeeze()
-    # ======= 5. Normalize and convert to an image =======
     depth_map = prediction.cpu().numpy()
     depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
     depth_map = (depth_map * 255).astype(np.uint8)
-    depth_img = Image.fromarray(depth_map)
-    return depth_img
-# Gradio interface
 iface = gr.Interface(
-    fn=predict_depth,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Image(type="pil"),
-    title="MiDaS Depth Estimation",
-    description="Drop img -> depth map"
 )
 if __name__ == "__main__":

 import numpy as np
+import torch
 import cv2
 from PIL import Image
+from transformers import pipeline
+import gradio as gr
+# ===== Device Setup =====
+# `device` is the torch device MiDaS runs on; `device_index` is the integer
+# form (0 when CUDA is available, otherwise -1) passed as `device` to the
+# segmentation pipeline below.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+device_index = 0 if torch.cuda.is_available() else -1
+# ===== MiDaS Depth Estimation Setup =====
+# Load MiDaS model and transforms
+# The DPT_Large model is fetched through torch.hub, moved to `device`, and
+# switched to eval mode.
 midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
+midas.to(device).eval()
+# `transform` is the DPT preprocessing callable taken from the MiDaS
+# transforms bundle; predict_depth applies it to the input array.
 midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
 transform = midas_transforms.dpt_transform
+# ===== Segmentation Setup =====
+# Image-segmentation pipeline built on the SegFormer-B0 ADE checkpoint.
+# float16 is requested only when running on CUDA; otherwise float32.
+segmenter = pipeline(
+    "image-segmentation",
+    model="nvidia/segformer-b0-finetuned-ade-512-512",
+    device=device_index,
+    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
+)
+# ===== Utility Functions =====
+def resize_image(img: Image.Image, max_size: int = 512) -> Image.Image:
+    """Shrink ``img`` so that its longer side is at most ``max_size`` pixels.
+
+    The aspect ratio is kept (up to integer truncation) and LANCZOS
+    resampling is used. Images that already fit are returned unchanged,
+    i.e. the very same object is handed back.
+
+    Args:
+        img: Image to downscale.
+        max_size: Upper bound, in pixels, for the longer side.
+
+    Returns:
+        The original image when no resize is needed, otherwise a resized copy.
+    """
+    width, height = img.size
+    longest = max(width, height)
+    if longest <= max_size:
+        return img
+    ratio = max_size / longest
+    # int() truncates, so the short side of a very elongated image could
+    # collapse to 0, which is not a usable target size; keep at least 1 pixel.
+    new_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
+    return img.resize(new_size, Image.LANCZOS)
+# ===== Depth Prediction =====
+def predict_depth(image: Image.Image) -> Image.Image:
+    """Estimate a depth map for ``image`` with the MiDaS model.
+
+    Args:
+        image: Input picture. A PIL image is converted to RGB; any other
+            value is first wrapped with ``Image.fromarray``.
+
+    Returns:
+        An 8-bit PIL image holding the prediction, interpolated back to the
+        input's height and width and min-max scaled to 0-255.
+    """
+    # The previous check was inverted: it called ``.convert`` only on values
+    # that were NOT PIL images, and left real PIL images in whatever mode
+    # they arrived in. Wrap non-PIL input first, then always force RGB so
+    # grayscale/RGBA uploads reach the transform as three-channel arrays.
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(np.asarray(image))
+    img = image.convert('RGB')
+    img_np = np.array(img)
+    # Convert to the format expected by MiDaS
+    input_tensor = transform(img_np).to(device)
+    input_batch = input_tensor.unsqueeze(0) if input_tensor.ndim == 3 else input_tensor
+    # Predict depth
     with torch.no_grad():
         prediction = midas(input_batch)
         prediction = torch.nn.functional.interpolate(
             prediction.unsqueeze(1),
+            size=img_np.shape[:2],
             mode="bicubic",
+            align_corners=False
         ).squeeze()
+    # Normalize to 0-255
     depth_map = prediction.cpu().numpy()
+    # A completely flat prediction would make the min-max scaling below
+    # divide by zero; return an all-zero map of the same shape instead.
+    if depth_map.max() == depth_map.min():
+        return Image.fromarray(np.zeros(depth_map.shape, dtype=np.uint8))
     depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
     depth_map = (depth_map * 255).astype(np.uint8)
+    return Image.fromarray(depth_map)
+# ===== Segmentation =====
+def segment_image(img: Image.Image) -> Image.Image:
+    """Tint each segment reported by ``segmenter`` with a random colour.
+
+    The input is converted to RGB and passed through ``resize_image`` before
+    segmentation, so the returned overlay has the size of that resized image.
+    Colours are drawn anew on every call (channel values from 50 up to 254).
+    """
+    rgb_small = resize_image(img.convert('RGB'))
+    segments = segmenter(rgb_small)
+    canvas = np.array(rgb_small, dtype=np.uint8)
+    for segment in segments:
+        region = np.array(segment["mask"], dtype=bool)
+        tint = np.random.randint(50, 255, 3, dtype=np.uint8)
+        # 60% of the current pixels plus 40% of the tint, stored back as uint8
+        blended = canvas[region] * 0.6 + tint * 0.4
+        canvas[region] = blended.astype(np.uint8)
+    return Image.fromarray(canvas)
+# ===== Gradio App =====
+def predict_fn(input_img: Image.Image) -> Image.Image:
+    """Gradio callback: build the depth map, then segment that depth map."""
+    # predict_depth runs first; its output image is what gets segmented.
+    return segment_image(predict_depth(input_img))
 iface = gr.Interface(
+    # predict_fn receives the upload as a PIL image (type="pil"), and the
+    # PIL image it returns is shown in the output component.
+    fn=predict_fn,
+    inputs=gr.Image(type="pil", label="Upload Image"),
+    outputs=gr.Image(type="pil", label="Segmented Depth Overlay"),
+    title="Depth-then-Segmentation Pipeline",
+    description="Upload an image. First computes a depth map via MiDaS, then applies SegFormer segmentation on the depth map."
 )
 if __name__ == "__main__":