rt-detr-object-detection

Running on Zero

App Files Files Community

freddyaboulton HF Staff committed on Sep 13, 2024

Commit

6a95f1f

1 Parent(s): cbc2dd6

code

Browse files

Files changed (3) hide show

app.py +65 -21
draw_boxes.py +41 -0
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,21 +1,67 @@
 import spaces
 import gradio as gr
 import cv2
-import tempfile
-from ultralytics import YOLOv10
-model = YOLOv10.from_pretrained(f'jameslahm/yolov10n')
 @spaces.GPU
-def yolov10_inference(image, conf_threshold):
-    # Run the module-level YOLOv10 `model` on a single image and return the annotated result.
-    # `image` must expose `.size`; `conf_threshold` is forwarded as the `conf` argument of `model.predict`.
-    width, _ = image.size  # only the width is read; it is passed on as `imgsz`
-    import time
-    start = time.time()
-    results = model.predict(source=image, imgsz=width, conf=conf_threshold)
-    end = time.time()
-    annotated_image = results[0].plot()  # rendering of the first result
-    print("time", end - start)  # logs the wall-clock duration of the predict call
-    return annotated_image[:, :, ::-1]  # reverses the last axis -- presumably BGR -> RGB; TODO confirm
 css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
@@ -26,18 +72,18 @@ with gr.Blocks(css=css) as app:
     gr.HTML(
         """
     <h1 style='text-align: center'>
-    YOLOv10 Webcam Stream
     </h1>
     """)
     gr.HTML(
         """
         <h3 style='text-align: center'>
-        <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
         </h3>
         """)
     with gr.Column(elem_classes=["my-column"]):
         with gr.Group(elem_classes=["my-group"]):
-            image = gr.Image(type="pil", label="Image", sources="webcam")
             conf_threshold = gr.Slider(
                 label="Confidence Threshold",
                 minimum=0.0,
@@ -45,12 +91,10 @@ with gr.Blocks(css=css) as app:
                 step=0.05,
                 value=0.30,
             )
-        image.stream(
-            fn=yolov10_inference,
-            inputs=[image, conf_threshold],
-            outputs=[image],
-            stream_every=0.1,
-            time_limit=30
         )
 if __name__ == '__main__':

 import spaces
 import gradio as gr
 import cv2
+from PIL import Image
+from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
+from draw_boxes import draw_bounding_boxes
+image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")
 @spaces.GPU
+def stream_object_detection(video, conf_threshold):
+    """Annotate a video with RT-DETR detections and stream it back in segments.
+
+    Every third frame of ``video`` is sampled. Once roughly one second worth of
+    sampled frames has been collected, the batch is run through the model, the
+    detections are drawn onto the frames, and the result is written to a ``.ts``
+    segment whose path is yielded.
+
+    Args:
+        video: Path of the input video, as accepted by ``cv2.VideoCapture``.
+        conf_threshold: Minimum detection score; used both when post-processing
+            the model output and when drawing the boxes.
+
+    Yields:
+        str: Path of each finished segment file (``output_<n>.ts``), in order.
+    """
+    # Function-scope imports: the module's import block does not provide these names.
+    import numpy as np
+    import torch
+    from pathlib import Path
+    # Segments are written next to this file; `current_dir` was never defined in the original.
+    current_dir = Path(__file__).parent
+    cap = cv2.VideoCapture(video)
+    video_codec = cv2.VideoWriter_fourcc(*"x264") # type: ignore
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    # Only every third frame is kept, so segments play back at a third of the
+    # source rate. Clamp to 1 so sources below 3 fps still produce segments.
+    desired_fps = max(1, fps // 3)
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    def write_segment(frames, path):
+        # Detect objects on a batch of RGB frames and write the annotated frames to `path`.
+        inputs = image_processor(images=frames, return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(**inputs)
+        detections = image_processor.post_process_object_detection(
+            outputs,
+            # One (height, width) pair per frame; frames are (H, W, C) arrays.
+            target_sizes=torch.tensor([frames[0].shape[:2]] * len(frames)),
+            threshold=conf_threshold)
+        # The writer must use the sampled rate, not the source rate, or playback runs 3x too fast.
+        segment_file = cv2.VideoWriter(path, video_codec, desired_fps, (width, height)) # type: ignore
+        try:
+            for array, detection in zip(frames, detections):
+                # Each frame is drawn with its own detections and the caller's threshold.
+                pil_image = draw_bounding_boxes(Image.fromarray(array), detection, model, conf_threshold)
+                # Convert RGB back to BGR for OpenCV; copy() makes the reversed view contiguous.
+                segment_file.write(np.array(pil_image)[:, :, ::-1].copy())
+        finally:
+            segment_file.release()
+    n_frames = 0
+    n_chunks = 0
+    batch = []
+    try:
+        iterating, frame = cap.read()
+        while iterating:
+            if n_frames % 3 == 0:
+                batch.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            if len(batch) == desired_fps:
+                name = str(current_dir / f"output_{n_chunks}.ts")
+                write_segment(batch, name)
+                # Start a fresh batch; without this only the first segment was ever produced.
+                batch = []
+                n_chunks += 1
+                yield name
+            iterating, frame = cap.read()
+            n_frames += 1
+        # Flush the trailing partial batch instead of yielding an empty segment file.
+        if batch:
+            name = str(current_dir / f"output_{n_chunks}.ts")
+            write_segment(batch, name)
+            yield name
+    finally:
+        # Release the capture even if the consumer stops iterating early.
+        cap.release()
 css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
     gr.HTML(
         """
     <h1 style='text-align: center'>
+    Video Object Detection with RT-DETR
     </h1>
     """)
     gr.HTML(
         """
         <h3 style='text-align: center'>
+        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>github</a>
         </h3>
         """)
     with gr.Column(elem_classes=["my-column"]):
         with gr.Group(elem_classes=["my-group"]):
+            video = gr.Video(label="Video Source")
             conf_threshold = gr.Slider(
                 label="Confidence Threshold",
                 minimum=0.0,
                 step=0.05,
                 value=0.30,
             )
+        video.upload(
+            fn=stream_object_detection,
+            inputs=[video, conf_threshold],
+            outputs=[video],
         )
 if __name__ == '__main__':

draw_boxes.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import colorsys
+def get_color(label):
+    """Return an RGB colour for ``label`` that is the same on every run.
+
+    Args:
+        label: Class name (any object; its ``str()`` form is hashed).
+
+    Returns:
+        tuple: Three ints in the range 0-255 (red, green, blue).
+    """
+    import zlib
+    # The built-in hash() of a str is salted per process, so it would give a
+    # different colour after every restart. CRC32 of the UTF-8 bytes is stable.
+    hash_value = zlib.crc32(str(label).encode("utf-8"))
+    # Map the checksum onto 100 evenly spaced hues; saturation and value are fixed.
+    hue = (hash_value % 100) / 100.0
+    saturation = 0.7
+    value = 0.9
+    rgb = colorsys.hsv_to_rgb(hue, saturation, value)
+    return tuple(int(x * 255) for x in rgb)
+def draw_bounding_boxes(image: Image.Image, results: dict, model, threshold=0.3):
+    """Draw labelled detection boxes onto ``image`` and return it.
+
+    The image is drawn on in place; the returned object is the same image.
+
+    Args:
+        image: PIL image to annotate.
+        results: Mapping with parallel ``"scores"``, ``"labels"`` and ``"boxes"``
+            entries. Each label must support ``.item()`` and each box ``.tolist()``.
+        model: Object whose ``config.id2label`` maps label ids to class names.
+        threshold: Only detections with a score strictly above this are drawn.
+
+    Returns:
+        The annotated ``image``.
+    """
+    draw = ImageDraw.Draw(image)
+    font = ImageFont.load_default()
+    for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
+        if score > threshold:
+            label = model.config.id2label[label_id.item()]
+            box = [round(i, 2) for i in box.tolist()]
+            color = get_color(label)
+            # Draw bounding box
+            draw.rectangle(box, outline=color, width=3)
+            # Prepare text
+            text = f"{label}: {score:.2f}"
+            text_bbox = draw.textbbox((0, 0), text, font=font)
+            text_width = text_bbox[2] - text_bbox[0]
+            text_height = text_bbox[3] - text_bbox[1]
+            # The label sits just above the box. Clamp it to the top edge so a
+            # box touching the top of the frame does not push the label off-canvas.
+            label_top = max(box[1] - text_height - 4, 0)
+            label_bottom = label_top + text_height + 4
+            # Draw text background
+            draw.rectangle([box[0], label_top, box[0] + text_width, label_bottom], fill=color)
+            # Draw text
+            draw.text((box[0], label_top), text, fill="white", font=font)
+    return image
+import numpy as np

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
 safetensors==0.4.3
-git+https://github.com/THU-MIG/yolov10.git
 gradio-client @ git+https://github.com/gradio-app/gradio@66349fe26827e3a3c15b738a1177e95fec7f5554#subdirectory=client/python
 https://gradio-pypi-previews.s3.amazonaws.com/66349fe26827e3a3c15b738a1177e95fec7f5554/gradio-4.42.0-py3-none-any.whl

 safetensors==0.4.3
+transformers
 gradio-client @ git+https://github.com/gradio-app/gradio@66349fe26827e3a3c15b738a1177e95fec7f5554#subdirectory=client/python
 https://gradio-pypi-previews.s3.amazonaws.com/66349fe26827e3a3c15b738a1177e95fec7f5554/gradio-4.42.0-py3-none-any.whl