import gradio as gr
from huggingface_hub import from_pretrained_keras
from PIL import Image
import io
import matplotlib.pyplot as plt
import os
import re
import zipfile
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

# Collect up to 10 images from the local coco/images folder to use as Gradio examples.
coco_image = []
coco_dir = 'coco/images/'
for idx, images in enumerate(os.listdir(coco_dir)):
    image = os.path.join(coco_dir, images)
    if os.path.isfile(image) and idx < 10:
        coco_image.append(image)

# Load COCO 2017 metadata so label ids can be mapped to class names.
_, dataset_info = tfds.load(
    "coco/2017", split=["train", "validation", "test"], with_info=True, data_dir="data"
)
# test_dataset = tfds.load("coco/2017", split="test", data_dir="data")
int2str = dataset_info.features["objects"]["label"].int2str
class AnchorBox:
    """Generates anchor boxes.

    This class has operations to generate anchor boxes for feature maps at
    strides `[8, 16, 32, 64, 128]`. Each anchor box is of the format
    `[x, y, width, height]`.

    Attributes:
        aspect_ratios: A list of float values representing the aspect ratios of
            the anchor boxes at each location on the feature map.
        scales: A list of float values representing the scale of the anchor boxes
            at each location on the feature map.
        num_anchors: The number of anchor boxes at each location on the feature map.
        areas: A list of float values representing the areas of the anchor
            boxes for each feature map in the feature pyramid.
        strides: A list of float values representing the strides for each feature
            map in the feature pyramid.
    """

    def __init__(self):
        self.aspect_ratios = [0.5, 1.0, 2.0]
        self.scales = [2 ** x for x in [0, 1 / 3, 2 / 3]]
        self._num_anchors = len(self.aspect_ratios) * len(self.scales)
        self._strides = [2 ** i for i in range(3, 8)]
        self._areas = [x ** 2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]]
        self._anchor_dims = self._compute_dims()
    def _compute_dims(self):
        """Computes anchor box dimensions for all ratios and scales at all levels
        of the feature pyramid.
        """
        anchor_dims_all = []
        for area in self._areas:
            anchor_dims = []
            for ratio in self.aspect_ratios:
                anchor_height = tf.math.sqrt(area / ratio)
                anchor_width = area / anchor_height
                dims = tf.reshape(
                    tf.stack([anchor_width, anchor_height], axis=-1), [1, 1, 2]
                )
                for scale in self.scales:
                    anchor_dims.append(scale * dims)
            anchor_dims_all.append(tf.stack(anchor_dims, axis=-2))
        return anchor_dims_all
    def _get_anchors(self, feature_height, feature_width, level):
        """Generates anchor boxes for a given feature map size and level.

        Arguments:
            feature_height: An integer representing the height of the feature map.
            feature_width: An integer representing the width of the feature map.
            level: An integer representing the level of the feature map in the
                feature pyramid.

        Returns:
            anchor boxes with the shape
            `(feature_height * feature_width * num_anchors, 4)`
        """
        rx = tf.range(feature_width, dtype=tf.float32) + 0.5
        ry = tf.range(feature_height, dtype=tf.float32) + 0.5
        centers = tf.stack(tf.meshgrid(rx, ry), axis=-1) * self._strides[level - 3]
        centers = tf.expand_dims(centers, axis=-2)
        centers = tf.tile(centers, [1, 1, self._num_anchors, 1])
        dims = tf.tile(
            self._anchor_dims[level - 3], [feature_height, feature_width, 1, 1]
        )
        anchors = tf.concat([centers, dims], axis=-1)
        return tf.reshape(
            anchors, [feature_height * feature_width * self._num_anchors, 4]
        )
    def get_anchors(self, image_height, image_width):
        """Generates anchor boxes for all the feature maps of the feature pyramid.

        Arguments:
            image_height: Height of the input image.
            image_width: Width of the input image.

        Returns:
            anchor boxes for all the feature maps, stacked as a single tensor
            with shape `(total_anchors, 4)`
        """
        anchors = [
            self._get_anchors(
                tf.math.ceil(image_height / 2 ** i),
                tf.math.ceil(image_width / 2 ** i),
                i,
            )
            for i in range(3, 8)
        ]
        return tf.concat(anchors, axis=0)
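
# Illustrative note (added for clarity, not in the original Space code): with 3
# aspect ratios and 3 scales there are 9 anchors per location, so for a 640x640
# input the pyramid levels P3-P7 contribute
# (80*80 + 40*40 + 20*20 + 10*10 + 5*5) * 9 = 76725 anchors in total, e.g.
#   AnchorBox().get_anchors(640.0, 640.0)  # -> tensor of shape (76725, 4)
# where each row is in `[x_center, y_center, width, height]` format.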
class DecodePredictions(tf.keras.layers.Layer):
    """A Keras layer that decodes predictions of the RetinaNet model.

    Attributes:
        num_classes: Number of classes in the dataset.
        confidence_threshold: Minimum class probability, below which detections
            are pruned.
        nms_iou_threshold: IOU threshold for the NMS operation.
        max_detections_per_class: Maximum number of detections to retain per
            class.
        max_detections: Maximum number of detections to retain across all
            classes.
        box_variance: The scaling factors used to scale the bounding box
            predictions.
    """

    def __init__(
        self,
        num_classes=80,
        confidence_threshold=0.05,
        nms_iou_threshold=0.5,
        max_detections_per_class=100,
        max_detections=100,
        box_variance=[0.1, 0.1, 0.2, 0.2],
        **kwargs
    ):
        super(DecodePredictions, self).__init__(**kwargs)
        self.num_classes = num_classes
        self.confidence_threshold = confidence_threshold
        self.nms_iou_threshold = nms_iou_threshold
        self.max_detections_per_class = max_detections_per_class
        self.max_detections = max_detections
        self._anchor_box = AnchorBox()
        # Use the box_variance argument rather than a hard-coded list, so the
        # parameter actually takes effect (defaults to [0.1, 0.1, 0.2, 0.2]).
        self._box_variance = tf.convert_to_tensor(
            box_variance, dtype=tf.float32
        )
    def _decode_box_predictions(self, anchor_boxes, box_predictions):
        boxes = box_predictions * self._box_variance
        boxes = tf.concat(
            [
                boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2],
                tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:],
            ],
            axis=-1,
        )
        boxes_transformed = convert_to_corners(boxes)
        return boxes_transformed

    def call(self, images, predictions):
        image_shape = tf.cast(tf.shape(images), dtype=tf.float32)
        anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2])
        box_predictions = predictions[:, :, :4]
        cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:])
        boxes = self._decode_box_predictions(anchor_boxes[None, ...], box_predictions)
        return tf.image.combined_non_max_suppression(
            tf.expand_dims(boxes, axis=2),
            cls_predictions,
            self.max_detections_per_class,
            self.max_detections,
            self.nms_iou_threshold,
            self.confidence_threshold,
            clip_boxes=False,
        )
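
# Note on the decoding above (added for clarity, not in the original code): the
# raw outputs [dx, dy, dw, dh] are first scaled by the box variances
# [0.1, 0.1, 0.2, 0.2]; an anchor [xa, ya, wa, ha] is then decoded as
#   x = dx * 0.1 * wa + xa,   y = dy * 0.1 * ha + ya,
#   w = exp(dw * 0.2) * wa,   h = exp(dh * 0.2) * ha,
# and the resulting center/size boxes are converted to corner format for NMS.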
def convert_to_corners(boxes):
    """Changes the box format to corner coordinates.

    Arguments:
        boxes: A tensor of rank 2 or higher with a shape of `(..., num_boxes, 4)`
            representing bounding boxes where each box is of the format
            `[x, y, width, height]`.

    Returns:
        converted boxes with shape same as that of boxes.
    """
    return tf.concat(
        [boxes[..., :2] - boxes[..., 2:] / 2.0, boxes[..., :2] + boxes[..., 2:] / 2.0],
        axis=-1,
    )
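
# Example (illustrative, not in the original code): a box [x=50, y=50, w=20, h=10]
# in center/size format becomes [40, 45, 60, 55] in [x_min, y_min, x_max, y_max].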
def resize_and_pad_image(
    image, min_side=800.0, max_side=1333.0, jitter=[640, 1024], stride=128.0
):
    """Resizes and pads image while preserving aspect ratio.

    1. Resizes images so that the shorter side is equal to `min_side`.
    2. If the longer side is greater than `max_side`, then resize the image
       with longer side equal to `max_side`.
    3. Pad with zeros on right and bottom to make the image shape divisible by
       `stride`.

    Arguments:
        image: A 3-D tensor of shape `(height, width, channels)` representing an
            image.
        min_side: The shorter side of the image is resized to this value, if
            `jitter` is set to None.
        max_side: If the longer side of the image exceeds this value after
            resizing, the image is resized such that the longer side now equals
            this value.
        jitter: A list of floats containing minimum and maximum size for scale
            jittering. If available, the shorter side of the image will be
            resized to a random value in this range.
        stride: The stride of the smallest feature map in the feature pyramid.
            Can be calculated using `image_size / feature_map_size`.

    Returns:
        image: Resized and padded image.
        image_shape: Shape of the image before padding.
        ratio: The scaling factor used to resize the image.
    """
    image_shape = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    if jitter is not None:
        min_side = tf.random.uniform((), jitter[0], jitter[1], dtype=tf.float32)
    ratio = min_side / tf.reduce_min(image_shape)
    if ratio * tf.reduce_max(image_shape) > max_side:
        ratio = max_side / tf.reduce_max(image_shape)
    image_shape = ratio * image_shape
    image = tf.image.resize(image, tf.cast(image_shape, dtype=tf.int32))
    padded_image_shape = tf.cast(
        tf.math.ceil(image_shape / stride) * stride, dtype=tf.int32
    )
    image = tf.image.pad_to_bounding_box(
        image, 0, 0, padded_image_shape[0], padded_image_shape[1]
    )
    return image, image_shape, ratio
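
# Worked example (illustrative, assuming jitter=None as used at inference time):
# a 600x800 image is scaled by 800/600 ≈ 1.33 to roughly 800x1067 (the longer
# side stays below max_side=1333), then zero-padded on the right and bottom to
# 896x1152 so both sides are divisible by the stride of 128.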
def visualize_detections(
    image, boxes, classes, scores, figsize=(7, 7), linewidth=1, color=[0, 0, 1]
):
    """Visualize Detections"""
    image = np.array(image, dtype=np.uint8)
    plt.figure(figsize=figsize)
    plt.axis("off")
    plt.imshow(image)
    ax = plt.gca()
    for box, _cls, score in zip(boxes, classes, scores):
        text = "{}: {:.2f}".format(_cls, score)
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        patch = plt.Rectangle(
            [x1, y1], w, h, fill=False, edgecolor=color, linewidth=linewidth
        )
        ax.add_patch(patch)
        ax.text(
            x1,
            y1,
            text,
            bbox={"facecolor": color, "alpha": 0.4},
            clip_box=ax.clipbox,
            clip_on=True,
        )
    plt.show()
    return ax
def prepare_image(image):
    image, _, ratio = resize_and_pad_image(image, jitter=None)
    image = tf.keras.applications.resnet.preprocess_input(image)
    return tf.expand_dims(image, axis=0), ratio


# Load the pre-trained RetinaNet from the Hugging Face Hub and wrap it with the
# decoding layer to build an end-to-end inference model.
model = from_pretrained_keras("keras-io/Object-Detection-RetinaNet")
img_input = tf.keras.Input(shape=[None, None, 3], name="image")
predictions = model(img_input, training=False)
detections = DecodePredictions(confidence_threshold=0.5)(img_input, predictions)
inference_model = tf.keras.Model(inputs=img_input, outputs=detections)
def predict(image):
    input_image, ratio = prepare_image(image)
    detections = inference_model.predict(input_image)
    num_detections = detections.valid_detections[0]
    class_names = [
        int2str(int(x)) for x in detections.nmsed_classes[0][:num_detections]
    ]
    img_buf = io.BytesIO()
    # Divide the boxes by the resize ratio to map them back to the original
    # image coordinates before drawing.
    ax = visualize_detections(
        image,
        detections.nmsed_boxes[0][:num_detections] / ratio,
        class_names,
        detections.nmsed_scores[0][:num_detections],
    )
    # Render the annotated figure into an in-memory PNG and return it as a PIL image.
    ax.figure.savefig(img_buf)
    img_buf.seek(0)
    img = Image.open(img_buf)
    return img
# Input
input = gr.inputs.Image(image_mode="RGB", type="numpy", label="Enter Object Image")
# Output
output = gr.outputs.Image(type="pil", label="Detected Objects with Class Category")

title = "Object Detection With RetinaNet"
description = "Upload an image or pick one from the examples to localize the objects it contains and, at the same time, classify them into different categories."

gr.Interface(
    fn=predict,
    inputs=input,
    outputs=output,
    examples=coco_image,
    allow_flagging=False,
    analytics_enabled=False,
    title=title,
    description=description,
    article="<center>Space By: <u><a href='https://github.com/robotjellyzone'><b>Kavya Bisht</b></a></u> \n Based on <a href='https://keras.io/examples/vision/retinanet/'><b>this notebook</b></a></center>",
).launch(enable_queue=True, debug=True)